I have 2 dataframes:
df1 <- read.table(text ="
pos S01 S02 S03 S04 S05 S06
1 1 G G - A A A
2 2 T T - T T T
3 3 G G - G G G
4 4 C C A C C C
5 5 A A T A A A
6 6 A A G A A A
7 7 T T A T T T
8 9 C C T C C C
9 12 C C C C C C
10 15 T T T T T T", header = TRUE)
df2 <- read.table(text ="
pos ref var
1 1 G -
2 2 T -
3 3 G -
4 4 C A
5 5 A T
6 6 A G
7 7 T A
8 9 C T
9 12 C T
10 15 T C", header = TRUE)
So I want to compare each column of df1 (S01 to S06) with df2 using the ref column, I made a for loop:
df3 <- df1
for(i in 2:ncol(df1)){
df3[i] <- data.frame((ifelse(df1[i] == df2["ref"], 1, 0)))
}
df3
pos S01 S02 S03 S04 S05 S06
1 1 1 1 0 0 0 0
2 2 1 1 0 1 1 1
3 3 1 1 0 1 1 1
4 4 1 1 0 1 1 1
5 5 1 1 0 1 1 1
6 6 1 1 0 1 1 1
7 7 1 1 0 1 1 1
8 9 1 1 0 1 1 1
9 12 1 1 1 1 1 1
10 15 1 1 1 1 1 1
I have 2 problems, I need to make the same comparison, but using the column position (pos) in both data.frames, I mean, first compare the if df1$pos are equal to df2$pos, then make the loop to compare each column from S01 to S0n with df2$ref; and last question, I want to add and other option, add a 2 number in the loop if dash (-) are present, at the end will be something like:
pos S01 S02 S03 S04 S05 S06
1 1 1 1 2 0 0 0
2 2 1 1 2 1 1 1
3 3 1 1 2 1 1 1
4 4 1 1 0 1 1 1
5 5 1 1 0 1 1 1
6 6 1 1 0 1 1 1
7 7 1 1 0 1 1 1
8 9 1 1 0 1 1 1
9 12 1 1 1 1 1 1
10 15 1 1 1 1 1 1
or avoid to replace the "-"
pos S01 S02 S03 S04 S05 S06
1 1 1 1 - 0 0 0
2 2 1 1 - 1 1 1
3 3 1 1 - 1 1 1
4 4 1 1 0 1 1 1
5 5 1 1 0 1 1 1
6 6 1 1 0 1 1 1
7 7 1 1 0 1 1 1
8 9 1 1 0 1 1 1
9 12 1 1 1 1 1 1
10 15 1 1 1 1 1 1
CodePudding user response:
First merge on variant ids - pos. Then check if genotype for each sample (column) is "-", else check if the genotype matches the reference:
out <- merge(df1, df2[, c("pos", "ref")])
samples <- colnames(df1)[2:ncol(df1)]
out <- cbind(
out[ "pos" ],
ifelse(out[, samples] == "-", 2, out[, samples] == out$ref))
out
# pos S01 S02 S03 S04 S05 S06
# 1 1 1 1 2 0 0 0
# 2 2 1 1 2 1 1 1
# 3 3 1 1 2 1 1 1
# 4 4 1 1 0 1 1 1
# 5 5 1 1 0 1 1 1
# 6 6 1 1 0 1 1 1
# 7 7 1 1 0 1 1 1
# 8 9 1 1 0 1 1 1
# 9 12 1 1 1 1 1 1
# 10 15 1 1 1 1 1 1
CodePudding user response:
tidyverse
library(tidyverse)
df1 <- data.frame(
stringsAsFactors = FALSE,
pos = c(1L,2L,3L,4L,5L,6L,7L,9L, 12L,15L),
S01 = c("G","T","G","C","A","A","T","C","C","T"),
S02 = c("G","T","G","C","A","A","T","C","C","T"),
S03 = c("-","-","-","A","T","G","A","T","C","T"),
S04 = c("A","T","G","C","A","A","T","C","C","T"),
S05 = c("A","T","G","C","A","A","T","C","C","T"),
S06 = c("A","T","G","C","A","A","T","C","C","T")
)
df2 <- data.frame(
stringsAsFactors = FALSE,
pos = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 12L, 15L),
ref = c("G", "T", "G", "C", "A", "A", "T", "C", "C", "T"),
var = c("-", "-", "-", "A", "T","G", "A", "T", "T", "C")
)
df1[df1 == "-"] <- NA
map_df(df1, ~ (.x == df2$ref)) %>%
mutate(across(everything(), ~replace_na(.x, 2)))
#> # A tibble: 10 x 7
#> pos S01 S02 S03 S04 S05 S06
#> <int> <int> <int> <int> <int> <int> <int>
#> 1 0 1 1 2 0 0 0
#> 2 0 1 1 2 1 1 1
#> 3 0 1 1 2 1 1 1
#> 4 0 1 1 0 1 1 1
#> 5 0 1 1 0 1 1 1
#> 6 0 1 1 0 1 1 1
#> 7 0 1 1 0 1 1 1
#> 8 0 1 1 0 1 1 1
#> 9 0 1 1 1 1 1 1
#> 10 0 1 1 1 1 1 1
base
df1[df1 == "-"] <- NA
df1[] <- sapply(df1, function(x) (x == df2$ref))
df1[is.na(df1)] <- 2
df1
#> pos S01 S02 S03 S04 S05 S06
#> 1 0 1 1 2 0 0 0
#> 2 0 1 1 2 1 1 1
#> 3 0 1 1 2 1 1 1
#> 4 0 1 1 0 1 1 1
#> 5 0 1 1 0 1 1 1
#> 6 0 1 1 0 1 1 1
#> 7 0 1 1 0 1 1 1
#> 8 0 1 1 0 1 1 1
#> 9 0 1 1 1 1 1 1
#> 10 0 1 1 1 1 1 1
Created on 2022-10-07 with reprex v2.0.2
CodePudding user response:
A way without ifelse
, therefore perhaps faster. It also works with matrix
es.
f <- \(x, y) {
pos <- match(x[, 'pos'], y[, 'pos']) ## in x find pos incl. in y
cc <- setdiff(colnames(x), 'pos') ## only compare non-pos
out <- x ## copy x for output
out[pos, cc] <- as.numeric(x[pos, cc] == y[pos, 'ref']) ## find matches
out[x == '-'] <- '-' ## replace original `-` with `-`
return(out[pos, ])
}
f(df1, df2)
# pos S01 S02 S03 S04 S05 S06
# 1 1 1 1 - 0 0 0
# 2 2 1 1 - 1 1 1
# 3 3 1 1 - 1 1 1
# 4 4 1 1 0 1 1 1
# 5 5 1 1 0 1 1 1
# 6 6 1 1 0 1 1 1
# 7 7 1 1 0 1 1 1
# 8 9 1 1 0 1 1 1
# 9 12 1 1 1 1 1 1
# 10 15 1 1 1 1 1 1