Home > Software engineering >  How to compare columns using ifelse with 3 or more options?
How to compare columns using ifelse with 3 or more options?

Time:10-07

I have 2 dataframes:

df1 <- read.table(text ="
pos S01 S02 S03 S04 S05 S06
1    1   G   G   -   A   A   A
2    2   T   T   -   T   T   T
3    3   G   G   -   G   G   G
4    4   C   C   A   C   C   C
5    5   A   A   T   A   A   A
6    6   A   A   G   A   A   A
7    7   T   T   A   T   T   T
8    9   C   C   T   C   C   C
9   12   C   C   C   C   C   C
10  15   T   T   T   T   T   T", header = TRUE)

df2 <- read.table(text ="
pos ref var
1    1   G   -
2    2   T   -
3    3   G   -
4    4   C   A
5    5   A   T
6    6   A   G
7    7   T   A
8    9   C   T
9   12   C   T
10  15   T   C", header = TRUE)

So I want to compare each column of df1 (S01 to S06) with df2 using the ref column, I made a for loop:

df3 <- df1
for(i in 2:ncol(df1)){
    df3[i] <- data.frame((ifelse(df1[i] == df2["ref"], 1, 0)))
}

df3
   pos S01 S02 S03 S04 S05 S06
1    1   1   1   0   0   0   0
2    2   1   1   0   1   1   1
3    3   1   1   0   1   1   1
4    4   1   1   0   1   1   1
5    5   1   1   0   1   1   1
6    6   1   1   0   1   1   1
7    7   1   1   0   1   1   1
8    9   1   1   0   1   1   1
9   12   1   1   1   1   1   1
10  15   1   1   1   1   1   1

I have 2 problems, I need to make the same comparison, but using the column position (pos) in both data.frames, I mean, first compare the if df1$pos are equal to df2$pos, then make the loop to compare each column from S01 to S0n with df2$ref; and last question, I want to add and other option, add a 2 number in the loop if dash (-) are present, at the end will be something like:

   pos S01 S02 S03 S04 S05 S06
1    1   1   1   2   0   0   0
2    2   1   1   2   1   1   1
3    3   1   1   2   1   1   1
4    4   1   1   0   1   1   1
5    5   1   1   0   1   1   1
6    6   1   1   0   1   1   1
7    7   1   1   0   1   1   1
8    9   1   1   0   1   1   1
9   12   1   1   1   1   1   1
10  15   1   1   1   1   1   1

or avoid to replace the "-"

   pos S01 S02 S03 S04 S05 S06
1    1   1   1   -   0   0   0
2    2   1   1   -   1   1   1
3    3   1   1   -   1   1   1
4    4   1   1   0   1   1   1
5    5   1   1   0   1   1   1
6    6   1   1   0   1   1   1
7    7   1   1   0   1   1   1
8    9   1   1   0   1   1   1
9   12   1   1   1   1   1   1
10  15   1   1   1   1   1   1

CodePudding user response:

First merge on variant ids - pos. Then check if genotype for each sample (column) is "-", else check if the genotype matches the reference:

out <- merge(df1, df2[, c("pos", "ref")])
samples <- colnames(df1)[2:ncol(df1)]

out <- cbind(
  out[ "pos" ],
  ifelse(out[, samples] == "-", 2, out[, samples] == out$ref))

out
#    pos S01 S02 S03 S04 S05 S06
# 1    1   1   1   2   0   0   0
# 2    2   1   1   2   1   1   1
# 3    3   1   1   2   1   1   1
# 4    4   1   1   0   1   1   1
# 5    5   1   1   0   1   1   1
# 6    6   1   1   0   1   1   1
# 7    7   1   1   0   1   1   1
# 8    9   1   1   0   1   1   1
# 9   12   1   1   1   1   1   1
# 10  15   1   1   1   1   1   1

CodePudding user response:

tidyverse

library(tidyverse)
df1 <- data.frame(
  stringsAsFactors = FALSE,
  pos = c(1L,2L,3L,4L,5L,6L,7L,9L, 12L,15L),
  S01 = c("G","T","G","C","A","A","T","C","C","T"),
  S02 = c("G","T","G","C","A","A","T","C","C","T"),
  S03 = c("-","-","-","A","T","G","A","T","C","T"),
  S04 = c("A","T","G","C","A","A","T","C","C","T"),
  S05 = c("A","T","G","C","A","A","T","C","C","T"),
  S06 = c("A","T","G","C","A","A","T","C","C","T")
  )

df2 <- data.frame(
  stringsAsFactors = FALSE,
  pos = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 12L, 15L),
  ref = c("G", "T", "G", "C", "A", "A", "T", "C", "C", "T"),
  var = c("-", "-", "-", "A", "T","G", "A", "T", "T", "C")
)


df1[df1 == "-"] <- NA
map_df(df1, ~ (.x == df2$ref)) %>% 
  mutate(across(everything(), ~replace_na(.x, 2)))
#> # A tibble: 10 x 7
#>      pos   S01   S02   S03   S04   S05   S06
#>    <int> <int> <int> <int> <int> <int> <int>
#>  1     0     1     1     2     0     0     0
#>  2     0     1     1     2     1     1     1
#>  3     0     1     1     2     1     1     1
#>  4     0     1     1     0     1     1     1
#>  5     0     1     1     0     1     1     1
#>  6     0     1     1     0     1     1     1
#>  7     0     1     1     0     1     1     1
#>  8     0     1     1     0     1     1     1
#>  9     0     1     1     1     1     1     1
#> 10     0     1     1     1     1     1     1

base

df1[df1 == "-"] <- NA
df1[] <- sapply(df1, function(x)  (x == df2$ref))
df1[is.na(df1)] <- 2
df1
#>    pos S01 S02 S03 S04 S05 S06
#> 1    0   1   1   2   0   0   0
#> 2    0   1   1   2   1   1   1
#> 3    0   1   1   2   1   1   1
#> 4    0   1   1   0   1   1   1
#> 5    0   1   1   0   1   1   1
#> 6    0   1   1   0   1   1   1
#> 7    0   1   1   0   1   1   1
#> 8    0   1   1   0   1   1   1
#> 9    0   1   1   1   1   1   1
#> 10   0   1   1   1   1   1   1

Created on 2022-10-07 with reprex v2.0.2

CodePudding user response:

A way without ifelse, therefore perhaps faster. It also works with matrixes.

f <- \(x, y) {
  pos <- match(x[, 'pos'], y[, 'pos'])  ## in x find pos incl. in y
  cc <- setdiff(colnames(x), 'pos')  ## only compare non-pos
  out <- x  ## copy x for output
  out[pos, cc] <- as.numeric(x[pos, cc] == y[pos, 'ref'])  ## find matches
  out[x == '-'] <- '-'  ## replace original `-` with `-`
  return(out[pos, ])
}

f(df1, df2)
#    pos S01 S02 S03 S04 S05 S06
# 1    1   1   1   -   0   0   0
# 2    2   1   1   -   1   1   1
# 3    3   1   1   -   1   1   1
# 4    4   1   1   0   1   1   1
# 5    5   1   1   0   1   1   1
# 6    6   1   1   0   1   1   1
# 7    7   1   1   0   1   1   1
# 8    9   1   1   0   1   1   1
# 9   12   1   1   1   1   1   1
# 10  15   1   1   1   1   1   1  
  • Related