Home > OS >  R data.table: Count number of match for multiple strings for two group between two DT
R data.table: Count number of match for multiple strings for two group between two DT

Time:09-07

I am trying to do a rolling sum of match by working with two tables:
DT1:

M A1 A2
M01 A G
M02 G A
M03 T C
Mnn A G

DT2:

IND Group M01 M02 Mnn
I1 1 A G G
I2 1 A G G
I3 1 G A A
I4 2 G A G
In 2 G A G

I being the n individual of the group 1 or 2 and with its information about n Markers.

The output is the sum of both Alleles for both group and for every n Markers.

##Code for replicability
#DT1
DT1<-data.table(M=c("M01","M02","M03","Mnn"),
                A1= c("A","G","T","A"),
                A2=c("G","A","C","G"))
#DT2
DT2<-data.table(IND=c("I1","I2","I3","I4","In"),
                Group=c(1,1,1,2,2),
                M01=c("A","A","A","G","G"),
                M02=c("G","G","A","G","G"),
                M03=c("C","C","C","T","C"),
                Mnn=c("G","A","A","G","A"))
#M being the nn marker with its Allele1 and Allele2  

#What I did found so far: 

for (i in colnames(DT2)){
  print(i)
  DT1$A1G1[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==1] == DT1$A1[DT1$M==i])    
  DT1$A2G1[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==1] == DT1$A2[DT1$M==i])    
  DT1$A1G2[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==2] == DT1$A1[DT1$M==i])    
  DT1$A2G2[DT1$M==i]<-  sum(DT2[[i]][DT2$Group==2] == DT1$A2[DT1$M==i]) 
}

#The output I want would be the sum of both A for the two group and for every Mnn.

#     M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01  A  G    3    0    0    2
#2: M02  G  A    2    1    2    0
#3: M03  T  C    0    3    1    1
#4: Mnn  A  G    2    1    1    1

It does the job but I feel like data.table could do it in one line and with less computation time by avoiding looping as Mnn is up to 50k and In is up to 15k it takes a long time.

Anyone with solution would greatly help me as I have trouble working with data.table logic of key and indexes when working with two different tables.

CodePudding user response:

If you melt your two tables, and do a join on M and value, you can count by group, allele, and marker:

  1. pivot these tables long, and join
DT_long = melt(DT2,id = c("IND", "Group"),variable.name = "M")[melt(DT1, id="M",variable.name="allele"), on=.(M,value)]
  1. join DT1 back on to a wide version of the sum over allele, group, and marker
DT1[dcast(
  DT_long[,.N, .(col =paste0(allele,"G",Group),M)],
  M~col,value.var="N",fill=0
), on="M"]

Output:

     M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01  A  G    3    0    0    2
2: M02  G  A    2    2    1    0
3: M03  T  C    0    1    3    1
4: Mnn  A  G    2    1    1    1

CodePudding user response:

We could make the loop a bit more efficient by using colSums. Also, reduce the number of == by splitting the 'DT2' by 'Group'

mcols <- grep("^M", names(DT2), value = TRUE)
lst1 <- split(DT2[, ..mcols], DT2$Group)
for(i in seq_along(lst1)) {
      tmp <- lst1[[i]]
      DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],  na.rm = TRUE)]
       DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],  na.rm = TRUE)][]

}

-output

> DT1
        M     A1     A2  A1G1  A2G1  A1G2  A2G2
   <char> <char> <char> <num> <num> <num> <num>
1:    M01      A      G     3     0     0     2
2:    M02      G      A     2     1     2     0
3:    M03      T      C     0     3     1     1
4:    Mnn      A      G     2     1     1     1

Benchmarks

On a slightly bigger dataset, checked the timings with OP's method and this

# data
set.seed(24)
DT1test<-data.table(M=sprintf('Md', 1:5000),
                A1= sample(c("A","G","T","C"), 5000, replace = TRUE),
                A2=sample(c("G","A","T","C"), 5000, replace = TRUE))

DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000, 
    replace = TRUE), ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test<-data.table(IND=paste0("I", 1:15000),
                            Group=rep(1:300, each = 50))    
DT2test <- cbind(DT2test, m1)

timings - old method

system.time({
for (i in colnames(DT2test)){
   for(j in unique(DT2test$Group)) {
   
     DT1testold[[paste0("A1G", j)]][DT1testold$M==i] <- 
     sum(DT2testold[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])
      DT1testold[[paste0("A2G", j)]][DT1testold$M==i] <- 
    sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])  
   
  }
  }
  
  
  
  })
   user  system elapsed 
502.603 106.631 610.908 

timings-new method

system.time({
 mcols <- grep("^M", names(DT2test), value = TRUE)
 lst1 <- split(DT2test[, ..mcols], DT2test$Group)
 for(i in seq_along(lst1)) {
       tmp <- lst1[[i]]
       DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], 
        na.rm = TRUE)]
        DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], 
       na.rm = TRUE)][]

 }
 
 
 
 })
 #user  system elapsed 
 #36.079   0.968  36.934 
  • Related