I am trying to do a rolling sum of match by working with two tables:
DT1:
M | A1 | A2 |
---|---|---|
M01 | A | G |
M02 | G | A |
M03 | T | C |
Mnn | A | G |
DT2:
IND | Group | M01 | M02 | Mnn |
---|---|---|---|---|
I1 | 1 | A | G | G |
I2 | 1 | A | G | G |
I3 | 1 | G | A | A |
I4 | 2 | G | A | G |
In | 2 | G | A | G |
I being the n individual of the group 1 or 2 and with its information about n Markers.
The output is the sum of both Alleles for both group and for every n Markers.
##Code for replicability
#DT1
DT1<-data.table(M=c("M01","M02","M03","Mnn"),
A1= c("A","G","T","A"),
A2=c("G","A","C","G"))
#DT2
DT2<-data.table(IND=c("I1","I2","I3","I4","In"),
Group=c(1,1,1,2,2),
M01=c("A","A","A","G","G"),
M02=c("G","G","A","G","G"),
M03=c("C","C","C","T","C"),
Mnn=c("G","A","A","G","A"))
#M being the nn marker with its Allele1 and Allele2
#What I did found so far:
for (i in colnames(DT2)){
print(i)
DT1$A1G1[DT1$M==i]<- sum(DT2[[i]][DT2$Group==1] == DT1$A1[DT1$M==i])
DT1$A2G1[DT1$M==i]<- sum(DT2[[i]][DT2$Group==1] == DT1$A2[DT1$M==i])
DT1$A1G2[DT1$M==i]<- sum(DT2[[i]][DT2$Group==2] == DT1$A1[DT1$M==i])
DT1$A2G2[DT1$M==i]<- sum(DT2[[i]][DT2$Group==2] == DT1$A2[DT1$M==i])
}
#The output I want would be the sum of both A for the two group and for every Mnn.
# M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01 A G 3 0 0 2
#2: M02 G A 2 1 2 0
#3: M03 T C 0 3 1 1
#4: Mnn A G 2 1 1 1
It does the job but I feel like data.table could do it in one line and with less computation time by avoiding looping as Mnn is up to 50k and In is up to 15k it takes a long time.
Anyone with solution would greatly help me as I have trouble working with data.table logic of key and indexes when working with two different tables.
CodePudding user response:
If you melt your two tables, and do a join on M
and value
, you can count by group, allele, and marker:
- pivot these tables long, and join
DT_long = melt(DT2,id = c("IND", "Group"),variable.name = "M")[melt(DT1, id="M",variable.name="allele"), on=.(M,value)]
- join
DT1
back on to a wide version of the sum over allele, group, and marker
DT1[dcast(
DT_long[,.N, .(col =paste0(allele,"G",Group),M)],
M~col,value.var="N",fill=0
), on="M"]
Output:
M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01 A G 3 0 0 2
2: M02 G A 2 2 1 0
3: M03 T C 0 1 3 1
4: Mnn A G 2 1 1 1
CodePudding user response:
We could make the loop a bit more efficient by using colSums
. Also, reduce the number of ==
by split
ting the 'DT2' by 'Group'
mcols <- grep("^M", names(DT2), value = TRUE)
lst1 <- split(DT2[, ..mcols], DT2$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
}
-output
> DT1
M A1 A2 A1G1 A2G1 A1G2 A2G2
<char> <char> <char> <num> <num> <num> <num>
1: M01 A G 3 0 0 2
2: M02 G A 2 1 2 0
3: M03 T C 0 3 1 1
4: Mnn A G 2 1 1 1
Benchmarks
On a slightly bigger dataset, checked the timings with OP's method and this
# data
set.seed(24)
DT1test<-data.table(M=sprintf('Md', 1:5000),
A1= sample(c("A","G","T","C"), 5000, replace = TRUE),
A2=sample(c("G","A","T","C"), 5000, replace = TRUE))
DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000,
replace = TRUE), ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test<-data.table(IND=paste0("I", 1:15000),
Group=rep(1:300, each = 50))
DT2test <- cbind(DT2test, m1)
timings - old method
system.time({
for (i in colnames(DT2test)){
for(j in unique(DT2test$Group)) {
DT1testold[[paste0("A1G", j)]][DT1testold$M==i] <-
sum(DT2testold[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])
DT1testold[[paste0("A2G", j)]][DT1testold$M==i] <-
sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A1[DT1test$M==i])
}
}
})
user system elapsed
502.603 106.631 610.908
timings-new method
system.time({
mcols <- grep("^M", names(DT2test), value = TRUE)
lst1 <- split(DT2test[, ..mcols], DT2test$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],
na.rm = TRUE)]
DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],
na.rm = TRUE)][]
}
})
#user system elapsed
#36.079 0.968 36.934