I have two data.frames
(code below). The row ids in the data.frames
match.
set.seed(12345)
df1 <- data.frame(id=c(letters),
g1=c(rep(0,7), rep(1,18), "NA"),
g2=c(rep("NA", 3), rep(0,23)),
g3=c(rep(0,5), rep("NA",20),1),
g4=c(sample(c(0,1), replace=TRUE, size=26)),
g5=c(sample(c(0,1), replace=TRUE, size=20), "NA", sample(c(0,1), replace=TRUE, size=5)),
g6=c(rep(1,26)),
g7=c(sample(c(0,1), replace=TRUE, size=26)),
g8=c(rep(0,5), rep("NA",21)),
g9=c(rep("NA",26)),
g10=c(rep(0,26)))
df1[,2:11] <- sapply(df1[,2:11], as.numeric)
df2 <- data.frame(id=c(letters),
b1=c(runif(7, 3, 7.8), "NA", runif(18, 6, 18)),
b2=c(runif(7, -1, 4), "NA", runif(18, 0, 5)),
b3=c(runif(20, 0, 16), "NA", runif(5, -1, 2)),
b4=c(runif(7, 5, 29), rep("NA", 3), runif(3, -1, 2)),
b5=c(runif(3, 3, 8), rep("NA",23)),
b6=c(rep("NA",21), runif(5, 0, 19)),
b7=c(rep("NA",26)),
b8=c(runif(26, 1, 9)),
b9=c(runif(7, -1, 4), "NA", runif(18, -1, 4)),
b10=c(runif(6, -1, 4), rep("NA", 2), runif(18, 0, 5)),
b11=c(runif(7, 18, 23), "NA", runif(18, 12, 19)),
b12=c(runif(7, 0, 4), "NA", runif(18, 0, 4)),
b13=c(runif(7, 1, 4), "NA", runif(14, -2, 18), rep("NA", 4)),
b14=c(runif(6, 6, 8), rep("NA", 3), runif(17, 0, 5)),
b15=c(runif(7, 11, 12), "NA", runif(18, -1, 5)),
b16=c(runif(7, 3, 4), "NA", runif(18, 12, 21)),
b17=c(rep("NA", 8), runif(16, 1, 8), rep("NA", 2)))
df2[,2:18] <- sapply(df2[,2:18], as.numeric)
I would like to use t tests to test if for each "g" column in df1
, the 0 and 1 groups have significantly different values in df2
.
For example, for b1:
t.test(df2$b1[1:8], df2$b1[9:26])$p.val
#[1] 3.846501e-07
I would like to create a new df3
with the results, which looks like this:
df3 <- data.frame(g=rep("g1", 3),
b=c("b1", "b2", "b3"),
mean_0=c(mean(na.omit(df2$b1[1:8])),0,0),
mean_1=c(mean(na.omit(df2$b1[9:26])),0,0),
p.val=c(t.test(df2$b1[1:8], df2$b1[9:26])$p.val,1,1),
p.adjust=c(0,1,1))
df3 <- df3[order(df3$p.val),]
I do not know how to code this difficult problem. Can anyone help?
CodePudding user response:
One potential solution is to use purrr::map2()
to iterate over the different combinations of "g" and "b" columns, e.g.
library(tidyverse)
library(broom)
set.seed(12345)
df1 <- data.frame(id=c(letters),
g1=c(rep(0,7), rep(1,18), "NA"),
g2=c(rep("NA", 3), rep(0,23)),
g3=c(rep(0,5), rep("NA",20),1),
g4=c(sample(c(0,1), replace=TRUE, size=26)),
g5=c(sample(c(0,1), replace=TRUE, size=20), "NA", sample(c(0,1), replace=TRUE, size=5)),
g6=c(rep(1,26)),
g7=c(sample(c(0,1), replace=TRUE, size=26)),
g8=c(rep(0,5), rep("NA",21)),
g9=c(rep("NA",26)),
g10=c(rep(0,26)))
df1[,2:11] <- sapply(df1[,2:11], as.numeric)
df2 <- data.frame(id=c(letters),
b1=c(runif(7, 3, 7.8), "NA", runif(18, 6, 18)),
b2=c(runif(7, -1, 4), "NA", runif(18, 0, 5)),
b3=c(runif(20, 0, 16), "NA", runif(5, -1, 2)),
b4=c(runif(7, 5, 29), rep("NA", 3), runif(3, -1, 2)),
b5=c(runif(3, 3, 8), rep("NA",23)),
b6=c(rep("NA",21), runif(5, 0, 19)),
b7=c(rep("NA",26)),
b8=c(runif(26, 1, 9)),
b9=c(runif(7, -1, 4), "NA", runif(18, -1, 4)),
b10=c(runif(6, -1, 4), rep("NA", 2), runif(18, 0, 5)),
b11=c(runif(7, 18, 23), "NA", runif(18, 12, 19)),
b12=c(runif(7, 0, 4), "NA", runif(18, 0, 4)),
b13=c(runif(7, 1, 4), "NA", runif(14, -2, 18), rep("NA", 4)),
b14=c(runif(6, 6, 8), rep("NA", 3), runif(17, 0, 5)),
b15=c(runif(7, 11, 12), "NA", runif(18, -1, 5)),
b16=c(runif(7, 3, 4), "NA", runif(18, 12, 21)),
b17=c(rep("NA", 8), runif(16, 1, 8), rep("NA", 2)))
df2[,2:18] <- sapply(df2[,2:18], as.numeric)
# In your example you use "t.test(df2$b1[1:8], df2$b1[9:26])$p.val"
# but these columns don't line up with the values in df1;
# you should instead use "t.test(df2$b1[1:7], df2$b1[8:25])$p.val", e.g.
t.test(df2$b1[1:7], df2$b1[8:25])$p.val
#> [1] 8.505498e-07
t.test(df2[df1$g1 == 0,]$b1, df2[df1$g1 == 1,]$b1)$p.val
#> [1] 8.505498e-07
df3 <- data.frame(g=rep("g1", 3),
b=c("b1", "b2", "b3"),
mean_0=c(mean(na.omit(df2$b1[1:8])),0,0),
mean_1=c(mean(na.omit(df2$b1[9:26])),0,0),
p.val=c(t.test(df2$b1[1:8], df2$b1[9:26])$p.val,1,1),
p.adjust=c(0,1,1))
df3 <- df3[order(df3$p.val),]
df3
#> g b mean_0 mean_1 p.val p.adjust
#> 1 g1 b1 4.870685 12.10767 3.846501e-07 0
#> 2 g1 b2 0.000000 0.00000 1.000000e 00 1
#> 3 g1 b3 0.000000 0.00000 1.000000e 00 1
df4 <- map2(.x = rep(c("g1", "g4", "g5", "g7"), each = 4),
.y = rep(c("b1", "b2", "b3", "b4"), times = 4),
.f = ~tidy(t.test(df2[df1[[.x]] == 0,][[.y]],
df2[df1[[.x]] == 1,][[.y]])) %>%
mutate(g = .x,
b = .y) %>%
select(g,
b,
"mean_0" = estimate1,
"mean_1" = estimate2,
p.value))
result <- bind_rows(df4)
result
#> # A tibble: 16 × 5
#> g b mean_0 mean_1 p.value
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 g1 b1 4.87 12.0 0.000000851
#> 2 g1 b2 2.41 2.74 0.584
#> 3 g1 b3 7.82 7.53 0.907
#> 4 g1 b4 19.1 11.5 0.0611
#> 5 g4 b1 10.7 9.85 0.683
#> 6 g4 b2 2.59 2.78 0.798
#> 7 g4 b3 7.84 7.11 0.782
#> 8 g4 b4 12.8 13.9 0.822
#> 9 g5 b1 10.1 9.81 0.883
#> 10 g5 b2 2.48 2.81 0.614
#> 11 g5 b3 8.05 7.06 0.743
#> 12 g5 b4 10.8 14.6 0.471
#> 13 g7 b1 9.52 11.3 0.325
#> 14 g7 b2 2.57 3.04 0.396
#> 15 g7 b3 8.20 5.51 0.318
#> 16 g7 b4 16.5 8.31 0.0923
Created on 2022-08-09 by the reprex package (v2.0.1)
NB. This only works for columns where the t.test()
doesn't return an error. In this example I 'handpicked' the "g" and "b" columns that aren't all zeros / all ones / all NAs so the example would complete, so you may need to alter your approach for your actual data.
CodePudding user response:
Here is my attempt with a traditional for loops:
#preallocate the result matrix
resultlength <- (ncol(df1)-1) * (ncol(df2)-1)
results <- data.frame(b=vector(mode= "integer", length=resultlength), g=vector(mode= "integer", length=resultlength), p = vector(mode= "numeric", length=resultlength))
#index
i=0
for (g in (2:ncol(df1)) ) {
gindex <- g -1
independent <- df1[, g]
for(col in (2:ncol(df2)) ) {
b=col-1
tdf <- data.frame(dependent=df2[, col], independent )
tdf <- tdf[complete.cases(tdf), ]
# Check to ensure at least a 0 & 1 is present
if (length(unique(tdf$independent)) == 2) {
tresult <- tryCatch( t.test(tdf$dependent ~ tdf$independent)$p.value, error = function(e) 1.2)
}
else
{
tresult <-1.1 #error for lacking 2 groups
}
i <- i 1
results[i,] <- c(b, gindex, tresult)
}
}
This does have some error checking and correction. Right now it is failing if there is only a single value in one of the groups.