I have the following dataframes:
# Long-format expression table: 11 genes (a-k) measured in samples a, b and c.
# The expression values are kept as character strings, exactly as in the
# original data; convert them with as.numeric() before doing arithmetic.
gene <- rep(letters[1:11], times = 3)
sample1 <- rep(c("a", "b", "c"), each = 11)
expression1 <- as.character(c(1:11, 14:24, 25:34, 36L))
# Store the table as df1 (the name the join further down refers to), then
# print it.
df1 <- data.frame(gene, sample1, expression1)
df1
gene sample1 expression1
1 a a 1
2 b a 2
3 c a 3
4 d a 4
5 e a 5
6 f a 6
7 g a 7
8 h a 8
9 i a 9
10 j a 10
11 k a 11
12 a b 14
13 b b 15
14 c b 16
15 d b 17
16 e b 18
17 f b 19
18 g b 20
19 h b 21
20 i b 22
21 j b 23
22 k b 24
23 a c 25
24 b c 26
25 c c 27
26 d c 28
27 e c 29
28 f c 30
29 g c 31
30 h c 32
31 i c 33
32 j c 34
33 k c 36
and a dataframe with a specific sample.
# Reference sample "g": one expression value (stored as character) per gene.
gene <- letters[1:11]
sample2 <- rep("g", times = 11)
expression2 <- c(
  "14.7", "15", "17", "16", "18", "20", "21", "22", "23", "24", "25"
)
# Store the table as df2, the name used when it is joined to df1.
df2 <- data.frame(gene, sample2, expression2)
gene sample2 expression2
<chr> <chr> <chr>
1 a g 14.7
2 b g 15
3 c g 17
4 d g 16
5 e g 18
6 f g 20
7 g g 21
8 h g 22
9 i g 23
10 j g 24
11 k g 25
What I want to do is test the correlation between the sample in df2 and each of the samples in df1.
I thought that would go like this:
# Correlate the reference sample (df2) with every sample in df1, gene by gene.
# inner_join keeps only genes present in both tables, so unmatched genes do
# not create an all-NA group.
test <- inner_join(x = df2, y = df1, by = "gene") %>%
  group_by(sample1, sample2) %>%
  summarize(
    # The expression columns are stored as character; cor() needs numeric input.
    cor = cor(as.numeric(expression1), as.numeric(expression2)),
    # Return an ungrouped result instead of one still grouped by sample1.
    .groups = "drop"
  )
Is this the way you would also approach this, or would you do it differently?
CodePudding user response:
Using `by`.
# For each sample in df_all, correlate its expression values with the
# reference values in df_1 (both tables list the genes in the same order).
by(
  data = df_all$expression,
  INDICES = df_all$sample,
  FUN = function(x) cor(x, df_1$expression2)
)
# df_all$sample: a
# [1] 0.9873401
# ---------------------------------------------------------
# df_all$sample: b
# [1] 0.9873401
# ---------------------------------------------------------
# df_all$sample: c
# [1] 0.9838774
Or looping via `sapply`.
# Loop over the distinct sample names; the result is a numeric vector named
# by sample.
with(
  df_all,
  sapply(
    unique(sample),
    function(s) cor(expression[sample == s], df_1$expression2)
  )
)
# a b c
# 0.9873401 0.9873401 0.9838774
Or maybe a correlation matrix.
# Distinct sample and gene names, in order of first appearance.
cn <- unique(df_all$sample)
rn <- unique(df_all$gene)
# Reshape the long table into a genes-by-samples matrix (filled column by
# column, i.e. sample by sample), then append the reference sample as "g".
m <- matrix(df_all$expression, ncol = length(cn), dimnames = list(rn, cn))
m <- cbind(m, g = df_1$expression2)
# Pairwise correlations between all columns (samples).
cor(m)
# a b c g
# a 1.0000000 1.0000000 0.9971765 0.9873401
# b 1.0000000 1.0000000 0.9971765 0.9873401
# c 0.9971765 0.9971765 1.0000000 0.9838774
# g 0.9873401 0.9873401 0.9838774 1.0000000
Edit
You could wrap this in a function that also takes care of gene order and of matching genes between the two data frames.
scorr <- function(x, y) {
  ## Correlation matrix between every sample in `x` and the sample in `y`.
  ##
  ## x: long-format data frame with columns `gene`, `sample` and `expression`;
  ##    every sample must contain each shared gene exactly once.
  ## y: data frame with columns `gene`, `sample` and `expression2` holding a
  ##    single sample.
  ## Returns the cor() matrix of a genes-by-samples matrix whose last column
  ## is the sample from `y`, labelled with that sample's name.

  ## match genes: only genes present in both tables can be compared
  mg <- intersect(x$gene, y$gene)
  if (length(mg) == 0L) {
    stop("`x` and `y` have no genes in common", call. = FALSE)
  }
  x <- x[x$gene %in% mg, ]
  y <- y[y$gene %in% mg, ]
  ## order: sample by sample, genes sorted within each sample, so the matrix
  ## can be filled column-wise and the rows of `x` and `y` line up
  x <- x[order(x$sample, x$gene), ]
  y <- y[order(y$sample, y$gene), ]
  cn <- unique(as.character(x$sample))
  rn <- unique(as.character(x$gene))
  yn <- unique(as.character(y$sample))
  ## validate the layout; otherwise matrix() would silently recycle values
  ## and correlate expression values of different genes
  if (length(yn) != 1L) {
    stop("`y` must contain exactly one sample", call. = FALSE)
  }
  if (!identical(as.character(x$gene), rep(rn, times = length(cn)))) {
    stop("every sample in `x` must contain each shared gene exactly once",
         call. = FALSE)
  }
  if (!identical(as.character(y$gene), rn)) {
    stop("`y` must contain each shared gene exactly once", call. = FALSE)
  }
  m <- matrix(x$expression, ncol = length(cn), dimnames = list(rn, cn))
  m <- cbind(m, y$expression2)
  ## label the appended column with the sample name found in `y`
  colnames(m)[ncol(m)] <- yn
  cor(m)
}
# Correlate the samples in df_all with the full reference sample df_1.
scorr(x = df_all, y = df_1)
# a b c g
# a 1.0000000 1.0000000 0.9971765 0.9873401
# b 1.0000000 1.0000000 0.9971765 0.9873401
# c 0.9971765 0.9971765 1.0000000 0.9838774
# g 0.9873401 0.9873401 0.9838774 1.0000000
# Same comparison with df_2 (df_1 without rows 2 and 5); only the genes
# shared by both tables enter the correlation.
scorr(x = df_all, y = df_2)
# a b c g
# a 1.0000000 1.0000000 0.9966446 0.9859516
# b 1.0000000 1.0000000 0.9966446 0.9859516
# c 0.9966446 0.9966446 1.0000000 0.9818871
# g 0.9859516 0.9859516 0.9818871 1.0000000
Data:
# Long-format expression table: genes a-k measured in samples a, b and c
# (integer expression values, 33 rows).
df_all <- data.frame(
  gene = rep(letters[1:11], times = 3L),
  sample = rep(c("a", "b", "c"), each = 11L),
  expression = c(1:11, 14:24, 25:34, 36L),
  stringsAsFactors = FALSE
)
# Reference sample "g": one numeric expression value per gene a-k.
df_1 <- data.frame(
  gene = letters[1:11],
  sample = rep("g", times = 11L),
  expression2 = c(14.7, 15, 17, 16, 18, 20, 21, 22, 23, 24, 25),
  stringsAsFactors = FALSE
)
# Variant of the reference sample with rows 2 and 5 (genes b and e) removed.
df_2 <- df_1[-c(2L, 5L), ]