Correlation tests against big dataset-CodePudding

I have the following dataframes:

## three samples ("a", "b", "c"), each measured on the same eleven genes a-k
gene <- rep(letters[1:11], times = 3)
sample1 <- rep(c("a", "b", "c"), each = 11)
## expression values are kept as character strings
expression1 <- as.character(c(1:11, 14:34, 36))

data.frame(gene, sample1, expression1)

   gene sample1 expression1
1     a       a           1
2     b       a           2
3     c       a           3
4     d       a           4
5     e       a           5
6     f       a           6
7     g       a           7
8     h       a           8
9     i       a           9
10    j       a          10
11    k       a          11
12    a       b          14
13    b       b          15
14    c       b          16
15    d       b          17
16    e       b          18
17    f       b          19
18    g       b          20
19    h       b          21
20    i       b          22
21    j       b          23
22    k       b          24
23    a       c          25
24    b       c          26
25    c       c          27
26    d       c          28
27    e       c          29
28    f       c          30
29    g       c          31
30    h       c          32
31    i       c          33
32    j       c          34
33    k       c          36

And a second data frame (df2) that holds one specific sample:

## one reference sample ("g") measured on the same eleven genes a-k
gene <- letters[1:11]
sample2 <- rep("g", times = length(gene))
## expression values are kept as character strings
expression2 <- c(
  "14.7", "15", "17", "16", "18", "20", "21", "22", "23", "24", "25"
)
   gene  sample2 expression2
   <chr> <chr>   <chr>
 1 a     g       14.7
 2 b     g       15
 3 c     g       17
 4 d     g       16
 5 e     g       18
 6 f     g       20
 7 g     g       21
 8 h     g       22
 9 i     g       23
10 j     g       24
11 k     g       25

What I want to do is test the correlation between the sample in df2 and each of the different samples in df1.

I thought that would go like this:

## Join the reference sample (df2) to every sample in df1 gene by gene, then
## compute one correlation per (sample1, sample2) pair.
## The expression columns are stored as character strings, and cor() only
## accepts numeric input, so they are converted before correlating.
test <- left_join(x = df2, y = df1, by = "gene") %>%
  group_by(sample1, sample2) %>%
  summarize(
    cor = cor(as.numeric(expression1), as.numeric(expression2)),
    .groups = "drop"  # return a plain, ungrouped result
  )

Is this the way you would approach this as well, or would you do it differently?

CodePudding user response：

Using by.

## split the expression values by sample and correlate each group with the
## reference sample
by(
  df_all$expression,
  df_all$sample,
  function(expr) cor(expr, df_1$expression2)
)
# df_all$sample: a
# [1] 0.9873401
# --------------------------------------------------------- 
# df_all$sample: b
# [1] 0.9873401
# --------------------------------------------------------- 
# df_all$sample: c
# [1] 0.9838774

Or looping via sapply.

## one correlation per sample; vapply() (rather than sapply()) guarantees a
## named numeric vector, even when there are no samples to loop over
vapply(
  unique(df_all$sample),
  \(x) cor(df_all$expression[df_all$sample == x], df_1$expression2),
  numeric(1)
)
#         a         b         c 
# 0.9873401 0.9873401 0.9838774

Or maybe a correlation matrix.

## column names = samples, row names = genes, both in order of first appearance
cn <- unique(df_all$sample)
rn <- unique(df_all$gene)

## reshape the long expression vector to genes x samples (filled column-wise),
## then append the reference sample as an extra column `g`
m <- matrix(df_all$expression, ncol = length(cn), dimnames = list(rn, cn))
m <- cbind(m, g = df_1$expression2)
cor(m)
#           a         b         c         g
# a 1.0000000 1.0000000 0.9971765 0.9873401
# b 1.0000000 1.0000000 0.9971765 0.9873401
# c 0.9971765 0.9971765 1.0000000 0.9838774
# g 0.9873401 0.9873401 0.9838774 1.0000000

Edit

You could wrap this in a function that also takes care of gene order and of matching genes.

## Correlation matrix between every sample of `x` and the sample in `y`.
##
## x: data frame with columns `gene`, `sample` and `expression` (long format,
##    one row per gene/sample pair).
## y: data frame with columns `gene` and `expression2`; its values are bound
##    as ONE extra column, so it is expected to hold a single sample.
## Returns the cor() matrix whose rows/columns are the samples of `x` plus
## `g` (the fixed label given to the column taken from `y`).
## Stops with an error if no gene is shared by `y` and every sample of `x`.
scorr <- function(x, y) {
  ## match genes: keep only genes present in `y` AND in every sample of `x`.
  ## A gene missing from just one sample would leave the expression vector
  ## too short, and matrix() would recycle values into the wrong cells.
  mg <- Reduce(intersect, split(x$gene, x$sample, drop = TRUE), unique(y$gene))
  if (length(mg) == 0L) {
    stop("no gene is shared by `y` and every sample of `x`", call. = FALSE)
  }
  x <- x[x$gene %in% mg, ]
  y <- y[y$gene %in% mg, ]
  ## order so that the rows of every sample line up gene by gene
  x <- x[order(x$sample, x$gene), ]
  y <- y[order(y$gene), ]
  cn <- unique(x$sample)
  rn <- unique(x$gene)
  ## genes x samples matrix (filled column-wise), plus the column from `y`
  m <- cbind(
    matrix(x$expression, ncol = length(cn), dimnames = list(rn, cn)),
    g = y$expression2
  )
  cor(m)
}

## df_1 lists the same eleven genes (a-k) as each sample of df_all, so no
## gene is dropped by the matching step
scorr(df_all, df_1)
#           a         b         c         g
# a 1.0000000 1.0000000 0.9971765 0.9873401
# b 1.0000000 1.0000000 0.9971765 0.9873401
# c 0.9971765 0.9971765 1.0000000 0.9838774
# g 0.9873401 0.9873401 0.9838774 1.0000000

## df_2 is df_1 without rows 2 and 5 (genes "b" and "e"), so the correlations
## are computed on the nine genes the two data frames share
scorr(df_all, df_2)
#           a         b         c         g
# a 1.0000000 1.0000000 0.9966446 0.9859516
# b 1.0000000 1.0000000 0.9966446 0.9859516
# c 0.9966446 0.9966446 1.0000000 0.9818871
# g 0.9859516 0.9859516 0.9818871 1.0000000

Data:

## long-format expression data: samples "a", "b", "c", each measured on the
## same eleven genes a-k; expression values are integers
df_all <- data.frame(
  gene = rep(letters[1:11], times = 3),
  sample = rep(c("a", "b", "c"), each = 11),
  expression = c(1:11, 14:34, 36L)
)



## reference sample "g" measured on the eleven genes a-k
df_1 <- data.frame(
  gene = letters[1:11],
  sample = rep("g", times = 11),
  expression2 = c(14.7, 15, 17, 16, 18, 20, 21, 22, 23, 24, 25)
)

## same sample with rows 2 and 5 (genes "b" and "e") left out
df_2 <- df_1[setdiff(seq_len(nrow(df_1)), c(2L, 5L)), ]