Correlation coefficient between two dataframe in R-CodePudding

I have two gene expression DataFrames, one includes the predicted expression values (expr_mat1) and the other includes the actual expression values (expr_mat2) for a list of individuals. I am looking for to compute for each gene, the correlation coefficient (r) between these two in R.

Here is the structure of the both DataFrames.

expr_mat1 <- structure(list(IID = c("HG00096", "HG00097", "HG00099", "HG00100", 
"HG00101", "HG00102", "HG00103", "HG00104", "HG00106", "HG00108"
), ENSG00000177757 = c(0, 0.0307684166595614, 0, 0, 0.0307684166595614, 
0.0615368333191228, 0, 0, 0, 0), ENSG00000225880 = c(-0.00798264310754959, 
0.212374890223598, 0, 0, 0.220357533331148, 0.471551056712218, 
-0.00798264310754959, -0.0388186331574725, -0.00798264310754959, 
0), ENSG00000237491 = c(0, 0.0198094891643385, 0, 0, 0.0198094891643385, 
0.0396189783286769, 0, 0, 0, 0), ENSG00000272438 = c(0.46351863909862, 
0.508078766224831, 0.136340903763736, 0.396764255633314, 0.159346873334455, 
0.014682408288098, 0.61932863325432, 0.316617042857239, 0.37814367178285, 
0), ENSG00000230699 = c(0.503969263130909, 0.298187609689613, 
0.0717403640054745, 0.0795822440269086, 0.101642119209388, 0.146101908549708, 
0.43744959860811, 0.357867354581201, 0.278285110554293, -0.0277443791567545
), ENSG00000223764 = c(0.227149920782397, 0.510470328707094, 
0.0975308442343844, 0.47092279297714, 0.187602385052443, 0.0413603570920838, 
0.0957180228722546, 0.0957180228722546, 0.208612306162744, 0), 
    ENSG00000187961 = c(-0.223676988249188, -0.275463238643241, 
    0.00139582866980773, -0.0474096334801696, -0.175345193021951, 
    -0.146205255457578, -0.021750296188788, -0.0488225751745732, 
    -0.124209536362999, -0.0717343430355702), ENSG00000169972 = c(0.387031483027206, 
    0.19101953914156, 0.0589208026011161, 0.163994242580126, 
    0.0611907682839187, 0.179410915931091, 0.027732052654054, 
    0.219011741849398, 0.158963858620203, 0.0589208026011161), 
    ENSG00000248333 = c(-0.975293612634182, -0.71684852732366, 
    -0.429677500907912, -0.227608297432187, 0.0838011666206142, 
    -0.44738336909112, -0.375400739053435, -0.950699153047804, 
    -0.178804559724491, -0.406197173997756)), row.names = c(NA, 
10L), class = "data.frame")


expr_mat2 <- structure(list(IID = c("HG00096", "HG00097", "HG00099", "HG00100", 
"HG00101", "HG00102", "HG00103", "HG00104", "HG00106", "HG00108"
), ENSG00000177757 = c(0, -0.0388186321574725, 0, 0, 0.0307684166595614, 
0.0615368333191228, 0, 0, 0, 0), ENSG00000225880 = c(-0.00798264310754959, 
0.212374890223598, 0, 0, 0.220357533331148, 0.471551056712218, 
-0.00798264310754959, -0.0388186331574725, -0.00798264310754959, 
0), ENSG00000237491 = c(0, 0.0198094891643385, 0, 0, 0.0198094891643385, 
0.0396189783286769, 0, 0.1636954, 0, 0), ENSG00000272438 = c(0.46351863909862, 
0.508078766224831, 0.136340903763736, 0, 0.159346873334455, 
0.014682408288098, 1.8301125, 0.316617042857239, 0.37814367178285, 
0), ENSG00000230699 = c(0.503969263130909, 0.298187609689613, 
0.0717403640054745, 0.0795822440269086, 0.101642119209388, 0.146101908549708, 
0.43744959860811, 0.357867354581201, 0.278285110554293, -0.0277443791567545
), ENSG00000223764 = c(0.49683948, 0.510470328707094, 
0.0975308442343844, 0.47092279297714, 0.187602385052443, 0.0413603570920838, 
0.0957180228722546, 0.0957180228722546, 0.208612306162744, 0), 
    ENSG00000187961 = c(-0.223676988249188, -0.275463238643241, 
    0.00139582866980773, -0.0474096334801696, -0.175345193021951, 
    -0.146205255457578, 0, -0.0488225751745732, 
    -0.124209536362999, -0.0717343430355702), ENSG00000169972 = c(0.387031483027206, 
    0.19101953914156, 0.0589208026011161, 0.163994242580126, 
    0.0611907682839187, 0.179410915931091, 0.027732052654054, 
    0.219011741849398, 0.158963858620203, 0.0589208026011161), 
    ENSG00000248333 = c(-0.975293612634182, -0.71684852732366, 
    0.49683948, -0.227608297432187, 0.0838011666206142, 
    -0.44738336909112, 0.49683948, -0.950699153047804, 
    -0.178804559724491, -0.406197173997756)), row.names = c(NA, 
10L), class = "data.frame")

The desired output would be a DataFrame including two columns where the first colum is the ENSGs indexes and the second column is the corresponding correlation scores.

UPDATE: Here is the workaround I made following the @G. Grothendieck solution.

NOTE: Assueming that the number of rows and columns of the two DataFrames are not equal, I need first to deal with this issue. Thus, I got the intersection of the column names as well as the first column separately.

library(dplyr)
# get the common colnames
nms <- intersect(colnames(expr_mat1), 
                 colnames(expr_mat2))
       
       # filter based on colnames (expr_mat1)
       expr_mat1_nms <- expr_mat1[, nms]
       
       # filter based on colnames (expr_mat2)
       expr_mat2 <- as.data.frame(expr_mat2)
       expr_mat2_nms <- expr_mat2[, nms]

 # get the common rows of the first column
 indv <- intersect(expr_mat1_nms[, 1], 
                   expr_mat2_nms[, 1])

       # filter based on the first column (expr_mat1)
       expr_mat1_nms_indv <- expr_mat1_nms[
                                  expr_mat1_nms$IID %in% indv]
       
       # filter based on the first column (expr_mat2)
       expr_mat2_nms_indv <- expr_mat2_nms[
                                  expr_mat2_nms$IID %in% indv]

 # convert columns of data frame from character to numeric and remove the first column.
 expr1 <- data.frame(apply(expr_mat1_nms_indv[,-1], 2,
                           function(x) as.numeric(as.character(x))))

 expr2 <- data.frame(apply(expr_mat2_nms_indv[,-1], 2, 
                          function(x) as.numeric(as.character(x))))

 # compute correlation coefficient
 corr <- cbind(x = stack(expr1), y = stack(expr2)) %>% 
          group_by(x.ind) %>%
          summarize(cor = cor(x.values, y.values), .groups = "drop")
 colnames(corr) <- c("ENSG", "Cor")

> print(corr)  
# A tibble: 10 × 2
   ENSG                Cor
   <fct>             <dbl>
 1 ENSG00000177757  0.407 
 2 ENSG00000225880  0.310 
 3 ENSG00000237491  0.121 
 4 ENSG00000230699  0.143 
 5 ENSG00000223764  0.196 
 6 ENSG00000187961  0.176 
 7 ENSG00000169972 -0.0209
 8 ENSG00000248333  0.0515
 9 ENSG00000131591  0.0270
10 ENSG00000160072  0.0477

CodePudding user response：

1) Convert each matrix to a data frame, remove the first column and then take correlations of corresponding columns. This gives a list and we use stack to convert that to a a 2 column data frame. At the end we reverse the order of its columns and add nicer coiumn names.

Map(cor, as.data.frame(expr_mat1)[-1], as.data.frame(expr_mat2)[-1]) |>
  stack() |>
  rev() |>
  setNames(c("Name", "cor"))
##              Name        cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879

2) This alternative is inefficient but if that does not matter it avoids using Map. It computes the entire n x n correlation matrix of every column of expr_mat1[, -1] with every column of expr_mat2[, -1] and then takes the diagonal to get the correlations we need. Then it proceeds as above.

cor(expr_mat1[, -1], expr_mat2[, -1]) |>
  diag() |>
  stack() |>
  rev() |>
  setNames(c("Name", "cor"))
##              Name        cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879

3) Another approach is to convert the two matrices to long form using stack and then grouping by the original column names perform the correlations.

library(dplyr)

cbind(x = stack(expr_mat1[, -1]), y = stack(expr_mat2[, -1])) %>% 
  group_by(x.ind) %>%
  summarize(cor = cor(x.values, y.values), .groups = "drop")
## # A tibble: 9 × 2
##   x.ind              cor
##   <fct>            <dbl>
## 1 ENSG00000177757 0.578 
## 2 ENSG00000225880 1     
## 3 ENSG00000237491 0.0678
## 4 ENSG00000272438 0.726 
## 5 ENSG00000230699 1     
## 6 ENSG00000223764 0.903 
## 7 ENSG00000187961 0.998 
## 8 ENSG00000169972 1     
## 9 ENSG00000248333 0.700

4) Another approach is to create a complex number from corresponding elements of the two matrices giving a single matrix and the iterate over the columns picking apart the real and complex portions and taking the correlation of that. Then proceed as above.

apply(expr_mat1[, -1]   expr_mat2[, -1] * 1i, 2, function(x) cor(Re(x), Im(x))) |>
  stack() |>
  rev() |>
  setNames(c("Name", "cor"))
##              Name        cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879