I have two gene expression DataFrames, one includes the predicted expression values (expr_mat1
) and the other includes the actual expression values (expr_mat2
) for a list of individuals. I am looking for to compute for each gene, the correlation coefficient (r) between these two in R.
Here is the structure of the both DataFrames.
expr_mat1 <- structure(list(IID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00104", "HG00106", "HG00108"
), ENSG00000177757 = c(0, 0.0307684166595614, 0, 0, 0.0307684166595614,
0.0615368333191228, 0, 0, 0, 0), ENSG00000225880 = c(-0.00798264310754959,
0.212374890223598, 0, 0, 0.220357533331148, 0.471551056712218,
-0.00798264310754959, -0.0388186331574725, -0.00798264310754959,
0), ENSG00000237491 = c(0, 0.0198094891643385, 0, 0, 0.0198094891643385,
0.0396189783286769, 0, 0, 0, 0), ENSG00000272438 = c(0.46351863909862,
0.508078766224831, 0.136340903763736, 0.396764255633314, 0.159346873334455,
0.014682408288098, 0.61932863325432, 0.316617042857239, 0.37814367178285,
0), ENSG00000230699 = c(0.503969263130909, 0.298187609689613,
0.0717403640054745, 0.0795822440269086, 0.101642119209388, 0.146101908549708,
0.43744959860811, 0.357867354581201, 0.278285110554293, -0.0277443791567545
), ENSG00000223764 = c(0.227149920782397, 0.510470328707094,
0.0975308442343844, 0.47092279297714, 0.187602385052443, 0.0413603570920838,
0.0957180228722546, 0.0957180228722546, 0.208612306162744, 0),
ENSG00000187961 = c(-0.223676988249188, -0.275463238643241,
0.00139582866980773, -0.0474096334801696, -0.175345193021951,
-0.146205255457578, -0.021750296188788, -0.0488225751745732,
-0.124209536362999, -0.0717343430355702), ENSG00000169972 = c(0.387031483027206,
0.19101953914156, 0.0589208026011161, 0.163994242580126,
0.0611907682839187, 0.179410915931091, 0.027732052654054,
0.219011741849398, 0.158963858620203, 0.0589208026011161),
ENSG00000248333 = c(-0.975293612634182, -0.71684852732366,
-0.429677500907912, -0.227608297432187, 0.0838011666206142,
-0.44738336909112, -0.375400739053435, -0.950699153047804,
-0.178804559724491, -0.406197173997756)), row.names = c(NA,
10L), class = "data.frame")
expr_mat2 <- structure(list(IID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00104", "HG00106", "HG00108"
), ENSG00000177757 = c(0, -0.0388186321574725, 0, 0, 0.0307684166595614,
0.0615368333191228, 0, 0, 0, 0), ENSG00000225880 = c(-0.00798264310754959,
0.212374890223598, 0, 0, 0.220357533331148, 0.471551056712218,
-0.00798264310754959, -0.0388186331574725, -0.00798264310754959,
0), ENSG00000237491 = c(0, 0.0198094891643385, 0, 0, 0.0198094891643385,
0.0396189783286769, 0, 0.1636954, 0, 0), ENSG00000272438 = c(0.46351863909862,
0.508078766224831, 0.136340903763736, 0, 0.159346873334455,
0.014682408288098, 1.8301125, 0.316617042857239, 0.37814367178285,
0), ENSG00000230699 = c(0.503969263130909, 0.298187609689613,
0.0717403640054745, 0.0795822440269086, 0.101642119209388, 0.146101908549708,
0.43744959860811, 0.357867354581201, 0.278285110554293, -0.0277443791567545
), ENSG00000223764 = c(0.49683948, 0.510470328707094,
0.0975308442343844, 0.47092279297714, 0.187602385052443, 0.0413603570920838,
0.0957180228722546, 0.0957180228722546, 0.208612306162744, 0),
ENSG00000187961 = c(-0.223676988249188, -0.275463238643241,
0.00139582866980773, -0.0474096334801696, -0.175345193021951,
-0.146205255457578, 0, -0.0488225751745732,
-0.124209536362999, -0.0717343430355702), ENSG00000169972 = c(0.387031483027206,
0.19101953914156, 0.0589208026011161, 0.163994242580126,
0.0611907682839187, 0.179410915931091, 0.027732052654054,
0.219011741849398, 0.158963858620203, 0.0589208026011161),
ENSG00000248333 = c(-0.975293612634182, -0.71684852732366,
0.49683948, -0.227608297432187, 0.0838011666206142,
-0.44738336909112, 0.49683948, -0.950699153047804,
-0.178804559724491, -0.406197173997756)), row.names = c(NA,
10L), class = "data.frame")
The desired output would be a DataFrame including two columns where the first colum is the ENSGs indexes and the second column is the corresponding correlation scores.
UPDATE: Here is the workaround I made following the @G. Grothendieck solution.
NOTE: Assueming that the number of rows and columns of the two DataFrames are not equal, I need first to deal with this issue. Thus, I got the intersection of the column names as well as the first column separately.
library(dplyr)
# get the common colnames
nms <- intersect(colnames(expr_mat1),
colnames(expr_mat2))
# filter based on colnames (expr_mat1)
expr_mat1_nms <- expr_mat1[, nms]
# filter based on colnames (expr_mat2)
expr_mat2 <- as.data.frame(expr_mat2)
expr_mat2_nms <- expr_mat2[, nms]
# get the common rows of the first column
indv <- intersect(expr_mat1_nms[, 1],
expr_mat2_nms[, 1])
# filter based on the first column (expr_mat1)
expr_mat1_nms_indv <- expr_mat1_nms[
expr_mat1_nms$IID %in% indv]
# filter based on the first column (expr_mat2)
expr_mat2_nms_indv <- expr_mat2_nms[
expr_mat2_nms$IID %in% indv]
# convert columns of data frame from character to numeric and remove the first column.
expr1 <- data.frame(apply(expr_mat1_nms_indv[,-1], 2,
function(x) as.numeric(as.character(x))))
expr2 <- data.frame(apply(expr_mat2_nms_indv[,-1], 2,
function(x) as.numeric(as.character(x))))
# compute correlation coefficient
corr <- cbind(x = stack(expr1), y = stack(expr2)) %>%
group_by(x.ind) %>%
summarize(cor = cor(x.values, y.values), .groups = "drop")
colnames(corr) <- c("ENSG", "Cor")
> print(corr)
# A tibble: 10 × 2
ENSG Cor
<fct> <dbl>
1 ENSG00000177757 0.407
2 ENSG00000225880 0.310
3 ENSG00000237491 0.121
4 ENSG00000230699 0.143
5 ENSG00000223764 0.196
6 ENSG00000187961 0.176
7 ENSG00000169972 -0.0209
8 ENSG00000248333 0.0515
9 ENSG00000131591 0.0270
10 ENSG00000160072 0.0477
CodePudding user response:
1) Convert each matrix to a data frame, remove the first column and then take correlations of corresponding columns. This gives a list and we use stack to convert that to a a 2 column data frame. At the end we reverse the order of its columns and add nicer coiumn names.
Map(cor, as.data.frame(expr_mat1)[-1], as.data.frame(expr_mat2)[-1]) |>
stack() |>
rev() |>
setNames(c("Name", "cor"))
## Name cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879
2) This alternative is inefficient but if that does not matter it avoids using Map. It computes the entire n x n correlation matrix of every column of expr_mat1[, -1] with every column of expr_mat2[, -1] and then takes the diagonal to get the correlations we need. Then it proceeds as above.
cor(expr_mat1[, -1], expr_mat2[, -1]) |>
diag() |>
stack() |>
rev() |>
setNames(c("Name", "cor"))
## Name cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879
3) Another approach is to convert the two matrices to long form using stack and then grouping by the original column names perform the correlations.
library(dplyr)
cbind(x = stack(expr_mat1[, -1]), y = stack(expr_mat2[, -1])) %>%
group_by(x.ind) %>%
summarize(cor = cor(x.values, y.values), .groups = "drop")
## # A tibble: 9 × 2
## x.ind cor
## <fct> <dbl>
## 1 ENSG00000177757 0.578
## 2 ENSG00000225880 1
## 3 ENSG00000237491 0.0678
## 4 ENSG00000272438 0.726
## 5 ENSG00000230699 1
## 6 ENSG00000223764 0.903
## 7 ENSG00000187961 0.998
## 8 ENSG00000169972 1
## 9 ENSG00000248333 0.700
4) Another approach is to create a complex number from corresponding elements of the two matrices giving a single matrix and the iterate over the columns picking apart the real and complex portions and taking the correlation of that. Then proceed as above.
apply(expr_mat1[, -1] expr_mat2[, -1] * 1i, 2, function(x) cor(Re(x), Im(x))) |>
stack() |>
rev() |>
setNames(c("Name", "cor"))
## Name cor
## 1 ENSG00000177757 0.57845349
## 2 ENSG00000225880 1.00000000
## 3 ENSG00000237491 0.06779577
## 4 ENSG00000272438 0.72625656
## 5 ENSG00000230699 1.00000000
## 6 ENSG00000223764 0.90253534
## 7 ENSG00000187961 0.99765682
## 8 ENSG00000169972 1.00000000
## 9 ENSG00000248333 0.70022879