I want to calculate the fold change between thyroid
and testes
dataframe using TPM values and provide the top 10 genes overexpressed in testes tissue (testes$gene_id
in the testes
dataframe).
In my code below, I first calculated the fold change and store it as a numeric vector tpm.foldchange
but then I don't know how to sort the gene_id
column of the testes
dataframe based on the sorted fold-change values tpm.foldchange
.
# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="\t")
testes <- read.table("testes.genes.results", header=T, sep="\t")
# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM) 1)
testes.tpm <- log(testes %>% pull(TPM) 1)
# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")
# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]
I initially wanted to sort after merging like this:
tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)
but it raised an error:
Error: cannot allocate vector of size 8.0 Gb
thyroid
dataframe:
# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96,
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21,
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA,
20L), class = "data.frame")
testes
dataframe:
# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56,
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89,
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA,
20L), class = "data.frame")
Based on Akrun's comment, I've attempted:
library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>%
left_join(thyroid, by="gene_id") %>%
mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
slice_max(n=10, order_by=tpm.foldchange)
Output:
> dim(testes.sorted)
[1] 304 15
> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT",
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E",
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3",
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5,
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15,
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04),
expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91),
TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08,
0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01,
0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf,
Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")
This code returns a dataframe with (304, 15) dimensions. But I'm only looking for the top ten genes. Also, please note that thyroid.tpm
is the log2-transformed TPM values.
CodePudding user response:
If we want to order by the foldchange
, do a join first, and arrange
based on the foldchange between the 'TPM' columns
library(dplyr)
library(gtools)
testes2 <- testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(across(starts_with("TPM"), ~ log(.x 1),
.names = "tpm_{.col}")) %>%
mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>%
filter(is.finite(foldchange)) %>%
arrange(tpm_TPM.x) %>%
dplyr::select(gene_id, TPM = TPM.x, foldchange) %>%
slice_head(n = 10)
If we want to select top 10 foldchange rows, use slice_max
testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x 1), log(TPM.y 1)),
.keep = "unused") %>%
filter(is.finite(foldchange)) %>%
slice_max(n = 10, order_by = foldchange, with_ties = FALSE)
-output
gene_id TPM foldchange
1 gene100_C1orf233 9.45 1.786222
2 gene10000_CD86 3.09 1.434249
3 gene10007_LOC102723582 1.38 1.288517
4 gene10016_HSPBAP1 22.04 1.217311
5 gene10012_PARP15 0.59 1.162893
6 gene10005_FAM162A 120.65 1.089205
7 gene10010_PARP9 20.39 1.011953
8 gene1_WASH7P 47.56 1.008704
9 gene10011_DTX3L 13.66 -1.033968
10 gene10003_CSTA 3.73 -1.076854
The merge
results in memory error because it was done on two vectors creating a cartesian join