Home > OS >  R: How do I sort a dataframe based on a numeric vector?
R: How do I sort a dataframe based on a numeric vector?

Time:04-11

I want to calculate the fold change between thyroid and testes dataframe using TPM values and provide the top 10 genes overexpressed in testes tissue (testes$gene_id in the testes dataframe).

In my code below, I first calculated the fold change and store it as a numeric vector tpm.foldchange but then I don't know how to sort the gene_id column of the testes dataframe based on the sorted fold-change values tpm.foldchange.

# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="\t")
testes <- read.table("testes.genes.results", header=T, sep="\t")

# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM)   1)
testes.tpm <- log(testes %>% pull(TPM)   1)

# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")

# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]

I initially wanted to sort after merging like this:

tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)

but it raised an error:

Error: cannot allocate vector of size 8.0 Gb

thyroid dataframe:

# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])

structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233", 
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA", 
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582", 
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L", 
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1", 
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96, 
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21, 
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA, 
20L), class = "data.frame")

testes dataframe:

# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233", 
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA", 
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582", 
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L", 
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1", 
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56, 
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89, 
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA, 
20L), class = "data.frame")

Based on Akrun's comment, I've attempted:

library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>% 
  left_join(thyroid, by="gene_id") %>%
  mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
  slice_max(n=10, order_by=tpm.foldchange)

Output:

> dim(testes.sorted)
[1] 304  15

> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT", 
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E", 
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3", 
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1", 
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4", 
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1", 
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146, 
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5, 
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1", 
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4", 
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1", 
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146, 
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15, 
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04), 
    expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91), 
    TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08, 
    0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01, 
    0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf, 
    Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")

This code returns a dataframe with (304, 15) dimensions. But I'm only looking for the top ten genes. Also, please note that thyroid.tpm is the log2-transformed TPM values.

CodePudding user response:

If we want to order by the foldchange, do a join first, and arrange based on the foldchange between the 'TPM' columns

library(dplyr)
library(gtools)
testes2 <- testes %>% 
   left_join(thyroid,  by = 'gene_id') %>% 
   mutate(across(starts_with("TPM"), ~ log(.x   1),
      .names = "tpm_{.col}")) %>% 
   mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>% 
   filter(is.finite(foldchange)) %>%
   arrange(tpm_TPM.x) %>%
   dplyr::select(gene_id, TPM = TPM.x,  foldchange) %>%
   slice_head(n = 10)

If we want to select top 10 foldchange rows, use slice_max

testes %>%
    left_join(thyroid,  by = 'gene_id') %>% 
     mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x   1), log(TPM.y   1)),
       .keep = "unused") %>% 
     filter(is.finite(foldchange)) %>%
     slice_max(n = 10, order_by = foldchange, with_ties = FALSE)

-output

                  gene_id    TPM foldchange
1        gene100_C1orf233   9.45   1.786222
2          gene10000_CD86   3.09   1.434249
3  gene10007_LOC102723582   1.38   1.288517
4       gene10016_HSPBAP1  22.04   1.217311
5        gene10012_PARP15   0.59   1.162893
6       gene10005_FAM162A 120.65   1.089205
7         gene10010_PARP9  20.39   1.011953
8            gene1_WASH7P  47.56   1.008704
9         gene10011_DTX3L  13.66  -1.033968
10         gene10003_CSTA   3.73  -1.076854

The merge results in memory error because it was done on two vectors creating a cartesian join

  • Related