Calculate p-values for each group-CodePudding

I have a long dataframe like this:

# Example data in long format: one row per measurement, holding the tumor
# type (a factor with four declared levels, of which only "epithelial" and
# "lymphoma" occur), the gene label and the measured value.
df <- structure(
  list(
    Tumor_type = factor(
      rep(c("epithelial", "lymphoma"), each = 10L),
      levels = c("epithelial", "lymphoma", "healthy", "sarcoma")
    ),
    # Genes cycle A, B, C, A, ... within each block of ten rows.
    Gene = rep(rep_len(c("A", "B", "C"), 10L), times = 2L),
    value = c(
      # epithelial
      55.9228661170814, 63.4145784380524, 207.26299945198,
      14.3567918830159, 55.3521592504006, 185.331104118272,
      27.6007163612577, 239.913726358976, 153.875266787649,
      13.5284557778013,
      # lymphoma
      22.5884252969717, 81.9416366341296, 197.881154317385,
      13.3558001634159, 167.58802932121, 79.1822202852964,
      37.9496510246124, 85.245769235316, 116.807115796469,
      21.8110255970795
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)

>df
Tumor_type  Gene    value
<fct>   <chr>   <dbl>
epithelial  A   55.92287
epithelial  B   63.41458
epithelial  C   207.26300
epithelial  A   14.35679
epithelial  B   55.35216
epithelial  C   185.33110
epithelial  A   27.60072
epithelial  B   239.91373
epithelial  C   153.87527
epithelial  A   13.52846
lymphoma    A   22.58843
lymphoma    B   81.94164
lymphoma    C   197.88115
lymphoma    A   13.35580
lymphoma    B   167.58803
lymphoma    C   79.18222
lymphoma    A   37.94965
lymphoma    B   85.24577
lymphoma    C   116.80712
lymphoma    A   21.81103

I would like to create a data frame with the log2 fold change (Log2FC) and the corresponding p-value from a t-test for each gene. So far I can only create a data frame with the Log2FC values, like so:

library(dplyr)

# Per-gene log2 fold change of the mean lymphoma value over the mean
# epithelial value, reported on the "epithelial" row of each gene.
LFC <- df %>%
  group_by(Tumor_type, Gene) %>%
  # .groups = "drop" returns an ungrouped tibble and avoids the regrouping
  # message that summarise() emits for multi-column groupings.
  summarise(mean = mean(value), .groups = "drop") %>%
  group_by(Gene) %>%
  # Look the two tumor types up by name instead of by position, so the result
  # does not depend on factor-level order or on how many tumor types are
  # present. match() yields NA for a missing type, which gives lfc = NA.
  mutate(
    lfc = log2(
      mean[match("lymphoma", Tumor_type)] /
        mean[match("epithelial", Tumor_type)]
    )
  ) %>%
  ungroup() %>%
  filter(Tumor_type == "epithelial") %>%
  select(-mean)

>LFC
Tumor_type  Gene    lfc
<fct>   <chr>   <dbl>
epithelial  A   -0.2191989
epithelial  B   -0.0995055
epithelial  C   -0.4724193

I want to add a column with the corresponding p-values, which I can calculate manually for one gene at a time:

# p-value of the two-sample t-test comparing gene "A" between the epithelial
# and lymphoma rows of `df`.
with(
  df,
  t.test(
    value[Tumor_type == "epithelial" & Gene == "A"],
    value[Tumor_type == "lymphoma" & Gene == "A"]
  )$p.value
)

0.74

So the question is: how can I create a final table with a p-value column, like this?

Tumor_type  Gene    lfc p-value
<fct>   <chr>   <dbl>   <dbl>
epithelial  A   -0.2191989  0.7404867
epithelial  B   -0.0995055  0.9125193
epithelial  C   -0.4724193  0.2834448

CodePudding user response：

An easier option is to reshape the data to 'wide' format with pivot_wider first, and then group by Gene and summarise:

library(dplyr)
library(tidyr)
library(data.table)
df %>%
  # Number the replicates within each Tumor_type/Gene combination, so the
  # wide rows pair up per gene regardless of the row order in `df`.
  mutate(rn = rowid(Tumor_type, Gene)) %>%
  # Reshape to wide: one column of values per tumor type.
  pivot_wider(names_from = Tumor_type, values_from = value) %>%
  group_by(Gene) %>%
  summarise(
    # log2 ratio of the mean lymphoma value to the mean epithelial value.
    # na.rm = TRUE skips the NA padding that pivot_wider() adds when the two
    # tumor types have different numbers of replicates for a gene.
    lfc = log2(mean(lymphoma, na.rm = TRUE) / mean(epithelial, na.rm = TRUE)),
    # Two-sample t-test between the two tumor-type columns.
    p_value = t.test(epithelial, lymphoma)$p.value
  ) %>%
  # Label every row with the reference tumor type, as the first column.
  mutate(Tumor_type = "epithelial", .before = 1)
# A tibble: 3 × 4
  Tumor_type Gene      lfc p_value
  <chr>      <chr>   <dbl>   <dbl>
1 epithelial A     -0.219    0.740
2 epithelial B     -0.0995   0.913
3 epithelial C     -0.472    0.283

CodePudding user response：

Another possible solution is to collect the values of each group into a list column, which makes it easy to call t.test. It is a bit like using pivot_wider, but with less code.

df %>%
  group_by(Tumor_type, Gene) %>%
  # Collect the values of each Tumor_type/Gene group into one list element.
  # .groups = "drop" makes the result explicitly ungrouped and avoids the
  # regrouping message from summarise().
  summarise(values = list(value), .groups = "drop") %>%
  group_by(Gene) %>%
  summarise(
    # Pick each tumor type's values by name rather than by position, so the
    # result does not depend on factor-level order or on extra tumor types.
    lfc = log2(
      mean(values[[match("lymphoma", Tumor_type)]]) /
        mean(values[[match("epithelial", Tumor_type)]])
    ),
    p_value = t.test(
      values[[match("epithelial", Tumor_type)]],
      values[[match("lymphoma", Tumor_type)]]
    )$p.value
  )

Output:

# A tibble: 3 x 3
  Gene      lfc  p_value
  <chr>   <dbl> <dbl>
1 A     -0.219  0.740
2 B     -0.0995 0.913
3 C     -0.472  0.283