I have a long dataframe like this:
# Example data in long format: 20 numeric values, 10 rows per tumor type
# (epithelial, lymphoma), with genes cycling through A, B, C.
# The factor keeps all four declared levels even though only two occur.
df <- structure(
  list(
    Tumor_type = factor(
      rep(c("epithelial", "lymphoma"), each = 10L),
      levels = c("epithelial", "lymphoma", "healthy", "sarcoma")
    ),
    Gene = rep(rep_len(c("A", "B", "C"), 10L), times = 2L),
    value = c(
      # epithelial
      55.9228661170814, 63.4145784380524, 207.26299945198,
      14.3567918830159, 55.3521592504006, 185.331104118272,
      27.6007163612577, 239.913726358976, 153.875266787649,
      13.5284557778013,
      # lymphoma
      22.5884252969717, 81.9416366341296, 197.881154317385,
      13.3558001634159, 167.58802932121, 79.1822202852964,
      37.9496510246124, 85.245769235316, 116.807115796469,
      21.8110255970795
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
>df
Tumor_type Gene value
<fct> <chr> <dbl>
epithelial A 55.92287
epithelial B 63.41458
epithelial C 207.26300
epithelial A 14.35679
epithelial B 55.35216
epithelial C 185.33110
epithelial A 27.60072
epithelial B 239.91373
epithelial C 153.87527
epithelial A 13.52846
lymphoma A 22.58843
lymphoma B 81.94164
lymphoma C 197.88115
lymphoma A 13.35580
lymphoma B 167.58803
lymphoma C 79.18222
lymphoma A 37.94965
lymphoma B 85.24577
lymphoma C 116.80712
lymphoma A 21.81103
I would like to create a data frame with the Log2FC and the corresponding p-values from a t-test. So far I can only create a data frame with the Log2FC values, like so:
# Per-gene log2 fold change of lymphoma relative to epithelial.
# Result: one row per Gene, labelled with Tumor_type "epithelial".
LFC <- df %>%
  # mean value for every Tumor_type x Gene combination
  group_by(Tumor_type, Gene) %>%
  summarise(mean = mean(value)) %>%
  ungroup() %>%
  group_by(Gene) %>%
  # Select the two means by tumor-type label rather than by row position, so
  # the ratio stays lymphoma / epithelial even if the factor level order
  # changes or a gene is missing one of the tumor types.
  mutate(
    lfc = log2(
      mean[Tumor_type == "lymphoma"] / mean[Tumor_type == "epithelial"]
    )
  ) %>%
  ungroup() %>%
  # keep a single row per gene and drop the helper column
  filter(Tumor_type == "epithelial") %>%
  select(-mean)
>LFC
Tumor_type Gene lfc
<fct> <chr> <dbl>
epithelial A -0.2191989
epithelial B -0.0995055
epithelial C -0.4724193
I want to add a column with the corresponding p-values, which I can calculate manually:
# p-value of a two-sample t-test (t.test defaults) comparing the values of
# gene "A" between the epithelial and lymphoma rows of df.
with(
  df,
  t.test(
    value[Tumor_type == "epithelial" & Gene == "A"],
    value[Tumor_type == "lymphoma" & Gene == "A"]
  )
)$p.value
0.74
So the question is: how can I create a final table with a p-value column, like this?
Tumor_type Gene lfc p-value
<fct> <chr> <dbl> <dbl>
epithelial A -0.2191989 0.7404867
epithelial B -0.0995055 0.9125193
epithelial C -0.4724193 0.2834448
CodePudding user response:
An easier option is to reshape the data to 'wide' format with pivot_wider
first and then do a grouped summarise:
library(dplyr)
library(tidyr)
library(data.table)
# Per-gene log2 fold change (lymphoma vs epithelial) and t-test p-value,
# computed after reshaping the tumor types into side-by-side columns.
df %>%
  # Number the replicates within each Tumor_type/Gene pair. Numbering by
  # Tumor_type alone only lines up when both tumor types list their genes in
  # exactly the same row order.
  mutate(rn = rowid(Tumor_type, Gene)) %>%
  # reshape to wide: one column of values per tumor type
  pivot_wider(names_from = Tumor_type, values_from = value) %>%
  # grouped by Gene
  group_by(Gene) %>%
  summarise(
    # log2 of the ratio of mean values (lymphoma / epithelial); na.rm guards
    # against the NA padding that appears when replicate counts differ
    lfc = log2(mean(lymphoma, na.rm = TRUE) / mean(epithelial, na.rm = TRUE)),
    # apply the t.test on the two columns and extract the p.value
    p_value = t.test(epithelial, lymphoma)$p.value
  ) %>%
  mutate(Tumor_type = 'epithelial', .before = 1)
# A tibble: 3 × 4
Tumor_type Gene lfc p_value
<chr> <chr> <dbl> <dbl>
1 epithelial A -0.219 0.740
2 epithelial B -0.0995 0.913
3 epithelial C -0.472 0.283
CodePudding user response:
Another possible solution is to collect the values of each group into a list column, which makes it easy to call t.test. It is similar to using pivot_wider, but needs less code.
# Per-gene log2 fold change (lymphoma vs epithelial) and t-test p-value,
# using a list column that holds the raw values of each Tumor_type/Gene group.
df %>%
  group_by(Tumor_type, Gene) %>%
  # one row per Tumor_type/Gene, with all of its values stored as a list element
  summarise(values = list(value)) %>%
  group_by(Gene) %>%
  summarise(
    # Pick each group's values by tumor-type label instead of by position, so
    # the result does not depend on factor level order or on how many tumor
    # types are present; a missing group raises an error instead of silently
    # comparing the wrong groups.
    lfc = log2(
      mean(values[[which(Tumor_type == "lymphoma")]]) /
        mean(values[[which(Tumor_type == "epithelial")]])
    ),
    p_value = t.test(
      values[[which(Tumor_type == "epithelial")]],
      values[[which(Tumor_type == "lymphoma")]]
    )$p.value
  )
Output:
# A tibble: 3 x 3
Gene lfc p_value
<chr> <dbl> <dbl>
1 A -0.219 0.740
2 B -0.0995 0.913
3 C -0.472 0.283