I'm new on R, so apologies in advance. I have a data.frame (df_final_a) with 800 columns and 16.000 rows. It looks like this:
**chrom gene mutation patient1 patient2 patient3**
chr1 abc xxx ./. 0/1 ./.
chr1 def yxx 1/1 ./. 0/1
chr2 ghi yyx 0/1 ./. ./.
chr2 lmn yyy ./. 1/1 ./.
./. = this means that the patient doesn't have that mutation.
0/1 and other numbers mean that the patient has a certain type of that mutation
From this data frame I want to have two other tables:
dfA:
**patient mutation**
patient1 yxx(1/1), yyx(0/1)
patient2 xxx(0/1), yyy(1/1)
patient3 yxx(0/1)
dfB:
**patient chrom gene mutation**
patient2 chr1 abc xxx
patient1, patient3 chr1 def yxx
patient1 chr2 ghi yyx
patient2 chr2 lmn yyy
thank you for any suggestion you can give me
CodePudding user response:
It will be easier to work with this data if you reshape it into longer (aka tidy) format, with one row per observation of patient-chrom-gene-mutation. Then the two outputs you're looking for can each be made with a group_by
and summarize
:
library(dplyr); library(tidyr)
df_long <- df_final_a %>%
pivot_longer(starts_with("patient"), names_to = "patient") %>%
filter(value != "./.")
dfA <- df_long %>%
group_by(patient) %>%
summarize(mutation = paste0(mutation, "(", value, ")", collapse = ", "))
#> dfA
## A tibble: 3 × 2
# patient mutation
# <chr> <chr>
#1 patient1 yxx(1/1), yyx(0/1)
#2 patient2 xxx(0/1), yyy(1/1)
#3 patient3 yxx(0/1)
dfB <- df_long %>%
group_by(chrom, gene, mutation) %>%
summarize(patient = paste(patient, collapse = ", "), .groups = "drop") %>%
relocate(patient, .before = 1)
#> dfB
## A tibble: 4 × 4
# patient chrom gene mutation
# <chr> <chr> <chr> <chr>
#1 patient2 chr1 abc xxx
#2 patient1, patient3 chr1 def yxx
#3 patient1 chr2 ghi yyx
#4 patient2 chr2 lmn yyy
Sample data
df_final_a <- data.frame(
stringsAsFactors = FALSE,
chrom = c("chr1", "chr1", "chr2", "chr2"),
gene = c("abc", "def", "ghi", "lmn"),
mutation = c("xxx", "yxx", "yyx", "yyy"),
patient1 = c("./.", "1/1", "0/1", "./."),
patient2 = c("0/1", "./.", "./.", "1/1"),
patient3 = c("./.", "0/1", "./.", "./.")
)
CodePudding user response:
library(tidyverse)
library(tibble)
tbl <- tribble(
~chrom, ~gene, ~mutation, ~patient1, ~patient2, ~patient3,
'chr1', 'abc', 'xxx', './.', '0/1', './.',
'chr1', 'def', 'yxx', '1/1', './.', '0/1',
'chr2', 'ghi', 'yyx', '0/1', './.', './.',
'chr2', 'lmn', 'yyy', './.', '1/1', './.'
)
(
dfA <-
pivot_longer(tbl, cols = starts_with("patient"),
names_to = "patient") |>
filter(value != "./.") |>
mutate(mutation = glue::glue("{mutation}({value})")) |>
group_by(patient) |>
summarise(mutation = paste(mutation, collapse = ", "))
)
#> # A tibble: 3 × 2
#> patient mutation
#> <chr> <chr>
#> 1 patient1 yxx(1/1), yyx(0/1)
#> 2 patient2 xxx(0/1), yyy(1/1)
#> 3 patient3 yxx(0/1)
(
dfB <-
pivot_longer(tbl, cols = starts_with("patient"),
names_to = "patient") |>
filter(value != "./.") |>
group_by(chrom, gene, mutation) |>
summarise(patient = paste(patient, collapse = ", ")) |>
relocate(patient, .before = 1)
)
#> `summarise()` has grouped output by 'chrom', 'gene'. You can override using the
#> `.groups` argument.
#> # A tibble: 4 × 4
#> # Groups: chrom, gene [4]
#> patient chrom gene mutation
#> <chr> <chr> <chr> <chr>
#> 1 patient2 chr1 abc xxx
#> 2 patient1, patient3 chr1 def yxx
#> 3 patient1 chr2 ghi yyx
#> 4 patient2 chr2 lmn yyy