traspose columns and row in r and extracting certain values-CodePudding

I'm new on R, so apologies in advance. I have a data.frame (df_final_a) with 800 columns and 16.000 rows. It looks like this:

**chrom gene mutation patient1 patient2 patient3**
 
  chr1   abc    xxx      ./.     0/1    ./. 
  chr1   def    yxx      1/1     ./.    0/1 
  chr2   ghi    yyx      0/1     ./.    ./. 
  chr2   lmn    yyy      ./.     1/1    ./.

./. = this means that the patient doesn't have that mutation.

0/1 and other numbers mean that the patient has a certain type of that mutation

From this data frame I want to have two other tables:

dfA:

    **patient    mutation**           

      patient1     yxx(1/1), yyx(0/1) 
      patient2     xxx(0/1), yyy(1/1)
      patient3     yxx(0/1)

dfB:

    **patient            chrom gene   mutation**

      patient2           chr1   abc     xxx       
      patient1, patient3 chr1   def     yxx        
      patient1           chr2   ghi     yyx        
      patient2           chr2   lmn     yyy

thank you for any suggestion you can give me

CodePudding user response：

It will be easier to work with this data if you reshape it into longer (aka tidy) format, with one row per observation of patient-chrom-gene-mutation. Then the two outputs you're looking for can each be made with a group_by and summarize:

library(dplyr); library(tidyr)
df_long <- df_final_a %>%
  pivot_longer(starts_with("patient"), names_to = "patient") %>%
  filter(value != "./.")  

dfA <- df_long %>%
  group_by(patient) %>%
  summarize(mutation = paste0(mutation, "(", value, ")", collapse = ", "))

#> dfA
## A tibble: 3 × 2
#  patient  mutation          
#  <chr>    <chr>             
#1 patient1 yxx(1/1), yyx(0/1)
#2 patient2 xxx(0/1), yyy(1/1)
#3 patient3 yxx(0/1) 


dfB <- df_long %>%
  group_by(chrom, gene, mutation) %>%
  summarize(patient = paste(patient, collapse = ", "), .groups = "drop") %>%
  relocate(patient, .before = 1)

#> dfB
## A tibble: 4 × 4
#  patient            chrom gene  mutation
#  <chr>              <chr> <chr> <chr>   
#1 patient2           chr1  abc   xxx     
#2 patient1, patient3 chr1  def   yxx     
#3 patient1           chr2  ghi   yyx     
#4 patient2           chr2  lmn   yyy

Sample data

df_final_a <- data.frame(
  stringsAsFactors = FALSE,
             chrom = c("chr1", "chr1", "chr2", "chr2"),
              gene = c("abc", "def", "ghi", "lmn"),
          mutation = c("xxx", "yxx", "yyx", "yyy"),
          patient1 = c("./.", "1/1", "0/1", "./."),
          patient2 = c("0/1", "./.", "./.", "1/1"),
          patient3 = c("./.", "0/1", "./.", "./.")
)

CodePudding user response：

library(tidyverse)
library(tibble)

tbl <- tribble(
  ~chrom, ~gene, ~mutation, ~patient1, ~patient2, ~patient3,
  'chr1',   'abc',    'xxx',      './.',     '0/1',    './.',
  'chr1',   'def',    'yxx',      '1/1',     './.',    '0/1', 
  'chr2',   'ghi',    'yyx',      '0/1',     './.',    './.', 
  'chr2',   'lmn',    'yyy',      './.',     '1/1',    './.'
)


(
  dfA <-
    pivot_longer(tbl, cols = starts_with("patient"),
                 names_to = "patient") |>
    filter(value != "./.") |>
    mutate(mutation = glue::glue("{mutation}({value})")) |>
    group_by(patient) |>
    summarise(mutation = paste(mutation, collapse = ", "))
)
#> # A tibble: 3 × 2
#>   patient  mutation          
#>   <chr>    <chr>             
#> 1 patient1 yxx(1/1), yyx(0/1)
#> 2 patient2 xxx(0/1), yyy(1/1)
#> 3 patient3 yxx(0/1)


(
  dfB <-
    pivot_longer(tbl, cols = starts_with("patient"),
                 names_to = "patient") |>
    filter(value != "./.") |>
    group_by(chrom, gene, mutation) |>
    summarise(patient = paste(patient, collapse = ", ")) |>
    relocate(patient, .before = 1)
)
#> `summarise()` has grouped output by 'chrom', 'gene'. You can override using the
#> `.groups` argument.
#> # A tibble: 4 × 4
#> # Groups:   chrom, gene [4]
#>   patient            chrom gene  mutation
#>   <chr>              <chr> <chr> <chr>   
#> 1 patient2           chr1  abc   xxx     
#> 2 patient1, patient3 chr1  def   yxx     
#> 3 patient1           chr2  ghi   yyx     
#> 4 patient2           chr2  lmn   yyy