Home > Software engineering >  Is it possible to pivot_longer multiple columns with one name column and two values columns?
Is it possible to pivot_longer multiple columns with one name column and two values columns?

Time:01-21

This is how my data looks like with columns matching a gen ID and a timepoint (_01 or _03)


        id          sample_name placa_ppard_01 placa_pparg_01 placa_nr1h3_01
1 50109018 A2535500018000006001       17.17521       24.10186       22.05157
2 50109019 A2535500019000006001       17.17521       24.10186       22.05157
3 50109025 A2535500025000006001       17.17521       24.10186       22.05157
4 50109026 A2535500026000006001       17.17521       24.10186       22.05157
5 50109027 A2535500027000006001       17.33383       24.38065       21.98426
6 50118001 A2536300001000006001       17.17521       24.10186       22.05157
  placa_nr1h2_01 placa_ldlr_03 placa_pcsk9_03 placa_olr1_03 placa_msr1_03
1       18.41557      18.89107             NA      23.93779      20.58086
2       18.41557      18.89107             NA      23.93779      20.58086
3       18.41557      18.89107             NA      23.93779      20.58086
4       18.41557      18.89107             NA      23.93779      20.58086
5       18.37935      19.12857             NA      23.22024      20.36100
6       18.41557      18.89107             NA      23.93779      20.58086


### a small piece of data

df <- structure(list(id = structure(c(130102009, 50203017, 50508027, 
140102087, 70201047, 50109027, 130102012, 120715030, 130106034, 
120715037, 60901020, 60901023, 140102092, 50203030, 50203004), format.spss = "F10.0", display_width = 10L), 
    placa_rxrb_01 = structure(c(17.3644166666667, 17.4306458333333, 
    17.2125, 17.4306458333333, 16.8459791666667, 17.3644166666667, 
    17.4306458333333, 17.3336875, 17.4306458333333, 17.4459166666667, 
    17.3644166666667, 17.3644166666667, 17.3336875, 17.2125, 
    17.4306458333333), label = "lab_exp_genica: ct_cal_placa_RXRB Hs00232774_m1 Ct basal", format.spss = "F10.7", display_width = 10L), 
    placa_cyp27a1_01 = structure(c(18.1048043478261, 18.0184893617021, 
    17.9057708333333, 18.0184893617021, 17.6525625, 18.1048043478261, 
    18.0184893617021, 17.7276875, 18.0184893617021, 17.5819565217391, 
    18.1048043478261, 18.1048043478261, 17.7276875, 17.9057708333333, 
    18.0184893617021), label = "lab_exp_genica: ct_cal_placa_CYP27A1 Hs01017992_g1 Ct basal", format.spss = "F11.8", display_width = 11L), 
    placa_abca1_01 = structure(c(18.0373958333333, 17.7479791666667, 
    17.9839791666667, 17.7479791666667, 17.7181041666667, 18.0373958333333, 
    17.7479791666667, 17.8329583333333, 17.7479791666667, 17.9318260869565, 
    18.0373958333333, 18.0373958333333, 17.8329583333333, 17.9839791666667, 
    17.7479791666667), label = "lab_exp_genica: ct_cal_placa_ABCA1 Hs01059101_m1 Ct basal", format.spss = "F11.8", display_width = 11L), 
    placa_scarb1_01 = structure(c(NA, 20.2697659574468, 20.5369583333333, 
    20.2697659574468, 20.1755208333333, 20.6629111111111, 20.2697659574468, 
    20.471829787234, 20.2697659574468, 20.2979166666667, 20.6629111111111, 
    20.6629111111111, 20.471829787234, 20.5369583333333, 20.2697659574468
    ), label = "lab_exp_genica: ct_cal_placa_SCARB1 Hs00969821_m1 Ct basal", format.spss = "F10.7", display_width = 10L), 
    placa_cav1_01 = structure(c(23.9575208333333, 23.5285625, 
    24.01425, 23.5285625, 23.1588541666667, 23.9575208333333, 
    23.5285625, 23.5898125, 23.5285625, 23.513875, 23.9575208333333, 
    23.9575208333333, 23.5898125, 24.01425, 23.5285625), label = "lab_exp_genica: ct_cal_placa_CAV1 Hs00971716_m1 Ct basal", format.spss = "F10.7", display_width = 10L), 
    placa_nfkb1_01 = structure(c(16.8749583333333, 16.6195, 16.7741041666667, 
    16.6195, 16.1903125, 16.8749583333333, 16.6195, 16.608375, 
    16.6195, NA, 16.8749583333333, 16.8749583333333, 16.608375, 
    16.7741041666667, 16.6195), label = "lab_exp_genica: ct_cal_placa_NFKB1 Hs00765730_m1 Ct basal", format.spss = "F10.7", display_width = 10L), 
    placa_tgfb2_03 = structure(c(24.0563043478261, 23.6882553191489, 
    24.1700416666667, 23.6882553191489, 23.4935416666667, 24.0563043478261, 
    23.6882553191489, 24.02575, 23.6882553191489, 23.7865208333333, 
    24.0563043478261, 24.0563043478261, 24.02575, 24.1700416666667, 
    23.6882553191489), label = "lab_exp_genica: ct_cal_placa_TGFB2 Hs00234244_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L), 
    placa_ido_03 = structure(c(19.5457916666667, 19.4714375, 
    19.6080212765957, 19.4714375, 18.3972708333333, 19.5457916666667, 
    19.4714375, 19.4884166666667, 19.4714375, 19.070447368421, 
    19.5457916666667, 19.5457916666667, 19.4884166666667, 19.6080212765957, 
    19.4714375), label = "lab_exp_genica: ct_cal_placa_IDO Hs00984148_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L), 
    placa_cdkn2a_03 = structure(c(21.1584583333333, 21.0697291666667, 
    21.1403541666667, 21.0697291666667, 20.7684680851064, 21.1584583333333, 
    21.0697291666667, 21.1548125, 21.0697291666667, 21.0899189189189, 
    21.1584583333333, 21.1584583333333, 21.1548125, 21.1403541666667, 
    21.0697291666667), label = "lab_exp_genica: ct_cal_placa_CDKN2A Hs00923894_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L), 
    placa_abcg1_03 = structure(c(18.7376808510638, 18.3815957446809, 
    18.6998333333333, 18.3815957446809, 18.1750833333333, 18.7376808510638, 
    18.3815957446809, 18.5114893617021, 18.3815957446809, 18.3424375, 
    18.7376808510638, 18.7376808510638, 18.5114893617021, 18.6998333333333, 
    18.3815957446809), label = "lab_exp_genica: ct_cal_placa_ABCG1 Hs00245154_m1 Ct 1 year", format.spss = "F10.7", display_width = 10L), 
    placa_abcg4_03 = structure(c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, 27.677, NA, NA, NA, NA, NA), label = "lab_exp_genica: ct_cal_placa_ABCG4 Hs00223446_m1 Ct 1 year", format.spss = "F6.3"), 
    placa_nfe2l2_03 = structure(c(17.2223404255319, 16.8399787234043, 
    17.1038541666667, 16.8399787234043, 16.5560208333333, 17.2223404255319, 
    16.8399787234043, 16.9331063829787, 16.8399787234043, 17.1034166666667, 
    17.2223404255319, 17.2223404255319, 16.9331063829787, 17.1038541666667, 
    16.8399787234043), label = "lab_exp_genica: ct_cal_placa_NFE2L2 Hs00975961_g1 Ct 1 year", format.spss = "F10.7", display_width = 10L)), row.names = c(NA, 
-15L), class = c("tbl_df", "tbl", "data.frame"))

As you can see the pattern is placa_gen_time. I want something like this:


gen_cal     cal01         cal03
ppard      17.17521         NA
pparg      24.10186         NA
olr1        NA              23.93779      
msr1        NA              20.58086

I guess it could be done in more than one step with sthg like this (genes2 my database). But I am pretty sure that pivot_longer function allows to transform the colnames into 2 columns for values and one column for the name (=genes)


genes2 <- genes2 %>% pivot_longer(cols = matches("placa_"), names_to = c("gen_cal", "time_cal"), names_sep = "_0",values_to = "Ct_cal")

> head
 gen_cal time_cal Ct_cal
  <chr>   <chr>     <dbl>
1 ppara   1          19.7
2 ppard   1          17.2
3 pparg   1          24.1
4 nr1h3   1          22.1
5 nr1h2   1          18.4
6 rxra    1          14.3


# And then pivot_wider the column time and Ct_cal into cal01 and cal03

CodePudding user response:

We need to reshape twice, wide-to-long, split string, then reshape it again long-to-wide, using data.table:

library(data.table)

setDT(df)

dcast(
  melt(df, id.vars = "id")[, c("gene", "time") := tstrsplit(variable, split = "_", fixed = TRUE, keep = 2:3)],
      id   gene ~ time)
#             id   gene       01       03
#   1:  50109027  abca1 18.03740       NA
#   2:  50109027  abcg1       NA 18.73768
#   3:  50109027  abcg4       NA       NA
#   4:  50109027   cav1 23.95752       NA
#   5:  50109027 cdkn2a       NA 21.15846
#  ---                                   
# 176: 140102092 nfe2l2       NA 16.93311
# 177: 140102092  nfkb1 16.60837       NA
# 178: 140102092   rxrb 17.33369       NA
# 179: 140102092 scarb1 20.47183       NA
# 180: 140102092  tgfb2       NA 24.02575

CodePudding user response:

Seems like you are looking for .value in the names_to argument.

library(tidyverse)

df %>% 
  pivot_longer(starts_with("placa"), names_to = c("gene", ".value"), 
               names_pattern = "placa_(.*)_(.*)") %>% 
  setNames(c("id", "gene", "cal01", "cal03"))

Update

If we do not use setNames after the above pivot_longer, the pivoted column names would only contain numbers, which is not allowed in base R data frame (no problem for tibble).

# A tibble: 180 × 4
          id gene     `01`  `03`
       <dbl> <chr>   <dbl> <dbl>
 1 130102009 rxrb     17.4  NA

A workaround for the column names is to rename them prior to pivot_longer to attach the string "cal" before the "0".

df %>% 
  rename_with(.cols = contains("_0"), ~sub("_0", "_cal0", .x)) %>% 
  pivot_longer(starts_with("placa"), names_to = c("gene", ".value"), 
               names_pattern = "placa_(.*)_(.*)")

Output

# A tibble: 180 × 4
          id gene    cal01 cal03
       <dbl> <chr>   <dbl> <dbl>
 1 130102009 rxrb     17.4  NA  
 2 130102009 cyp27a1  18.1  NA  
 3 130102009 abca1    18.0  NA  
 4 130102009 scarb1   NA    NA  
 5 130102009 cav1     24.0  NA  
 6 130102009 nfkb1    16.9  NA  
 7 130102009 tgfb2    NA    24.1
 8 130102009 ido      NA    19.5
 9 130102009 cdkn2a   NA    21.2
10 130102009 abcg1    NA    18.7
# … with 170 more rows
# ℹ Use `print(n = ...)` to see more rows
  • Related