This is how my data looks like with columns matching a gen ID and a timepoint (_01 or _03)
id sample_name placa_ppard_01 placa_pparg_01 placa_nr1h3_01
1 50109018 A2535500018000006001 17.17521 24.10186 22.05157
2 50109019 A2535500019000006001 17.17521 24.10186 22.05157
3 50109025 A2535500025000006001 17.17521 24.10186 22.05157
4 50109026 A2535500026000006001 17.17521 24.10186 22.05157
5 50109027 A2535500027000006001 17.33383 24.38065 21.98426
6 50118001 A2536300001000006001 17.17521 24.10186 22.05157
placa_nr1h2_01 placa_ldlr_03 placa_pcsk9_03 placa_olr1_03 placa_msr1_03
1 18.41557 18.89107 NA 23.93779 20.58086
2 18.41557 18.89107 NA 23.93779 20.58086
3 18.41557 18.89107 NA 23.93779 20.58086
4 18.41557 18.89107 NA 23.93779 20.58086
5 18.37935 19.12857 NA 23.22024 20.36100
6 18.41557 18.89107 NA 23.93779 20.58086
### a small piece of data
df <- structure(list(id = structure(c(130102009, 50203017, 50508027,
140102087, 70201047, 50109027, 130102012, 120715030, 130106034,
120715037, 60901020, 60901023, 140102092, 50203030, 50203004), format.spss = "F10.0", display_width = 10L),
placa_rxrb_01 = structure(c(17.3644166666667, 17.4306458333333,
17.2125, 17.4306458333333, 16.8459791666667, 17.3644166666667,
17.4306458333333, 17.3336875, 17.4306458333333, 17.4459166666667,
17.3644166666667, 17.3644166666667, 17.3336875, 17.2125,
17.4306458333333), label = "lab_exp_genica: ct_cal_placa_RXRB Hs00232774_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cyp27a1_01 = structure(c(18.1048043478261, 18.0184893617021,
17.9057708333333, 18.0184893617021, 17.6525625, 18.1048043478261,
18.0184893617021, 17.7276875, 18.0184893617021, 17.5819565217391,
18.1048043478261, 18.1048043478261, 17.7276875, 17.9057708333333,
18.0184893617021), label = "lab_exp_genica: ct_cal_placa_CYP27A1 Hs01017992_g1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_abca1_01 = structure(c(18.0373958333333, 17.7479791666667,
17.9839791666667, 17.7479791666667, 17.7181041666667, 18.0373958333333,
17.7479791666667, 17.8329583333333, 17.7479791666667, 17.9318260869565,
18.0373958333333, 18.0373958333333, 17.8329583333333, 17.9839791666667,
17.7479791666667), label = "lab_exp_genica: ct_cal_placa_ABCA1 Hs01059101_m1 Ct basal", format.spss = "F11.8", display_width = 11L),
placa_scarb1_01 = structure(c(NA, 20.2697659574468, 20.5369583333333,
20.2697659574468, 20.1755208333333, 20.6629111111111, 20.2697659574468,
20.471829787234, 20.2697659574468, 20.2979166666667, 20.6629111111111,
20.6629111111111, 20.471829787234, 20.5369583333333, 20.2697659574468
), label = "lab_exp_genica: ct_cal_placa_SCARB1 Hs00969821_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_cav1_01 = structure(c(23.9575208333333, 23.5285625,
24.01425, 23.5285625, 23.1588541666667, 23.9575208333333,
23.5285625, 23.5898125, 23.5285625, 23.513875, 23.9575208333333,
23.9575208333333, 23.5898125, 24.01425, 23.5285625), label = "lab_exp_genica: ct_cal_placa_CAV1 Hs00971716_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_nfkb1_01 = structure(c(16.8749583333333, 16.6195, 16.7741041666667,
16.6195, 16.1903125, 16.8749583333333, 16.6195, 16.608375,
16.6195, NA, 16.8749583333333, 16.8749583333333, 16.608375,
16.7741041666667, 16.6195), label = "lab_exp_genica: ct_cal_placa_NFKB1 Hs00765730_m1 Ct basal", format.spss = "F10.7", display_width = 10L),
placa_tgfb2_03 = structure(c(24.0563043478261, 23.6882553191489,
24.1700416666667, 23.6882553191489, 23.4935416666667, 24.0563043478261,
23.6882553191489, 24.02575, 23.6882553191489, 23.7865208333333,
24.0563043478261, 24.0563043478261, 24.02575, 24.1700416666667,
23.6882553191489), label = "lab_exp_genica: ct_cal_placa_TGFB2 Hs00234244_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_ido_03 = structure(c(19.5457916666667, 19.4714375,
19.6080212765957, 19.4714375, 18.3972708333333, 19.5457916666667,
19.4714375, 19.4884166666667, 19.4714375, 19.070447368421,
19.5457916666667, 19.5457916666667, 19.4884166666667, 19.6080212765957,
19.4714375), label = "lab_exp_genica: ct_cal_placa_IDO Hs00984148_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_cdkn2a_03 = structure(c(21.1584583333333, 21.0697291666667,
21.1403541666667, 21.0697291666667, 20.7684680851064, 21.1584583333333,
21.0697291666667, 21.1548125, 21.0697291666667, 21.0899189189189,
21.1584583333333, 21.1584583333333, 21.1548125, 21.1403541666667,
21.0697291666667), label = "lab_exp_genica: ct_cal_placa_CDKN2A Hs00923894_m1 Ct 1 year", format.spss = "F11.8", display_width = 11L),
placa_abcg1_03 = structure(c(18.7376808510638, 18.3815957446809,
18.6998333333333, 18.3815957446809, 18.1750833333333, 18.7376808510638,
18.3815957446809, 18.5114893617021, 18.3815957446809, 18.3424375,
18.7376808510638, 18.7376808510638, 18.5114893617021, 18.6998333333333,
18.3815957446809), label = "lab_exp_genica: ct_cal_placa_ABCG1 Hs00245154_m1 Ct 1 year", format.spss = "F10.7", display_width = 10L),
placa_abcg4_03 = structure(c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, 27.677, NA, NA, NA, NA, NA), label = "lab_exp_genica: ct_cal_placa_ABCG4 Hs00223446_m1 Ct 1 year", format.spss = "F6.3"),
placa_nfe2l2_03 = structure(c(17.2223404255319, 16.8399787234043,
17.1038541666667, 16.8399787234043, 16.5560208333333, 17.2223404255319,
16.8399787234043, 16.9331063829787, 16.8399787234043, 17.1034166666667,
17.2223404255319, 17.2223404255319, 16.9331063829787, 17.1038541666667,
16.8399787234043), label = "lab_exp_genica: ct_cal_placa_NFE2L2 Hs00975961_g1 Ct 1 year", format.spss = "F10.7", display_width = 10L)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
As you can see the pattern is placa_gen_time. I want something like this:
gen_cal cal01 cal03
ppard 17.17521 NA
pparg 24.10186 NA
olr1 NA 23.93779
msr1 NA 20.58086
I guess it could be done in more than one step with sthg like this (genes2 my database). But I am pretty sure that pivot_longer function allows to transform the colnames into 2 columns for values and one column for the name (=genes)
genes2 <- genes2 %>% pivot_longer(cols = matches("placa_"), names_to = c("gen_cal", "time_cal"), names_sep = "_0",values_to = "Ct_cal")
> head
gen_cal time_cal Ct_cal
<chr> <chr> <dbl>
1 ppara 1 19.7
2 ppard 1 17.2
3 pparg 1 24.1
4 nr1h3 1 22.1
5 nr1h2 1 18.4
6 rxra 1 14.3
# And then pivot_wider the column time and Ct_cal into cal01 and cal03
CodePudding user response:
We need to reshape twice, wide-to-long, split string, then reshape it again long-to-wide, using data.table:
library(data.table)
setDT(df)
dcast(
melt(df, id.vars = "id")[, c("gene", "time") := tstrsplit(variable, split = "_", fixed = TRUE, keep = 2:3)],
id gene ~ time)
# id gene 01 03
# 1: 50109027 abca1 18.03740 NA
# 2: 50109027 abcg1 NA 18.73768
# 3: 50109027 abcg4 NA NA
# 4: 50109027 cav1 23.95752 NA
# 5: 50109027 cdkn2a NA 21.15846
# ---
# 176: 140102092 nfe2l2 NA 16.93311
# 177: 140102092 nfkb1 16.60837 NA
# 178: 140102092 rxrb 17.33369 NA
# 179: 140102092 scarb1 20.47183 NA
# 180: 140102092 tgfb2 NA 24.02575
CodePudding user response:
Seems like you are looking for .value
in the names_to
argument.
library(tidyverse)
df %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)") %>%
setNames(c("id", "gene", "cal01", "cal03"))
Update
If we do not use setNames
after the above pivot_longer
, the pivoted column names would only contain numbers, which is not allowed in base R data frame (no problem for tibble
).
# A tibble: 180 × 4
id gene `01` `03`
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
A workaround for the column names is to rename
them prior to pivot_longer
to attach the string "cal" before the "0".
df %>%
rename_with(.cols = contains("_0"), ~sub("_0", "_cal0", .x)) %>%
pivot_longer(starts_with("placa"), names_to = c("gene", ".value"),
names_pattern = "placa_(.*)_(.*)")
Output
# A tibble: 180 × 4
id gene cal01 cal03
<dbl> <chr> <dbl> <dbl>
1 130102009 rxrb 17.4 NA
2 130102009 cyp27a1 18.1 NA
3 130102009 abca1 18.0 NA
4 130102009 scarb1 NA NA
5 130102009 cav1 24.0 NA
6 130102009 nfkb1 16.9 NA
7 130102009 tgfb2 NA 24.1
8 130102009 ido NA 19.5
9 130102009 cdkn2a NA 21.2
10 130102009 abcg1 NA 18.7
# … with 170 more rows
# ℹ Use `print(n = ...)` to see more rows