I am a bit puzzled by separating columns using the separate function in tidyr:
My data frame looks like this
library(tidyverse)
df=tibble(col1=c("2397_A_run379_CTTGTACT_S119_L004_R1_001",
"3779_A_run535_TTATAGCC_S91_L003_R1_001",
"4958_BV_run685_GCGTACGT_S89_L005_R1_001"))
df
#> # A tibble: 3 × 1
#> col1
#> <chr>
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
Created on 2022-11-11 with reprex v2.0.2
I want to create three dataframes where I separate everything based on the first "_", then based on the second "_" and then by the third "_".
I want something like this:
> df1
col1 col2
2397 A_run379_CTTGTACT_S119_L004_R1_001
3779 A_run535_TTATAGCC_S91_L003_R1_001
4958 BV_run685_GCGTACGT_S89_L005_R1_001
> df2
col1 col2
2397_A run379_CTTGTACT_S119_L004_R1_001
3779_A run535_TTATAGCC_S91_L003_R1_001
4958_BV run685_GCGTACGT_S89_L005_R1_001
> df3
col1 col2
2397_A_run379 CTTGTACT_S119_L004_R1_001
3779_A_run535 TTATAGCC_S91_L003_R1_001
4958_BV_run685 GCGTACGT_S89_L005_R1_001
Any help or guidance are appreciated.
CodePudding user response:
Split all by "_"
, and paste it back as desired:
s <- strsplit(df$col1, "_")
lapply(1:3, function(i)
do.call(rbind,
lapply(s, function(j){
data.frame(
col1 = paste(j[ 1:i ], collapse = "_"),
col2 = paste(j[ (i 1):length(j) ], collapse = "_")
)
})))
# [[1]]
# col1 col2
# 1 2397 A_run379_CTTGTACT_S119_L004_R1_001
# 2 3779 A_run535_TTATAGCC_S91_L003_R1_001
# 3 4958 BV_run685_GCGTACGT_S89_L005_R1_001
#
# [[2]]
# col1 col2
# 1 2397_A run379_CTTGTACT_S119_L004_R1_001
# 2 3779_A run535_TTATAGCC_S91_L003_R1_001
# 3 4958_BV run685_GCGTACGT_S89_L005_R1_001
#
# [[3]]
# col1 col2
# 1 2397_A_run379 CTTGTACT_S119_L004_R1_001
# 2 3779_A_run535 TTATAGCC_S91_L003_R1_001
# 3 4958_BV_run685 GCGTACGT_S89_L005_R1_001
CodePudding user response:
Perhaps there is some regex magic but a "brute-force" approach using tidyr::extract
may look like so:
library(tidyverse)
df |>
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
#> # A tibble: 3 × 2
#> col1 col2
#> <chr> <chr>
#> 1 2397 A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779 A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958 BV_run685_GCGTACGT_S89_L005_R1_001
df |>
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")
#> # A tibble: 3 × 2
#> col1 col2
#> <chr> <chr>
#> 1 2397_A run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV run685_GCGTACGT_S89_L005_R1_001
df |>
extract(col1, into = c("col1", "col2"), "^(.*?_.*?_.*?)_(.*)$")
#> # A tibble: 3 × 2
#> col1 col2
#> <chr> <chr>
#> 1 2397_A_run379 CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535 TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685 GCGTACGT_S89_L005_R1_001