What I am thinking might be naive, but I want to split rows 1:3 of df at the second "_", using tidyr::extract():
library(tidyr)
library(dplyr)
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")
and rows 4:6 of df at the first "_":
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
I am thinking of something like
# Asker's sketch of the intent; not runnable as written (acknowledged below).
df %>%
# n holds each row's position so the two row ranges can be told apart
mutate(n=row_number())
# intent: rows 1-3 split at the second "_", the remaining rows at the first "_"
mutate(col2=case_when
(n<=3 ~ extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$"),
n>3 ~ extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
)
Of course, this is screamingly wrong but is it possible in some way?
Example data:
# Example data: six sample identifiers in a single character column. The
# first three carry an extra "_<letters>" field before the run information.
df <- tibble(
  col1 = c(
    "2397_A_run379_CTTGTACT_S119_L004_R1_001",
    "3779_A_run535_TTATAGCC_S91_L003_R1_001",
    "4958_BV_run685_GCGTACGT_S89_L005_R1_001",
    "5126AA_S27_L004_R1_001",
    "5126AF_S32_L004_R1_001",
    "5126AL_S38_L004_R1_001"
  )
)
df
#> # A tibble: 6 × 1
#> col1
#> <chr>
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
#> 4 5126AA_S27_L004_R1_001
#> 5 5126AF_S32_L004_R1_001
#> 6 5126AL_S38_L004_R1_001
Created on 2022-11-17 with reprex v2.0.2
CodePudding user response:
If the pattern is to extract the substring by matching the _
that precedes one or more letters followed by digits,
library(dplyr)
library(stringr)
# Add col2 holding the part of col1 that starts right after an "_" and begins
# with one or more letters followed by one or more digits (e.g. "run379...",
# "S27..."), through to the end of the string.
# The lookbehind (?<=_) anchors the match after the underscore without
# including the underscore itself in the result.
df %>%
  mutate(col2 = str_extract(col1, "(?<=_)[A-Za-z]+\\d+.*"))
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 S38_L004_R1_001
Or use separate
library(tidyr)
# Split col1 into two columns at an "_" that follows an uppercase letter and
# is followed by letters-then-digits. Both conditions are lookarounds, so only
# the underscore itself is consumed as the separator.
# extra = "merge" keeps everything after the first such split in col2 instead
# of dropping the additional pieces.
separate(
  df, col1,
  into = c("col1", "col2"),
  sep = "(?<=[A-Z])_(?=[A-Za-z]+\\d+)",
  extra = "merge"
)
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A run379_CTTGTACT_S119_L004_R1_001
2 3779_A run535_TTATAGCC_S91_L003_R1_001
3 4958_BV run685_GCGTACGT_S89_L005_R1_001
4 5126AA S27_L004_R1_001
5 5126AF S32_L004_R1_001
6 5126AL S38_L004_R1_001
CodePudding user response:
tidyr::extract() takes and returns a data frame, and will be tricky to use inside mutate(). I would instead use something like stringr::str_match():
library(dplyr)
library(stringr)
# Choose the extraction pattern by row position. str_match() returns a
# character matrix whose column 2 is the first capture group, so `[, 2]`
# yields a plain vector that can be used inside mutate()/case_when().
df %>%
  mutate(
    row = row_number(),
    col2 = case_when(
      # rows 1-3: capture everything after the second "_"
      row < 4 ~ str_match(col1, ".+?_.+?_(.+)")[, 2],
      # rows 4-6: capture everything after the first "_"
      row < 7 ~ str_match(col1, ".+?_(.+)")[, 2]
    )
  )
# A tibble: 6 × 3
col1 row col2
<chr> <int> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 1 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 2 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 3 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 4 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 5 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 6 S38_L004_R1_001