How to use case_when to apply different functions in dplyr-CodePudding

What I am thinking might be naive. But I want to split the rows [1:3] of df based on the second "_", using tidyr::extract()

library(tidyr)
library(dplyr)

# Rows 1-3: split at the second "_" -- the lazy first group spans the first "_",
# the second group captures everything after the second one.
# (Fragment: shown without the data frame argument.)
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")

and the rows of df [4:6] based on the first "_"

# Rows 4-6: split at the first "_" -- the lazy first group stops at the first
# "_", the second group captures the rest of the string.
# (Fragment: shown without the data frame argument.)
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")

I am thinking of something like

# Sketch of the intent only -- not runnable as written: there is no pipe between
# the two mutate() calls, and extract() takes and returns a data frame, so it
# cannot supply a vector on the right-hand side of a case_when() branch.
df %>% 
  mutate(n=row_number())
mutate(col2=case_when
  (n<=3 ~ extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$"), 
  n>3 ~ extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
)

Of course, this is screamingly wrong but is it possible in some way?

Example data:

# Example data: a one-column tibble of six identifiers. The first three carry a
# "run" segment; the last three are shorter.
df <- tibble(
  col1 = c(
    "2397_A_run379_CTTGTACT_S119_L004_R1_001",
    "3779_A_run535_TTATAGCC_S91_L003_R1_001",
    "4958_BV_run685_GCGTACGT_S89_L005_R1_001",
    "5126AA_S27_L004_R1_001",
    "5126AF_S32_L004_R1_001",
    "5126AL_S38_L004_R1_001"
  )
)

df
#> # A tibble: 6 × 1
#>   col1                                   
#>   <chr>                                  
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001 
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
#> 4 5126AA_S27_L004_R1_001                 
#> 5 5126AF_S32_L004_R1_001                 
#> 6 5126AL_S38_L004_R1_001

Created on 2022-11-17 with reprex v2.0.2

CodePudding user response：

If the pattern is to extract the substring by matching the _ that precedes one or more letters followed by digits,

library(dplyr)
library(stringr)
# Keep everything from the first token that directly follows a "_" and consists
# of one or more letters followed by one or more digits (e.g. "run379", "S27")
# through to the end of the string. The "+" quantifiers are required: with
# literal spaces in their place the pattern matches nothing in this data.
df %>%
  mutate(col2 = str_extract(col1, "(?<=_)[A-Za-z]+\\d+.*"))

-output

# A tibble: 6 × 2
  col1                                    col2                            
  <chr>                                   <chr>                           
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001  run535_TTATAGCC_S91_L003_R1_001 
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 run685_GCGTACGT_S89_L005_R1_001 
4 5126AA_S27_L004_R1_001                  S27_L004_R1_001                 
5 5126AF_S32_L004_R1_001                  S32_L004_R1_001                 
6 5126AL_S38_L004_R1_001                  S38_L004_R1_001

Or use separate

library(tidyr)
# Split col1 at the "_" that follows an uppercase letter and is followed by a
# letters-then-digits token; extra = "merge" stops after that one split so the
# remainder (including its own underscores) stays together in col2.
separate(
  df, col1,
  into = c("col1", "col2"),
  sep = "(?<=[A-Z])_(?=[A-Za-z]+\\d+)",
  extra = "merge"
)

-output

# A tibble: 6 × 2
  col1    col2                            
  <chr>   <chr>                           
1 2397_A  run379_CTTGTACT_S119_L004_R1_001
2 3779_A  run535_TTATAGCC_S91_L003_R1_001 
3 4958_BV run685_GCGTACGT_S89_L005_R1_001 
4 5126AA  S27_L004_R1_001                 
5 5126AF  S32_L004_R1_001                 
6 5126AL  S38_L004_R1_001

CodePudding user response：

tidyr::extract() takes and returns a dataframe, and will be tricky to use inside mutate(). I would instead use something like stringr::str_match():

library(dplyr)
library(stringr)

df %>%
  mutate(
    row = row_number(),
    # str_match() returns a character matrix: column 1 is the full match,
    # column 2 the first capture group, which is the part we keep.
    col2 = case_when(
      # Rows 1-3: capture everything after the second "_".
      row < 4 ~ str_match(col1, ".+?_.+?_(.+)")[, 2],
      # Rows 4-6: capture everything after the first "_".
      row < 7 ~ str_match(col1, ".+?_(.+)")[, 2]
    )
  )

# A tibble: 6 × 3
  col1                                      row col2                            
  <chr>                                   <int> <chr>                           
1 2397_A_run379_CTTGTACT_S119_L004_R1_001     1 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001      2 run535_TTATAGCC_S91_L003_R1_001 
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001     3 run685_GCGTACGT_S89_L005_R1_001 
4 5126AA_S27_L004_R1_001                      4 S27_L004_R1_001                 
5 5126AF_S32_L004_R1_001                      5 S32_L004_R1_001                 
6 5126AL_S38_L004_R1_001                      6 S38_L004_R1_001