Home > Enterprise >  How to extract information from string column in R?
How to extract information from string column in R?

Time:11-24

I have the following dataframe:

df <- structure(list(Page = c("/es/import-340600-to-mx-from-de/summary", 
"/es/import-340600-to-de-from-mx/summary", "/es/import-071320-to-sv-from-cr/summary", 
"/en/import-340111-to-ru-from-ir/summary", "/en/import-870423-to-hk-from-de/summary", 
"/es/import-392049-to-mx-from-de/summary", "/es/import-080440-to-mx-from-es/summary", 
"/es/import-340600-to-mx-from-jp/summary", "/en/import-852691-to-tr-from-ua/summary", 
"/es/import-180620-to-mx-from-us/summary"), Count = c(153, 78, 
72, 58, 57, 55, 48, 46, 42, 42)), class = c("tbl_df", "tbl", 
"data.frame"), row.names = c(NA, -10L))

   Page                                    Count
   <chr>                                   <dbl>
 1 /es/import-340600-to-mx-from-de/summary   153
 2 /es/import-340600-to-de-from-mx/summary    78
 3 /es/import-071320-to-sv-from-cr/summary    72
 4 /en/import-340111-to-ru-from-ir/summary    58
 5 /en/import-870423-to-hk-from-de/summary    57
 6 /es/import-392049-to-mx-from-de/summary    55
 7 /es/import-080440-to-mx-from-es/summary    48
 8 /es/import-340600-to-mx-from-jp/summary    46
 9 /en/import-852691-to-tr-from-ua/summary    42
10 /es/import-180620-to-mx-from-us/summary    42

For example, from this row below,how can I extract and put mx and de into 2 separate columns? And do the same for the rest of the table

1 /es/import-340600-to-mx-from-de/summary   153

Expected output:

enter image description here

......

CodePudding user response:

We can use strcapture for this: it takes a regex with one or more (.) groups, a vector, and returns a data.frame based on the fields/classes present in proto=. If any of the groups are not found, none of them are found.

base R

cbind(df, strcapture(".*-to-([^-/]*)-from-([^-/]*).*", df$Page, proto=list(to="", from="")))
#                                       Page Count to from
# 1  /es/import-340600-to-mx-from-de/summary   153 mx   de
# 2  /es/import-340600-to-de-from-mx/summary    78 de   mx
# 3  /es/import-071320-to-sv-from-cr/summary    72 sv   cr
# 4  /en/import-340111-to-ru-from-ir/summary    58 ru   ir
# 5  /en/import-870423-to-hk-from-de/summary    57 hk   de
# 6  /es/import-392049-to-mx-from-de/summary    55 mx   de
# 7  /es/import-080440-to-mx-from-es/summary    48 mx   es
# 8  /es/import-340600-to-mx-from-jp/summary    46 mx   jp
# 9  /en/import-852691-to-tr-from-ua/summary    42 tr   ua
# 10 /es/import-180620-to-mx-from-us/summary    42 mx   us

dplyr

library(dplyr)
df %>%
  mutate(strcapture(".*-to-([^-/]*)-from-([^-/]*).*", Page, list(to="", from="")))
# # A tibble: 10 x 4
#    Page                                    Count to    from 
#    <chr>                                   <dbl> <chr> <chr>
#  1 /es/import-340600-to-mx-from-de/summary   153 mx    de   
#  2 /es/import-340600-to-de-from-mx/summary    78 de    mx   
#  3 /es/import-071320-to-sv-from-cr/summary    72 sv    cr   
#  4 /en/import-340111-to-ru-from-ir/summary    58 ru    ir   
#  5 /en/import-870423-to-hk-from-de/summary    57 hk    de   
#  6 /es/import-392049-to-mx-from-de/summary    55 mx    de   
#  7 /es/import-080440-to-mx-from-es/summary    48 mx    es   
#  8 /es/import-340600-to-mx-from-jp/summary    46 mx    jp   
#  9 /en/import-852691-to-tr-from-ua/summary    42 tr    ua   
# 10 /es/import-180620-to-mx-from-us/summary    42 mx    us   

If you need to catch one if the other is not present, then perhaps:

df$Page[2] <- "/es/import-340600-TO-de-from-mx/summary" # "TO" not "to"
df %>%
  mutate(to = stringr::str_match(Page, ".*-to-([^/-]*)")[,2], from = stringr::str_match(Page, ".*-from-([^/-]*)")[,2])
# # A tibble: 10 x 4
#    Page                                    Count to    from 
#    <chr>                                   <dbl> <chr> <chr>
#  1 /es/import-340600-to-mx-from-de/summary   153 mx    de   
#  2 /es/import-340600-TO-de-from-mx/summary    78 NA    mx   
#  3 /es/import-071320-to-sv-from-cr/summary    72 sv    cr   
#  4 /en/import-340111-to-ru-from-ir/summary    58 ru    ir   
#  5 /en/import-870423-to-hk-from-de/summary    57 hk    de   
#  6 /es/import-392049-to-mx-from-de/summary    55 mx    de   
#  7 /es/import-080440-to-mx-from-es/summary    48 mx    es   
#  8 /es/import-340600-to-mx-from-jp/summary    46 mx    jp   
#  9 /en/import-852691-to-tr-from-ua/summary    42 tr    ua   
# 10 /es/import-180620-to-mx-from-us/summary    42 mx    us   

or more generically (one or more such patterns):

df %>%
  mutate(as.data.frame(lapply(
    setNames(nm = c("to", "from")),
    function(ptn) stringr::str_match(Page, sprintf(".*-%s-([^/-]*)", ptn))[,2])
  ))
# # A tibble: 10 x 4
#    Page                                    Count to    from 
#    <chr>                                   <dbl> <chr> <chr>
#  1 /es/import-340600-to-mx-from-de/summary   153 mx    de   
#  2 /es/import-340600-TO-de-from-mx/summary    78 NA    mx   
#  3 /es/import-071320-to-sv-from-cr/summary    72 sv    cr   
#  4 /en/import-340111-to-ru-from-ir/summary    58 ru    ir   
#  5 /en/import-870423-to-hk-from-de/summary    57 hk    de   
#  6 /es/import-392049-to-mx-from-de/summary    55 mx    de   
#  7 /es/import-080440-to-mx-from-es/summary    48 mx    es   
#  8 /es/import-340600-to-mx-from-jp/summary    46 mx    jp   
#  9 /en/import-852691-to-tr-from-ua/summary    42 tr    ua   
# 10 /es/import-180620-to-mx-from-us/summary    42 mx    us   

CodePudding user response:

Another dplyr option:

library(dplyr)

df %>%
  mutate(from = sub("^.*from-(.*)/.*$", "\\1", Page),
         to   = sub("^.*to-(.*?)-.*$", "\\1", Page)) %>%
  select(-Page)
#> # A tibble: 10 x 3
#>    Count from  to   
#>    <dbl> <chr> <chr>
#>  1   153 de    mx   
#>  2    78 mx    de   
#>  3    72 cr    sv   
#>  4    58 ir    ru   
#>  5    57 de    hk   
#>  6    55 de    mx   
#>  7    48 es    mx   
#>  8    46 jp    mx   
#>  9    42 ua    tr   
#> 10    42 us    mx

Created on 2022-11-23 with reprex v2.0.2

CodePudding user response:

Another solution:

df %>%
  mutate(To = str_extract(Page, "(?<=to-)\\w "),
         From = str_extract(Page, "(?<=from-)\\w "))

# A tibble: 10 × 4
   Page                                    Count To    From 
   <chr>                                   <dbl> <chr> <chr>
 1 /es/import-340600-to-mx-from-de/summary   153 mx    de   
 2 /es/import-340600-to-de-from-mx/summary    78 de    mx   
 3 /es/import-071320-to-sv-from-cr/summary    72 sv    cr   
 4 /en/import-340111-to-ru-from-ir/summary    58 ru    ir   
 5 /en/import-870423-to-hk-from-de/summary    57 hk    de   
 6 /es/import-392049-to-mx-from-de/summary    55 mx    de   
 7 /es/import-080440-to-mx-from-es/summary    48 mx    es   
 8 /es/import-340600-to-mx-from-jp/summary    46 mx    jp   
 9 /en/import-852691-to-tr-from-ua/summary    42 tr    ua   
10 /es/import-180620-to-mx-from-us/summary    42 mx    us 

or even:

df %>%
  extract(Page, c("to","from"), "to-(\\w )-from-(\\w )", remove = FALSE)

# A tibble: 10 × 4
   Page                                    to    from  Count
   <chr>                                   <chr> <chr> <dbl>
 1 /es/import-340600-to-mx-from-de/summary mx    de      153
 2 /es/import-340600-to-de-from-mx/summary de    mx       78
 3 /es/import-071320-to-sv-from-cr/summary sv    cr       72
 4 /en/import-340111-to-ru-from-ir/summary ru    ir       58
 5 /en/import-870423-to-hk-from-de/summary hk    de       57
 6 /es/import-392049-to-mx-from-de/summary mx    de       55
 7 /es/import-080440-to-mx-from-es/summary mx    es       48
 8 /es/import-340600-to-mx-from-jp/summary mx    jp       46
 9 /en/import-852691-to-tr-from-ua/summary tr    ua       42
10 /es/import-180620-to-mx-from-us/summary mx    us       42
  • Related