I have the following dataframe:
df <- structure(list(Page = c("/es/import-340600-to-mx-from-de/summary",
"/es/import-340600-to-de-from-mx/summary", "/es/import-071320-to-sv-from-cr/summary",
"/en/import-340111-to-ru-from-ir/summary", "/en/import-870423-to-hk-from-de/summary",
"/es/import-392049-to-mx-from-de/summary", "/es/import-080440-to-mx-from-es/summary",
"/es/import-340600-to-mx-from-jp/summary", "/en/import-852691-to-tr-from-ua/summary",
"/es/import-180620-to-mx-from-us/summary"), Count = c(153, 78,
72, 58, 57, 55, 48, 46, 42, 42)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L))
Page Count
<chr> <dbl>
1 /es/import-340600-to-mx-from-de/summary 153
2 /es/import-340600-to-de-from-mx/summary 78
3 /es/import-071320-to-sv-from-cr/summary 72
4 /en/import-340111-to-ru-from-ir/summary 58
5 /en/import-870423-to-hk-from-de/summary 57
6 /es/import-392049-to-mx-from-de/summary 55
7 /es/import-080440-to-mx-from-es/summary 48
8 /es/import-340600-to-mx-from-jp/summary 46
9 /en/import-852691-to-tr-from-ua/summary 42
10 /es/import-180620-to-mx-from-us/summary 42
For example, from this row below,how can I extract and put mx and de into 2 separate columns? And do the same for the rest of the table
1 /es/import-340600-to-mx-from-de/summary 153
Expected output:
......
CodePudding user response:
We can use strcapture
for this: it takes a regex with one or more (.)
groups, a vector, and returns a data.frame
based on the fields/classes present in proto=
. If any of the groups are not found, none of them are found.
base R
cbind(df, strcapture(".*-to-([^-/]*)-from-([^-/]*).*", df$Page, proto=list(to="", from="")))
# Page Count to from
# 1 /es/import-340600-to-mx-from-de/summary 153 mx de
# 2 /es/import-340600-to-de-from-mx/summary 78 de mx
# 3 /es/import-071320-to-sv-from-cr/summary 72 sv cr
# 4 /en/import-340111-to-ru-from-ir/summary 58 ru ir
# 5 /en/import-870423-to-hk-from-de/summary 57 hk de
# 6 /es/import-392049-to-mx-from-de/summary 55 mx de
# 7 /es/import-080440-to-mx-from-es/summary 48 mx es
# 8 /es/import-340600-to-mx-from-jp/summary 46 mx jp
# 9 /en/import-852691-to-tr-from-ua/summary 42 tr ua
# 10 /es/import-180620-to-mx-from-us/summary 42 mx us
dplyr
library(dplyr)
df %>%
mutate(strcapture(".*-to-([^-/]*)-from-([^-/]*).*", Page, list(to="", from="")))
# # A tibble: 10 x 4
# Page Count to from
# <chr> <dbl> <chr> <chr>
# 1 /es/import-340600-to-mx-from-de/summary 153 mx de
# 2 /es/import-340600-to-de-from-mx/summary 78 de mx
# 3 /es/import-071320-to-sv-from-cr/summary 72 sv cr
# 4 /en/import-340111-to-ru-from-ir/summary 58 ru ir
# 5 /en/import-870423-to-hk-from-de/summary 57 hk de
# 6 /es/import-392049-to-mx-from-de/summary 55 mx de
# 7 /es/import-080440-to-mx-from-es/summary 48 mx es
# 8 /es/import-340600-to-mx-from-jp/summary 46 mx jp
# 9 /en/import-852691-to-tr-from-ua/summary 42 tr ua
# 10 /es/import-180620-to-mx-from-us/summary 42 mx us
If you need to catch one if the other is not present, then perhaps:
df$Page[2] <- "/es/import-340600-TO-de-from-mx/summary" # "TO" not "to"
df %>%
mutate(to = stringr::str_match(Page, ".*-to-([^/-]*)")[,2], from = stringr::str_match(Page, ".*-from-([^/-]*)")[,2])
# # A tibble: 10 x 4
# Page Count to from
# <chr> <dbl> <chr> <chr>
# 1 /es/import-340600-to-mx-from-de/summary 153 mx de
# 2 /es/import-340600-TO-de-from-mx/summary 78 NA mx
# 3 /es/import-071320-to-sv-from-cr/summary 72 sv cr
# 4 /en/import-340111-to-ru-from-ir/summary 58 ru ir
# 5 /en/import-870423-to-hk-from-de/summary 57 hk de
# 6 /es/import-392049-to-mx-from-de/summary 55 mx de
# 7 /es/import-080440-to-mx-from-es/summary 48 mx es
# 8 /es/import-340600-to-mx-from-jp/summary 46 mx jp
# 9 /en/import-852691-to-tr-from-ua/summary 42 tr ua
# 10 /es/import-180620-to-mx-from-us/summary 42 mx us
or more generically (one or more such patterns):
df %>%
mutate(as.data.frame(lapply(
setNames(nm = c("to", "from")),
function(ptn) stringr::str_match(Page, sprintf(".*-%s-([^/-]*)", ptn))[,2])
))
# # A tibble: 10 x 4
# Page Count to from
# <chr> <dbl> <chr> <chr>
# 1 /es/import-340600-to-mx-from-de/summary 153 mx de
# 2 /es/import-340600-TO-de-from-mx/summary 78 NA mx
# 3 /es/import-071320-to-sv-from-cr/summary 72 sv cr
# 4 /en/import-340111-to-ru-from-ir/summary 58 ru ir
# 5 /en/import-870423-to-hk-from-de/summary 57 hk de
# 6 /es/import-392049-to-mx-from-de/summary 55 mx de
# 7 /es/import-080440-to-mx-from-es/summary 48 mx es
# 8 /es/import-340600-to-mx-from-jp/summary 46 mx jp
# 9 /en/import-852691-to-tr-from-ua/summary 42 tr ua
# 10 /es/import-180620-to-mx-from-us/summary 42 mx us
CodePudding user response:
Another dplyr option:
library(dplyr)
df %>%
mutate(from = sub("^.*from-(.*)/.*$", "\\1", Page),
to = sub("^.*to-(.*?)-.*$", "\\1", Page)) %>%
select(-Page)
#> # A tibble: 10 x 3
#> Count from to
#> <dbl> <chr> <chr>
#> 1 153 de mx
#> 2 78 mx de
#> 3 72 cr sv
#> 4 58 ir ru
#> 5 57 de hk
#> 6 55 de mx
#> 7 48 es mx
#> 8 46 jp mx
#> 9 42 ua tr
#> 10 42 us mx
Created on 2022-11-23 with reprex v2.0.2
CodePudding user response:
Another solution:
df %>%
mutate(To = str_extract(Page, "(?<=to-)\\w "),
From = str_extract(Page, "(?<=from-)\\w "))
# A tibble: 10 × 4
Page Count To From
<chr> <dbl> <chr> <chr>
1 /es/import-340600-to-mx-from-de/summary 153 mx de
2 /es/import-340600-to-de-from-mx/summary 78 de mx
3 /es/import-071320-to-sv-from-cr/summary 72 sv cr
4 /en/import-340111-to-ru-from-ir/summary 58 ru ir
5 /en/import-870423-to-hk-from-de/summary 57 hk de
6 /es/import-392049-to-mx-from-de/summary 55 mx de
7 /es/import-080440-to-mx-from-es/summary 48 mx es
8 /es/import-340600-to-mx-from-jp/summary 46 mx jp
9 /en/import-852691-to-tr-from-ua/summary 42 tr ua
10 /es/import-180620-to-mx-from-us/summary 42 mx us
or even:
df %>%
extract(Page, c("to","from"), "to-(\\w )-from-(\\w )", remove = FALSE)
# A tibble: 10 × 4
Page to from Count
<chr> <chr> <chr> <dbl>
1 /es/import-340600-to-mx-from-de/summary mx de 153
2 /es/import-340600-to-de-from-mx/summary de mx 78
3 /es/import-071320-to-sv-from-cr/summary sv cr 72
4 /en/import-340111-to-ru-from-ir/summary ru ir 58
5 /en/import-870423-to-hk-from-de/summary hk de 57
6 /es/import-392049-to-mx-from-de/summary mx de 55
7 /es/import-080440-to-mx-from-es/summary mx es 48
8 /es/import-340600-to-mx-from-jp/summary mx jp 46
9 /en/import-852691-to-tr-from-ua/summary tr ua 42
10 /es/import-180620-to-mx-from-us/summary mx us 42