I have a data frame with two variables, `var` and `text`. I need to remove from `text` the part that partially matches `var` and save the remainder. A sample of the input and of the expected result is shown below.
# Sample input: `var` holds a short name, `text` holds a longer label that
# starts with a differently punctuated / differently cased spelling of it.
dt <- data.frame(
  var = c("OCILY10", "SUDHL16", "u2932"),
  text = c(
    "OCILY-10-Cas9_T12",
    "SU-DHL-16_T12_vs_T0",
    "U2932_T10-122-SEMI-A"
  )
)
> dt
var text
1 OCILY10 OCILY-10-Cas9_T12
2 SUDHL16 SU-DHL-16_T12_vs_T0
3 u2932 U2932_T10-122-SEMI-A
#======================
# Result
# Same sample data plus the wanted `result` column: `text` with the leading
# part that corresponds to `var` (and the separator after it) removed.
dt <- data.frame(
  var = c("OCILY10", "SUDHL16", "u2932"),
  text = c(
    "OCILY-10-Cas9_T12",
    "SU-DHL-16_T12_vs_T0",
    "U2932_T10-122-SEMI-A"
  ),
  result = c(
    "Cas9_T12",
    "T12_vs_T0",
    "T10-122-SEMI-A"
  )
)
> dt
var text result
1 OCILY10 OCILY-10-Cas9_T12 Cas9_T12
2 SUDHL16 SU-DHL-16_T12_vs_T0 T12_vs_T0
3 u2932 U2932_T10-122-SEMI-A T10-122-SEMI-A
CodePudding user response:
Borrowing from @Akrun's comment, here's a `dplyr` solution in steps:
library(dplyr)
library(stringr)
# Remove from `text` the part that corresponds to `var`, one step per
# `mutate()` call.
dt %>%
  # Turn `var` into a per-row regex: keep its first two and last two
  # characters and put `.*` between them (e.g. "OCILY10" -> "OC.*10"), so
  # that extra separators inside `text` are bridged (@Akrun's idea).
  mutate(name_pattern = str_replace(var, "^(..).*(..)$", "\\1.*\\2")) %>%
  # Prefix `(?i)` to match case-insensitively and delete the first match
  # from `text`.
  mutate(result = str_remove(text, paste0("(?i)", name_pattern))) %>%
  # Drop one leading `-` or `_` left over after the removal.
  mutate(result = str_remove(result, "^(-|_)")) %>%
  # The helper column is no longer needed.
  select(-name_pattern)
var text result
1 OCILY10 OCILY-10-Cas9_T12 Cas9_T12
2 SUDHL16 SU-DHL-16_T12_vs_T0 T12_vs_T0
3 u2932 U2932_T10-122-SEMI-A T10-122-SEMI-A
CodePudding user response:
Another tidyverse solution.
library(tidyverse)
# Split each `text` into pieces that end right after a `_` or `-`, drop the
# pieces that are found (case-insensitively) inside `var`, and glue the
# remaining pieces back together as `result`.
dt %>%
  # The lookbehind splits after each separator, keeping it attached to the
  # piece in front of it.
  mutate(split = str_split(text, "(?<=[_-])")) %>%
  # One row per piece.
  unnest(split) %>%
  # The piece without its separator, used for the comparison with `var`.
  mutate(sieve = str_remove(split, "_|-")) %>%
  # Keep only the pieces that do not occur in `var`; `sieve` is used as a
  # regex pattern here.
  filter(!str_detect(var, str_c("(?i)", sieve))) %>%
  # Group on `text` as well as `var`, so rows that share a `var` but have
  # different `text` values are not concatenated into one `result`.
  group_by(var, text) %>%
  mutate(result = str_c(split, collapse = "")) %>%
  # Return an ungrouped tibble so later verbs do not silently run per group.
  ungroup() %>%
  distinct(var, text, result)
# # A tibble: 3 x 3
# var text result
# <chr> <chr> <chr>
# 1 OCILY10 OCILY-10-Cas9_T12 Cas9_T12
# 2 SUDHL16 SU-DHL-16_T12_vs_T0 T12_vs_T0
# 3 u2932 U2932_T10-122-SEMI-A T10-122-SEMI-A