I'm trying to get common characters from two separate vectors.
Example:
# Two rows, so the example input matches the desired output shown below.
x <- c("abcde", "lmnop")
y <- c("efghi", "uvmxw")
df <- data.frame(x, y)
Desired output
x y z
abcde efghi e
lmnop uvmxw m
I've tried something like this, but it gives the wrong result:
# unique()'s second argument is `incomparables`, not a second vector to
# compare against, so this only de-duplicates `x`.
df |> mutate(m = unique(x, y))
If there are multiple matches, returning a list would work great.
CodePudding user response:
# Characters shared by two strings, collapsed into a single string.
#
# s1, s2: character vectors, compared element by element (lengths are
#         recycled by Map()).
# Returns a character vector with one element per pair: the distinct
# characters of `s1` that also occur in `s2`, in order of first appearance
# in `s1`; "" when a pair has nothing in common.
str_intersect <- function(s1, s2) {
  chars1 <- strsplit(as.character(s1), "")
  chars2 <- strsplit(as.character(s2), "")
  # Intersect pair by pair so that every element is processed, not just the
  # first one.
  shared <- Map(intersect, chars1, chars2)
  vapply(shared, paste0, character(1), collapse = "", USE.NAMES = FALSE)
}
x <- c("abcde", "abc")
y <- c("efghi", "b")
df <- data.frame(x, y)

library(dplyr)

df %>%
  rowwise() %>%                        # evaluate str_intersect() once per row
  mutate(m = str_intersect(x, y)) %>%
  ungroup()                            # drop the row-wise grouping afterwards
CodePudding user response:
Using a base R approach:
# Intersect row by row. Pooling the characters of all rows with unlist()
# would mix rows together and fail whenever the number of shared characters
# differs from the number of rows.
df$z <- mapply(
  function(a, b) paste(intersect(a, b), collapse = ""),
  strsplit(df$x, ""),
  strsplit(df$y, "")
)
df
x y z
1 abcde efghi e
2 lmnop uvmxw m
Data
# Example data frame: the inputs `x` and `y` together with the column `z`.
structure(list(x = c("abcde", "lmnop"), y = c("efghi", "uvmxw"
), z = c("e", "m")), row.names = c(NA, -2L), class = "data.frame")
CodePudding user response:
Here's a tidyverse solution with functions from `stringr`, which can also handle multiple common characters:
library(dplyr)    # provides mutate(); stringr alone does not
library(stringr)

df %>%
  mutate(
    # turn `x` into an alternation pattern, e.g. "abcde" -> "a|b|c|d|e"
    # (the characters are used as a regex, so metacharacters in `x` are not
    # treated literally):
    y1 = str_replace_all(x, "(?<=.)(?=.)", "|"),
    # extract every character of `y` that matches that pattern:
    match = str_extract_all(y, y1)
  )
x y y1 match
1 abcde efghi a|b|c|d|e e
2 lmnop ovmxw l|m|n|o|p o, m
You can remove `y1` by adding `%>% select(-y1)`.
Data:
# Example input; the second pair of strings has two characters in common.
x <- c("abcde", "lmnop")
y <- c("efghi", "ovmxw")
df <- data.frame(x = x, y = y)
CodePudding user response:
Here is one method: wrap the 'y' column in a negated character class, `[^...]`, so that every character of 'x' that does not occur in 'y' is matched by the pattern and removed with `str_remove_all`.
library(stringr)
library(dplyr)

# Keep only the characters of `x` that also occur in `y`: whatever matches the
# negated character class built from `y` is stripped from `x`.
mutate(df, z = str_remove_all(x, sprintf("[^%s]", y)))
-output
x y z
1 abcde efghi e
2 lmnop uvmxw m
It also handles multiple characters,
# Same call on `df1`, whose second row has two characters in common.
mutate(df1, z = str_remove_all(x, sprintf("[^%s]", y)))
x y z
1 abcde efghi e
2 lmnop ovmxw mo
data
# `df`: each row has exactly one character in common.
df <- structure(list(x = c("abcde", "lmnop"), y = c("efghi", "uvmxw"
)), row.names = c(NA, -2L), class = "data.frame")
# `df1`: the second row has two characters in common ("m" and "o").
df1 <- structure(list(x = c("abcde", "lmnop"), y = c("efghi", "ovmxw"
)), class = "data.frame", row.names = c(NA, -2L))
CodePudding user response:
Using `strsplit` and `intersect` in a function with some case handling.
# Row-wise common characters of the string columns of `x`.
#
# x: a data frame (or matrix) of strings, one row per comparison.
# Returns one entry per row:
#   * a character vector when every row has at most one common character,
#     with NA for rows that have none;
#   * otherwise a list of lists (one inner list per row, holding the common
#     characters, or NA), which can be stored as a list column.
strintr <- \(x) {
  x <- as.data.frame(x)
  # One list of single-character vectors per column. The list is unnamed so
  # that column names are never matched against intersect()'s argument names.
  chars <- unname(lapply(x, \(col) strsplit(as.character(col), '')))
  # Walk the rows in parallel; Map() always returns a list with one element
  # per row, so results of equal length are never simplified into a matrix.
  o <- do.call(Map, c(list(f = \(...) Reduce(intersect, list(...))), chars))
  n <- lengths(o)
  if (any(n > 1L)) {
    o[n == 0L] <- list(NA_character_)
    lapply(o, as.list)
  } else {
    vapply(o, \(ch) if (length(ch) > 0L) ch else NA_character_, character(1))
  }
}
Usage
# Names of the two columns to compare.
cols <- c("x", "y")
Using `within`.
# Single row with one common character.
within(df1, foo <- strintr(df1[cols]))
# x y foo
# 1 abcde efghi e
# Two rows, one common character each.
within(df2, foo <- strintr(df2[cols]))
# x y foo
# 1 abcde efghi e
# 2 lmnop uvmxw m
# First row has two common characters, so `foo` becomes a list column.
within(df3, foo <- strintr(df3[cols]))
# x y foo
# 1 abcde defghi d, e
# 2 lmnop uvmxw m
# No common characters in any row: NA everywhere.
within(df4, foo <- strintr(df4[cols]))
# x y foo
# 1 abcde xyz <NA>
# 2 lmnop xyz <NA>
# Mixed case: several matches in one row, none in the other.
within(df5, foo <- strintr(df5[cols]))
# x y foo
# 1 abcde defghi d, e
# 2 lmnop xyz NA
Using `$`.
# Store the result as a new column `foo`, then print the data frame.
df3$foo <- strintr(df3[cols])
df3
# x y foo
# 1 abcde defghi d, e
# 2 lmnop uvmxw m
Using `dplyr::mutate`.
# Add the result as column `fo` without attaching dplyr.
df3 |> dplyr::mutate(fo = strintr(df3[cols]))
# x y fo
# 1 abcde defghi d, e
# 2 lmnop uvmxw m
Note: This won't work with `transform`, apparently because of a bug.
Data:
# Example inputs, one data frame per case.
# df1: single row, one common character.
df1 <- data.frame(x = "abcde", y = "efghi")
# df2: one common character per row.
df2 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("efghi", "uvmxw")
)
# df3: two common characters in the first row.
df3 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("defghi", "uvmxw")
)
# df4: no common characters at all.
df4 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("xyz", "xyz")
)
# df5: two common characters in the first row, none in the second.
df5 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("defghi", "xyz")
)