How to get matching characters?-CodePudding

I'm trying to get common characters from two separate vectors.

Example:

# Two example rows, matching the desired output shown below.
x <- c("abcde", "lmnop")
y <- c("efghi", "uvmxw")
df <- data.frame(x, y)

Desired output

    x       y     z 
abcde   efghi     e     
lmnop   uvmxw     m

I've tried something like this, but it gives the wrong result:

# NOTE: the second positional argument of unique() is `incomparables`, not a
# second vector to compare against, so this call does not intersect x and y.
df |> mutate(m = unique(x, y))

If there are multiple matches, returning a list would work great.

CodePudding user response：

# Common characters of two strings, computed element-wise.
#
# s1, s2: character vectors; they are paired up element by element.
# Returns a character vector with one string per pair containing the
# characters present in both inputs (each shared character once), or ""
# when a pair has nothing in common.
str_intersect <- function(s1, s2) {
  # Split every string into single characters and intersect pair by pair,
  # instead of looking only at the first element of each input.
  shared <- Map(intersect, strsplit(s1, ""), strsplit(s2, ""))
  vapply(shared, paste0, character(1), collapse = "", USE.NAMES = FALSE)
}

# Example data: row 1 shares "e", row 2 shares "b".
x <- c("abcde", "abc")
y <- c("efghi", "b")
df <- data.frame(x, y)

library(dplyr)
df %>%
  rowwise() %>%
  mutate(m = str_intersect(x, y)) %>%
  ungroup()  # drop the row-wise grouping so later verbs act on whole columns

CodePudding user response：

Using a base R approach:

> # Intersect row by row; pooling all rows with unlist() only lines up by coincidence.
> df$z <- mapply(function(a, b) paste(intersect(a, b), collapse = ""),
+                strsplit(df$x, ""), strsplit(df$y, ""), USE.NAMES = FALSE)
> df
      x     y z
1 abcde efghi e
2 lmnop uvmxw m

Data

# Input data only; the z column is added by the code above.
df <- structure(list(x = c("abcde", "lmnop"), y = c("efghi", "uvmxw"
)), row.names = c(NA, -2L), class = "data.frame")

CodePudding user response：

Here's a tidyverse solution with functions from stringr, which can also handle multiple common characters:

library(dplyr)    # provides mutate()
library(stringr)
df %>%
  mutate(
    # convert `x` to an alternation pattern by inserting "|" between its
    # characters (note: the characters are used as a regex, unescaped):
    y1 = str_replace_all(x, "(?<=.)(?=.)", "|"),
    # which characters of `y` match the pattern `y1`, i.e. also occur in `x`?
    match = str_extract_all(y, y1)
  )
      x     y        y1 match
1 abcde efghi a|b|c|d|e     e
2 lmnop ovmxw l|m|n|o|p  o, m

You can remove y1 by adding %>% select(-y1).

Data:

# Two rows; the second pair of strings has two characters in common.
x <- c("abcde", "lmnop")
y <- c("efghi", "ovmxw")
df <- data.frame(x = x, y = y)

CodePudding user response：

Here is one method: wrap the 'y' column inside [] and add a leading ^, so that every character of 'x' that does not occur in 'y' matches the pattern and is removed by str_remove_all, leaving only the common characters.

library(stringr)
library(dplyr)
# Delete from x every character that is not in the bracket expression built
# from y, leaving only the characters common to both columns.
# (y is pasted into the pattern unescaped, so regex-special characters in y
# would be read as pattern syntax.)
mutate(df, z = str_remove_all(string = x, pattern = sprintf("[^%s]", y)))

-output

      x     y z
1 abcde efghi e
2 lmnop uvmxw m

It also handles multiple characters,

# Same call on df1, whose second row has two characters in common.
mutate(df1, z = str_remove_all(string = x, pattern = sprintf("[^%s]", y)))
      x     y  z
1 abcde efghi  e
2 lmnop ovmxw mo

data

# df: one shared character per row; df1: two shared characters in row 2.
df <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("efghi", "uvmxw")
)
df1 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("efghi", "ovmxw")
)

CodePudding user response：

Using strsplit and intersect in a function with some case handling.

# Characters shared by all columns within each row of `x`.
#
# x: a data frame (or matrix) of strings; one comparison is made per row.
# Returns a character vector with one entry per row when no row has more than
# one shared character (NA where a row shares nothing); otherwise a list with
# one sub-list of single characters per row, usable as a list column.
strintr <- \(x) {
  m <- as.matrix(x)
  # Iterate over rows with lapply() rather than apply(): apply() collapses
  # equal-length results into a matrix, which misaligns results and rows.
  # unname() stops the column names being passed on as argument names, so the
  # columns need not be called `x` and `y`.
  o <- lapply(
    seq_len(nrow(m)),
    \(i) Reduce(intersect, strsplit(unname(m[i, ]), ""))
  )
  # Rows without any common character become NA.
  o[lengths(o) == 0L] <- list(NA_character_)
  if (any(lengths(o) > 1L)) {
    lapply(o, as.list)
  } else {
    vapply(o, \(v) v, character(1), USE.NAMES = FALSE)
  }
}

Usage

# Names of the two columns whose strings are compared.
cols <- c("x", "y")

Using within.

# Each call returns a copy of the data frame with a new column `foo` computed
# by strintr() from the columns named in `cols`; the expected printout follows
# each call.
within(df1, foo <- strintr(df1[cols]))
#       x     y foo
# 1 abcde efghi   e
within(df2, foo <- strintr(df2[cols]))
#       x     y foo
# 1 abcde efghi   e
# 2 lmnop uvmxw   m
within(df3, foo <- strintr(df3[cols]))
#       x      y  foo
# 1 abcde defghi d, e
# 2 lmnop  uvmxw    m
within(df4, foo <- strintr(df4[cols]))
#       x   y  foo
# 1 abcde xyz <NA>
# 2 lmnop xyz <NA>
within(df5, foo <- strintr(df5[cols]))
#       x      y  foo
# 1 abcde defghi d, e
# 2 lmnop    xyz   NA

Using $.

# Store the result of strintr() as a new column `foo` of df3, then print df3.
df3$foo <- strintr(df3[cols])
df3
#       x      y  foo
# 1 abcde defghi d, e
# 2 lmnop  uvmxw    m

Using dplyr::mutate.

# Add the strintr() result as column `fo`; expected printout below.
dplyr::mutate(
  df3,
  fo = strintr(df3[cols])
)
#       x      y   fo
# 1 abcde defghi d, e
# 2 lmnop  uvmxw    m

Note: This won't work with transform due to some kind of bug.

Data:

# Example inputs: single row (df1), one match per row (df2), two matches in
# row 1 (df3), no matches (df4), and a mix of two matches and none (df5).
df1 <- data.frame(x = "abcde", y = "efghi")
df2 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("efghi", "uvmxw")
)
df3 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("defghi", "uvmxw")
)
df4 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("xyz", "xyz")
)
df5 <- data.frame(
  x = c("abcde", "lmnop"),
  y = c("defghi", "xyz")
)