R how to combine two data frame based on the compare-CodePudding

I have two dataframe like this

    ```
             v1.       v2
    1     a,b,c        1,2,3
    2     d,e,f,g      4,6
    3     h,k,v,x      9,0
    ```

    ```
            v1          v2
    1       a           AA
    2       c           CC
    3       d           DD
    ```

after combine

    ```
           v1           v2            v3      
    1    a,b,c         1,2,3         AA,CC
    2.   d,e,f,g       4,6           DD
    3    h,k,v,x       9,0 
    ```

I dont know how to perform like this , any comment would be appreciated

CodePudding user response：

library(tidyverse)
library(fuzzyjoin)
df1 %>%
  regex_left_join(., df2, by = c(v1 = "v1")) %>%
  group_by(v1 = v1.x, v2 = v2.x) %>%
  summarise(v3 = paste0(v2.y, collapse = ","))
#   v1      v2    v3   
#   <chr>   <chr> <chr>
# 1 a,b,c   1,2,3 AA,CC
# 2 d,e,f,g 4,6   DD   
# 3 h,k,v,x 9,0   NA

Sample data used

df1 <- read.table(text = "v1       v2
   a,b,c        1,2,3
     d,e,f,g      4,6
     h,k,v,x      9,0", header = TRUE)


df2 <- read.table(text = "v1          v2
       a           AA
       c           CC
       d           DD", header = TRUE)

CodePudding user response：

Here is a very long tidyverse pipe. There should be simpler solutions.

library(dplyr)
library(tidyr)

df1 %>%
  mutate(id = row_number()) %>%
  separate(v1, into = c("v1a", "v1b", "v1c", "v1d"), fill = "right") %>%
  pivot_longer(
    cols = starts_with("v1"),
    names_to = "v1_col",
    values_to = "v1"
  ) %>%
  na.omit() %>%
  separate(v2, into = c("v2a", "v2b", "v2c"), fill = "right") %>%
  pivot_longer(
    cols = starts_with("v2"),
    names_to = "v2_col",
    values_to = "v2_value"
  ) %>%
  na.omit() %>%
  select(-ends_with("col")) %>%
  left_join(df2, by = "v1") %>%
  group_by(id, v1, v2) %>%
  summarise(v2_value = paste(v2_value, collapse = ","),
            .groups = "drop") %>%
  group_by(id, v2_value) %>%
  summarise(v1 = paste(v1, collapse = ","),
            v3 = paste(na.omit(v2), collapse = ","), 
            .groups = "drop") %>%
  ungroup() %>%
  select(-id) %>%
  rename(v2 = v2_value) %>%
  relocate(v2, .after = v1)
## A tibble: 3 x 3
#  v1      v2    v3     
#  <chr>   <chr> <chr>  
#1 a,b,c   1,2,3 "AA,CC"
#2 d,e,f,g 4,6   "DD"   
#3 h,k,v,x 9,0   ""