Anti_join between df1 and df2 but how to change all mismatch in df2 to NA-CodePudding

Below are my two dataframes, df1 and df2

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))

id     text response
1  632592651     asdf        y
2  633322173      cat        y
3  634703802      dog        y
4  634927873    mouse        n
5  635812953 elephant        n
6  636004739    goose        y
7  636101211      rat        y
8  636157799     mice        n
9  636263106    kitty        n
10 636752420   kitten        y

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))

 id           text volume
1   632592651       asdf_xyz   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  636809222      tiger_xyz      6
12 2004722036           lion    345
13 2004894388        leopard     23
14 2005045755        ostrich      2
15 2005535472       kangaroo   4778
16 2005630542       platypus    234
17 2005788781           fish   8675
18 2005809679        reptile   3459
19 2005838317        mammals      8
20 2005866692 amphibians_xyz      9

How do I change the non-matching items from row id1:20 of df2 to NA (i.e. all of them as no matching with df1) and the column 'text' (i.e. asdf_xyz) of id1 to NA?

I have tried

library(dplyr)

df3 <- df2 %>%
  anti_join(df1, by=c("id"))

id           text volume
1   636809222      tiger_xyz      6
2  2004722036           lion    345
3  2004894388        leopard     23
4  2005045755        ostrich      2
5  2005535472       kangaroo   4778
6  2005630542       platypus    234
7  2005788781           fish   8675
8  2005809679        reptile   3459
9  2005838317        mammals      8
10 2005866692 amphibians_xyz      9

df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA

(Doing this one by one because I couldn't find solution how to change the entire value of the dataframe to NA)

id text volume
1  <NA> <NA>   <NA>
2  <NA> <NA>   <NA>
3  <NA> <NA>   <NA>
4  <NA> <NA>   <NA>
5  <NA> <NA>   <NA>
6  <NA> <NA>   <NA>
7  <NA> <NA>   <NA>
8  <NA> <NA>   <NA>
9  <NA> <NA>   <NA>
10 <NA> <NA>   <NA>

and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)

inner_join(x = df1, 
           y = df2, 
           by = "id") %>%
  mutate_if(is.factor, as.character) %>%
  mutate(text = ifelse(test = text.x != text.y, 
                       yes = NA, 
                       no = text.x)) %>%
  select(id, text, response, volume)

id     text response volume
1  632592651     <NA>        y   1234
2  633322173      cat        y    432
3  634703802      dog        y    324
4  634927873    mouse        n    333
5  635812953 elephant        n   2223
6  636004739    goose        y 412346
7  636101211      rat        y   7456
8  636157799     mice        n   3456
9  636263106    kitty        n   2345
10 636752420   kitten        y   2345

but not sure how to replace df2 with df3 and df4. The desired output is shown below:

id           text volume
1   632592651       NA   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  NA               NA      NA
12  NA               NA      NA
13  NA               NA      NA
14  NA               NA      NA
15  NA               NA      NA
16  NA               NA      NA
17  NA               NA      NA
18  NA               NA      NA
19  NA               NA      NA
20  NA               NA      NA

Can someone help please? If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?

CodePudding user response：

One potential solution for dealing with conflicts is to use the powerjoin package, e.g.

library(tidyverse)

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                  text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                  response=c("y","y","y","n","n","y","y","n","n","y"))

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))

expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
                            NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), text = c(NA, "cat", "dog", "mouse", "elephant", "goose", 
                             "rat", "mice", "kitty", "kitten", volume = c(1234, 432, 324, 333, 2223, 412346, 7456, 3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))

library(powerjoin)
joined_df <- power_full_join(df1, df2, by = c("id"),
                             conflict = rw ~ ifelse(.x != .y,
                                                    NA_integer_, 
                                                    .x))

final_df <- joined_df %>%
  mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
  select(id, text, volume)
final_df
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345
#> 11      <NA>     <NA>     NA
#> 12      <NA>     <NA>     NA
#> 13      <NA>     <NA>     NA
#> 14      <NA>     <NA>     NA
#> 15      <NA>     <NA>     NA
#> 16      <NA>     <NA>     NA
#> 17      <NA>     <NA>     NA
#> 18      <NA>     <NA>     NA
#> 19      <NA>     <NA>     NA
#> 20      <NA>     <NA>     NA

all_equal(final_df, expected_outcome)
#> [1] TRUE

^{Created on 2022-07-01 by the reprex package (v2.0.1)}

You can also use different joins to identify unique entries in df1 and entries common to both df1 and df2, e.g.

library(dplyr)

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                  text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                  response=c("y","y","y","n","n","y","y","n","n","y"))

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))

unique_to_df1 <- df1 %>%
  anti_join(df2)
#> Joining, by = c("id", "text")

unique_to_df1
#>          id text response
#> 1 632592651 asdf        y

in_df1_and_df2 <- df1 %>%
  semi_join(df2)
#> Joining, by = c("id", "text")

in_df1_and_df2
#>          id     text response
#> 1 633322173      cat        y
#> 2 634703802      dog        y
#> 3 634927873    mouse        n
#> 4 635812953 elephant        n
#> 5 636004739    goose        y
#> 6 636101211      rat        y
#> 7 636157799     mice        n
#> 8 636263106    kitty        n
#> 9 636752420   kitten        y

^{Created on 2022-07-01 by the reprex package (v2.0.1)}

CodePudding user response：

data.table version using an !antijoin, and overwriting := all columns/rows returned in df2 with an NA (recycled list .(NA) to all columns).
Then looping over all the common variables and overwriting any values which don't match by id:

library(data.table)
setDT(df1)
setDT(df2)

df2[!df1, on=.(id), names(df2) := .(NA)]
idvars <- "id"
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
for (i in compvars) {
    df2[!df1, on=c(idvars,i), (i) := NA]
}

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345
#11:      <NA>     <NA>   <NA>
#12:      <NA>     <NA>   <NA>
#13:      <NA>     <NA>   <NA>
#14:      <NA>     <NA>   <NA>
#15:      <NA>     <NA>   <NA>
#16:      <NA>     <NA>   <NA>
#17:      <NA>     <NA>   <NA>
#18:      <NA>     <NA>   <NA>
#19:      <NA>     <NA>   <NA>
#20:      <NA>     <NA>   <NA>