Combine different dataframes with some matching columns and rows and some distinct or missing ones-CodePudding

I have two df to be combined into one. These df have some matching columns and rows and some distinct or missing ones.

df1:

df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
    Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
    H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
    X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
    Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
  row.names = c(NA,-12L),  class = c("tbl_df","tbl", "data.frame"))

df2:

df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
    Name = c("LI","NO", "WH", "MA", "BU", "SO"),
    H_A = c("H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12),
    X = c(10, 12, 11, 15, 6, 14),
    Z = c(4, 14, 16, 16, 25, 30)),
  row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))

The desired df combined is this:

df_combined <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
    Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
    H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
    X = c(10, 12, 11, 15, 6, 14, 12, 7, 5, 13, 1, 3),
    Y = c("0", "0", "0", "0", "0", "0", "NA", "NA", "NA", "NA", "NA", "NA"),
    Z = c("4", "14", "16", "16", "25", "30", "NA", "NA", "NA", "NA", "NA", "NA")), 
row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response：

Joins by their nature don't natively fill in positions we have to implement a fix to solve this problem, and although you can use if else statements as shown in the answer above, coalesce() is a much cleaner function to use.

See this post here for another example (could potentially be seen as a repeated question).

Using dplyr to fill in missing values (through a join?)

library(tidyverse)


df_test <- full_join(df1, df2, by = c("id", "Name", "H_A")) %>% 
  mutate(X = coalesce(X.x, X.y),
         W = coalesce(W.x, W.y)) %>% 
  select(id, Name, H_A, W, X, Y, Z)

df_test == df_combined

      id Name  H_A    W    X    Y    Z
 [1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [7,] TRUE TRUE TRUE TRUE TRUE   NA   NA
 [8,] TRUE TRUE TRUE TRUE TRUE   NA   NA
 [9,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[10,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[11,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[12,] TRUE TRUE TRUE TRUE TRUE   NA   NA

NA's expectedly return NA as you can't match two NA's together using a simple == statement.

CodePudding user response：

You can use left_join from dplyr and substitute NA's like this, where I am guessing Id and H_A together make a key value:

library(dplyr)
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
                      Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
                      H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
                      W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
                      X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
                      Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
                 row.names = c(NA,-12L),  class = c("tbl_df","tbl", "data.frame"))
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
                      Name = c("LI","NO", "WH", "MA", "BU", "SO"),
                      H_A = c("H", "A", "H", "A", "H", "A"),
                      W = c(15, 13, 5, 13, 9, 12),
                      X = c(10, 12, 11, 15, 6, 14),
                      Z = c(4, 14, 16, 16, 25, 30)),
                 row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))

df_combined <- left_join(df1, 
                         df2 %>% 
                           select(id, H_A, "df2_X" = X, Z))  %>%
  mutate(X = if_else(is.na(X), df2_X, X)) %>% 
  select(-df2_X)
#> Joining, by = c("id", "H_A")

df_combined
#> # A tibble: 12 × 7
#>       id Name  H_A       W     X     Y     Z
#>    <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#>  1     1 LI    H        15    10     0     4
#>  2     1 NO    A        13    12     0    14
#>  3     2 WH    H         5    11     0    16
#>  4     2 MA    A        13    15     0    16
#>  5     3 BU    H         9     6     0    25
#>  6     3 SO    A        12    14     0    30
#>  7     4 FO    H        10    12    NA    NA
#>  8     4 AT    A        13     7    NA    NA
#>  9     5 CO    H         1     5    NA    NA
#> 10     5 IN    A         8    13    NA    NA
#> 11     6 SP    H         4     1    NA    NA
#> 12     6 CE    A         2     3    NA    NA

CodePudding user response：

data.table approach

library(data.table)
# set to data.table format
setDT(df1); setDT(df2)
# perform an update join, overwriting NA-values in W, X and Y, and 
# adding Z, based on key-columns ID, Name and H_A
df1[df2, `:=`(W = ifelse(is.na(W), i.W, W), 
              X = ifelse(is.na(X), i.X, X),
              Y = ifelse(is.na(Y), i.Y, Y),
              Z = i.Z),
    on = .(id, Name, H_A)][]

# id Name H_A  W  X  Y  Z
# 1:  1   LI   H 15 10  0  4
# 2:  1   NO   A 13 12  0 14
# 3:  2   WH   H  5 11  0 16
# 4:  2   MA   A 13 15  0 16
# 5:  3   BU   H  9  6  0 25
# 6:  3   SO   A 12 14  0 30
# 7:  4   FO   H 10 12 NA NA
# 8:  4   AT   A 13  7 NA NA
# 9:  5   CO   H  1  5 NA NA
#10:  5   IN   A  8 13 NA NA
#11:  6   SP   H  4  1 NA NA
#12:  6   CE   A  2  3 NA NA