Home > Enterprise >  Combine different dataframes with some matching columns and rows and some distinct or missing ones
Combine different dataframes with some matching columns and rows and some distinct or missing ones

Time:07-01

I have two df to be combined into one. These df have some matching columns and rows and some distinct or missing ones.

df1:

df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
    Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
    H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
    X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
    Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
  row.names = c(NA,-12L),  class = c("tbl_df","tbl", "data.frame"))

df2:

df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
    Name = c("LI","NO", "WH", "MA", "BU", "SO"),
    H_A = c("H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12),
    X = c(10, 12, 11, 15, 6, 14),
    Z = c(4, 14, 16, 16, 25, 30)),
  row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))

The desired df combined is this:

df_combined <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
    Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
    H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
    W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
    X = c(10, 12, 11, 15, 6, 14, 12, 7, 5, 13, 1, 3),
    Y = c("0", "0", "0", "0", "0", "0", "NA", "NA", "NA", "NA", "NA", "NA"),
    Z = c("4", "14", "16", "16", "25", "30", "NA", "NA", "NA", "NA", "NA", "NA")), 
row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"))
 

CodePudding user response:

Joins by their nature don't natively fill in positions we have to implement a fix to solve this problem, and although you can use if else statements as shown in the answer above, coalesce() is a much cleaner function to use.

See this post here for another example (could potentially be seen as a repeated question).

Using dplyr to fill in missing values (through a join?)

library(tidyverse)


df_test <- full_join(df1, df2, by = c("id", "Name", "H_A")) %>% 
  mutate(X = coalesce(X.x, X.y),
         W = coalesce(W.x, W.y)) %>% 
  select(id, Name, H_A, W, X, Y, Z)

df_test == df_combined

      id Name  H_A    W    X    Y    Z
 [1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
 [7,] TRUE TRUE TRUE TRUE TRUE   NA   NA
 [8,] TRUE TRUE TRUE TRUE TRUE   NA   NA
 [9,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[10,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[11,] TRUE TRUE TRUE TRUE TRUE   NA   NA
[12,] TRUE TRUE TRUE TRUE TRUE   NA   NA

NA's expectedly return NA as you can't match two NA's together using a simple == statement.

CodePudding user response:

You can use left_join from dplyr and substitute NA's like this, where I am guessing Id and H_A together make a key value:

library(dplyr)
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
                      Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
                      H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
                      W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
                      X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
                      Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
                 row.names = c(NA,-12L),  class = c("tbl_df","tbl", "data.frame"))
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
                      Name = c("LI","NO", "WH", "MA", "BU", "SO"),
                      H_A = c("H", "A", "H", "A", "H", "A"),
                      W = c(15, 13, 5, 13, 9, 12),
                      X = c(10, 12, 11, 15, 6, 14),
                      Z = c(4, 14, 16, 16, 25, 30)),
                 row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))

df_combined <- left_join(df1, 
                         df2 %>% 
                           select(id, H_A, "df2_X" = X, Z))  %>%
  mutate(X = if_else(is.na(X), df2_X, X)) %>% 
  select(-df2_X)
#> Joining, by = c("id", "H_A")

df_combined
#> # A tibble: 12 × 7
#>       id Name  H_A       W     X     Y     Z
#>    <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#>  1     1 LI    H        15    10     0     4
#>  2     1 NO    A        13    12     0    14
#>  3     2 WH    H         5    11     0    16
#>  4     2 MA    A        13    15     0    16
#>  5     3 BU    H         9     6     0    25
#>  6     3 SO    A        12    14     0    30
#>  7     4 FO    H        10    12    NA    NA
#>  8     4 AT    A        13     7    NA    NA
#>  9     5 CO    H         1     5    NA    NA
#> 10     5 IN    A         8    13    NA    NA
#> 11     6 SP    H         4     1    NA    NA
#> 12     6 CE    A         2     3    NA    NA

CodePudding user response:

data.table approach

library(data.table)
# set to data.table format
setDT(df1); setDT(df2)
# perform an update join, overwriting NA-values in W, X and Y, and 
# adding Z, based on key-columns ID, Name and H_A
df1[df2, `:=`(W = ifelse(is.na(W), i.W, W), 
              X = ifelse(is.na(X), i.X, X),
              Y = ifelse(is.na(Y), i.Y, Y),
              Z = i.Z),
    on = .(id, Name, H_A)][]

# id Name H_A  W  X  Y  Z
# 1:  1   LI   H 15 10  0  4
# 2:  1   NO   A 13 12  0 14
# 3:  2   WH   H  5 11  0 16
# 4:  2   MA   A 13 15  0 16
# 5:  3   BU   H  9  6  0 25
# 6:  3   SO   A 12 14  0 30
# 7:  4   FO   H 10 12 NA NA
# 8:  4   AT   A 13  7 NA NA
# 9:  5   CO   H  1  5 NA NA
#10:  5   IN   A  8 13 NA NA
#11:  6   SP   H  4  1 NA NA
#12:  6   CE   A  2  3 NA NA
  • Related