I have two df to be combined into one. These df have some matching columns and rows and some distinct or missing ones.
df1:
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
row.names = c(NA,-12L), class = c("tbl_df","tbl", "data.frame"))
df2:
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
Name = c("LI","NO", "WH", "MA", "BU", "SO"),
H_A = c("H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12),
X = c(10, 12, 11, 15, 6, 14),
Z = c(4, 14, 16, 16, 25, 30)),
row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))
The desired df combined is this:
df_combined <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
X = c(10, 12, 11, 15, 6, 14, 12, 7, 5, 13, 1, 3),
Y = c("0", "0", "0", "0", "0", "0", "NA", "NA", "NA", "NA", "NA", "NA"),
Z = c("4", "14", "16", "16", "25", "30", "NA", "NA", "NA", "NA", "NA", "NA")),
row.names = c(NA, -12L), class = c("tbl_df", "tbl", "data.frame"))
CodePudding user response:
Joins by their nature don't natively fill in positions we have to implement a fix to solve this problem, and although you can use if else statements as shown in the answer above, coalesce()
is a much cleaner function to use.
See this post here for another example (could potentially be seen as a repeated question).
Using dplyr to fill in missing values (through a join?)
library(tidyverse)
df_test <- full_join(df1, df2, by = c("id", "Name", "H_A")) %>%
mutate(X = coalesce(X.x, X.y),
W = coalesce(W.x, W.y)) %>%
select(id, Name, H_A, W, X, Y, Z)
df_test == df_combined
id Name H_A W X Y Z
[1,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[2,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[3,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[4,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[5,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[6,] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[7,] TRUE TRUE TRUE TRUE TRUE NA NA
[8,] TRUE TRUE TRUE TRUE TRUE NA NA
[9,] TRUE TRUE TRUE TRUE TRUE NA NA
[10,] TRUE TRUE TRUE TRUE TRUE NA NA
[11,] TRUE TRUE TRUE TRUE TRUE NA NA
[12,] TRUE TRUE TRUE TRUE TRUE NA NA
NA's expectedly return NA as you can't match two NA's together using a simple ==
statement.
CodePudding user response:
You can use left_join
from dplyr
and substitute NA's like this, where I am guessing Id
and H_A
together make a key value:
library(dplyr)
df1 <- structure(list(id = c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6),
Name = c("LI","NO","WH","MA","BU","SO","FO","AT","CO","IN","SP","CE"),
H_A = c("H", "A", "H", "A", "H", "A", "H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12, 10, 13, 1, 8, 4, 2),
X = c(NA, NA, NA, NA, NA, NA, 12, 7, 5, 13, 1, 3),
Y = c(0, 0, 0, 0, 0,0, NA, NA, NA, NA, NA, NA)),
row.names = c(NA,-12L), class = c("tbl_df","tbl", "data.frame"))
df2 <- structure(list(id = c(1, 1, 2, 2, 3, 3),
Name = c("LI","NO", "WH", "MA", "BU", "SO"),
H_A = c("H", "A", "H", "A", "H", "A"),
W = c(15, 13, 5, 13, 9, 12),
X = c(10, 12, 11, 15, 6, 14),
Z = c(4, 14, 16, 16, 25, 30)),
row.names = c(NA,-6L),class = c("tbl_df", "tbl", "data.frame"))
df_combined <- left_join(df1,
df2 %>%
select(id, H_A, "df2_X" = X, Z)) %>%
mutate(X = if_else(is.na(X), df2_X, X)) %>%
select(-df2_X)
#> Joining, by = c("id", "H_A")
df_combined
#> # A tibble: 12 × 7
#> id Name H_A W X Y Z
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 LI H 15 10 0 4
#> 2 1 NO A 13 12 0 14
#> 3 2 WH H 5 11 0 16
#> 4 2 MA A 13 15 0 16
#> 5 3 BU H 9 6 0 25
#> 6 3 SO A 12 14 0 30
#> 7 4 FO H 10 12 NA NA
#> 8 4 AT A 13 7 NA NA
#> 9 5 CO H 1 5 NA NA
#> 10 5 IN A 8 13 NA NA
#> 11 6 SP H 4 1 NA NA
#> 12 6 CE A 2 3 NA NA
CodePudding user response:
data.table
approach
library(data.table)
# set to data.table format
setDT(df1); setDT(df2)
# perform an update join, overwriting NA-values in W, X and Y, and
# adding Z, based on key-columns ID, Name and H_A
df1[df2, `:=`(W = ifelse(is.na(W), i.W, W),
X = ifelse(is.na(X), i.X, X),
Y = ifelse(is.na(Y), i.Y, Y),
Z = i.Z),
on = .(id, Name, H_A)][]
# id Name H_A W X Y Z
# 1: 1 LI H 15 10 0 4
# 2: 1 NO A 13 12 0 14
# 3: 2 WH H 5 11 0 16
# 4: 2 MA A 13 15 0 16
# 5: 3 BU H 9 6 0 25
# 6: 3 SO A 12 14 0 30
# 7: 4 FO H 10 12 NA NA
# 8: 4 AT A 13 7 NA NA
# 9: 5 CO H 1 5 NA NA
#10: 5 IN A 8 13 NA NA
#11: 6 SP H 4 1 NA NA
#12: 6 CE A 2 3 NA NA