simple but not easy merge task in R-CodePudding

I have two incomplete dataframes: Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.

df_a = data.frame("by" = c(1,2,3),
                  "a" = c(1,2,3),
                  "b" = c(1,NA,3))

df_b = data.frame("by" = c(2,3,4),
                  "a" = c(7,NA,4),
                  "c" = c(2,3,4))

desired result:

  by a  b  c
1  1 1  1 NA
2  2 7 NA  2
3  3 3  3  3
4  4 4 NA  4

Does anyone has a clue how to do this efficiently with R base? Thanks in advance!

CodePudding user response：

This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.

# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]

# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)

# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
  
  x <- names(df)
  if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
    na.x <- is.na(df[[paste0(col_base, ".x")]])
    df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
    df[[paste0(col_base, ".x")]]
  } else {
    df[[col_base]]
  }
  
}

# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))

This will give the following result.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

I know you specified base, by the natural_join() function is worth mentioning.

library(rqdatatable)

natural_join(df_a, df_b, by = "by", jointype = "FULL")

This gives exactly what you want.

  by a  b  c
1  1 1  1 NA
2  2 2 NA  2
3  3 3  3  3
4  4 4 NA  4

CodePudding user response：

get_complete_df<-function(df_a,df_b, by = "by"){
  # df_a has priority!
  overlab_b = df_b[df_b[[by]] %in% df_a[[by]],names(df_b) %in% names(df_a)]
  overlab_a = df_a[df_a[[by]] %in% df_b[[by]],names(df_a) %in% names(df_b)] 
  df_a[df_a[[by]] %in% df_b[[by]],names(df_a) %in% names(df_b)] = ifelse(is.na(overlab_a ),unlist(overlab_b ),unlist(overlab_a )  )
  
  all_names  = unique(c(names(df_a),names(df_b)) )
  all_by     = unique(c(df_a[[by]],df_b[[by]]) )
  df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
  names(df_o) = all_names
  df_o[[by]] = all_by
  
  df_o[df_o[[by]] %in% df_b[[by]],names(df_o) %in% names(df_b)] =
    df_b[df_b[[by]] %in% df_o[[by]],names(df_b) %in% names(df_o)]
  
  df_o[df_o[[by]] %in% df_a[[by]],names(df_o) %in% names(df_a)] =
    df_a[df_a[[by]] %in% df_o[[by]],names(df_a) %in% names(df_o)]
  df_o
  
}

> get_complete_df(df_a,df_b)
by a  b  c
1  1 1  1 NA
2  2 7 NA  2
3  3 3  3  3
4  4 4 NA  4
> get_complete_df(df_a= df_b,df_b = df_a)
by a  c  b
1  1 1 NA  1
2  2 7  2 NA
3  3 3  3 NA
4  4 4  4  3
>

CodePudding user response：

Not the answer with R base. But one possible solution with the package data.table

library(data.table)

setDT(df_a)
setDT(df_b)

df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)


df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

Edit with the help of @r2evans, A much more elegant and efficient solution:

df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#>    by a  b  c
#> 1:  1 1  1 NA
#> 2:  2 2 NA  2
#> 3:  3 3  3  3
#> 4:  4 4 NA  4

^{Created on 2021-10-19 by the reprex package (v2.0.1)}