Merge Tables in R-CodePudding

I'm trying to merge two data.frames in R so that the values in test_1 are overwritten if they exist in test_2

Every time I try and join/merge them I end up with a bunch of NAs.

I can't work out a simple logic for if.na then use test_2. Is that what I'd need to do, or is there an easier way?

test_1 = structure(list(rn = c("Red", "Blue", 
                      "Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(0, 0, 0, 0, 0, 0), X2022.08.02 = c(0, 0, 0, 
                                                      0, 0, 0), X2022.08.03 = c(0, 0, 0, 0, 0, 0), X2022.08.04 = c(0, 
                                                                                                                   0, 0, 0, 0, 0), X2022.08.05 = c(0, 0, 0, 0, 0, 0), X2022.08.08 = c(0, 
                                                                                                                                                                                      0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0, 
                                                                                                                                                                                                                                                         0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0, 
                                                                                                                                                                                                                                                                                                                            0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                               0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                  0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           0, 0, 0, 0, 0), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                6L), class = "data.frame")



test_2 = structure(list(rn = c("Blue", "Pink", 
                               "Red", "Yellow", "Green", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.03 = c(10, 10, 10, 
                                                      10, 10, 10), X2022.08.04 = c(10, 10, 10, 10, 10, 10), X2022.08.05 = c(10, 
                                                                                                                   10, 10, 10, 10, 10), X2022.08.26 = c(10, 10, 10, 10, 10, 10)), row.names = c(NA, 
                                                                                                                                                                                     6L), class = "data.frame")

the desired output would look like this:




test_output = structure(list(rn = c("Red", "Blue", 
                                             "Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.02 = c(0, 0, 0, 
                                                      0, 0, 0), X2022.08.03 = c(10, 10, 10, 10, 10, 10), X2022.08.04 = c(10, 
                                                                                                                   10, 10, 10, 10, 10), X2022.08.05 = c(10, 10, 10, 10, 10, 10), X2022.08.08 = c(0, 
                                                                                                                                                                                      0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0, 
                                                                                                                                                                                                                                                         0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0, 
                                                                                                                                                                                                                                                                                                                            0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                               0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                  0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(10, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           10, 10, 10, 10, 10), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                6L), class = "data.frame")

CodePudding user response：

If you use the column names of the source as indices on both sides of the assignment, you can get replacement with

test_1[ ,colnames(test_2)] <- test_2[ , colnames(test_2)]

test_1
      rn X2022.08.01 X2022.08.02 X2022.08.03 X2022.08.04 X2022.08.05 X2022.08.08 X2022.08.09
1   Blue          10           0          10          10          10           0           0
2   Pink          10           0          10          10          10           0           0
3    Red          10           0          10          10          10           0           0
4 Yellow          10           0          10          10          10           0           0
5  Green          10           0          10          10          10           0           0
6   Gold          10           0          10          10          10           0           0
  X2022.08.10 X2022.08.11 X2022.08.12 X2022.08.15 X2022.08.16 X2022.08.17 X2022.08.18 X2022.08.19
1           0           0           0           0           0           0           0           0
2           0           0           0           0           0           0           0           0
3           0           0           0           0           0           0           0           0
4           0           0           0           0           0           0           0           0
5           0           0           0           0           0           0           0           0
6           0           0           0           0           0           0           0           0
  X2022.08.22 X2022.08.23 X2022.08.24 X2022.08.25 X2022.08.26 X2022.08.29 X2022.08.30 X2022.08.31
1           0           0           0           0          10           0           0           0
2           0           0           0           0          10           0           0           0
3           0           0           0           0          10           0           0           0
4           0           0           0           0          10           0           0           0
5           0           0           0           0          10           0           0           0
6           0           0           0           0          10           0           0           0

I doubt there could be a more simple method.. It could even work with a more limited number of rows as long as the indices area proper subset of the column and row names of the target matrix or dataframe. Note: I'm not understanding the issue with NA's. There were not NA's in either structure.

CodePudding user response：

There is probably a more elegant way of achieving this, but is this producing what you expect?

library(tidyverse)

# transpose both tables
test_1 <- test_1[-1] %>%
  t() %>%
  as.data.frame() %>%
  setNames(test_1[,1]) %>%
  rownames_to_column("rn")

test_2 <- test_2[-1] %>%
  t() %>%
  as.data.frame() %>%
  setNames(test_2[,1]) %>%
  rownames_to_column("rn")

# join the antijoin to test_2 to "replace" the values
test_1<- test_2 %>% 
  full_join(test_1 %>%
              anti_join(test_2, by = "rn"))

# back to the original layout
test_1 <- test_1[-1] %>%
  t() %>%
  as.data.frame() %>%
  setNames(test_1[,1]) %>%
  rownames_to_column("rn")

CodePudding user response：

One option is to use the powerjoin package and specify how to handle conflicts between tables:

library(dplyr)
library(powerjoin)

test_1 = structure(list(rn = c("Red", "Blue", 
                               "Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(0, 0, 0, 0, 0, 0), X2022.08.02 = c(0, 0, 0, 
                                                      0, 0, 0), X2022.08.03 = c(0, 0, 0, 0, 0, 0), X2022.08.04 = c(0, 
                                                                                                                   0, 0, 0, 0, 0), X2022.08.05 = c(0, 0, 0, 0, 0, 0), X2022.08.08 = c(0, 
                                                                                                                                                                                      0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0, 
                                                                                                                                                                                                                                                         0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0, 
                                                                                                                                                                                                                                                                                                                            0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                               0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                  0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           0, 0, 0, 0, 0), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                6L), class = "data.frame")



test_2 = structure(list(rn = c("Blue", "Pink", 
                               "Red", "Yellow", "Green", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.03 = c(10, 10, 10, 
                                                            10, 10, 10), X2022.08.04 = c(10, 10, 10, 10, 10, 10), X2022.08.05 = c(10, 
                                                                                                                                  10, 10, 10, 10, 10), X2022.08.26 = c(10, 10, 10, 10, 10, 10)), row.names = c(NA, 
                                                                                                                                                                                                               6L), class = "data.frame")
test_output = structure(list(rn = c("Red", "Blue", 
                                    "Green", "Yellow", "Pink", "Gold"
), X2022.08.01 = c(10, 10, 10, 10, 10, 10), X2022.08.02 = c(0, 0, 0, 
                                                            0, 0, 0), X2022.08.03 = c(10, 10, 10, 10, 10, 10), X2022.08.04 = c(10, 
                                                                                                                               10, 10, 10, 10, 10), X2022.08.05 = c(10, 10, 10, 10, 10, 10), X2022.08.08 = c(0, 
                                                                                                                                                                                                             0, 0, 0, 0, 0), X2022.08.09 = c(0, 0, 0, 0, 0, 0), X2022.08.10 = c(0, 
                                                                                                                                                                                                                                                                                0, 0, 0, 0, 0), X2022.08.11 = c(0, 0, 0, 0, 0, 0), X2022.08.12 = c(0, 
                                                                                                                                                                                                                                                                                                                                                   0, 0, 0, 0, 0), X2022.08.15 = c(0, 0, 0, 0, 0, 0), X2022.08.16 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                      0, 0, 0, 0, 0), X2022.08.17 = c(0, 0, 0, 0, 0, 0), X2022.08.18 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         0, 0, 0, 0, 0), X2022.08.19 = c(0, 0, 0, 0, 0, 0), X2022.08.22 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            0, 0, 0, 0, 0), X2022.08.23 = c(0, 0, 0, 0, 0, 0), X2022.08.24 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               0, 0, 0, 0, 0), X2022.08.25 = c(0, 0, 0, 0, 0, 0), X2022.08.26 = c(10, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  10, 10, 10, 10, 10), X2022.08.29 = c(0, 0, 0, 0, 0, 0), X2022.08.30 = c(0, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          0, 0, 0, 0, 0), X2022.08.31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            6L), class = "data.frame")


output <- power_left_join(test_2, test_1, by = "rn", conflict = coalesce_xy)
all_equal(test_output, output)
#> [1] TRUE

^{Created on 2022-10-24 by the reprex package (v2.0.1)}