Home > Mobile >  Merging part of data frame into another data frame while preserving previous data
Merging part of data frame into another data frame while preserving previous data

Time:12-22

I have a data frame that looks like this:

> data
   Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3   X
1  AlanHenry    AlanHenry0.705     2109.350    0.705     AlanHenry0.708          2116.203    0.708 277
2  AlanHenry     AlanHenry0.82     2354.878    0.820     AlanHenry0.821          2355.950    0.821 303
3  AlanHenry     AlanHenry0.82     2354.878    0.820     AlanHenry0.856          2426.508    0.856 341
4  AlanHenry    AlanHenry0.912     2553.133    0.912     AlanHenry0.886          2505.336    0.886 243
5  AlanHenry    AlanHenry0.966     2660.238    0.966     AlanHenry0.958          2637.476    0.958 211
6  AlanHenry    AlanHenry1.009     2779.621    1.009     AlanHenry0.989          2694.901    0.989 173
17      <NA>  AmonGCarter0.724           NA       NA    AmonGCarter0.67                NA       NA 155
18      <NA>   AmonGCarter0.74           NA       NA   AmonGCarter0.629                NA       NA 568
19      <NA>    AmonGCarter0.8           NA       NA   AmonGCarter0.885                NA       NA 456
20      <NA>  AmonGCarter0.885           NA       NA   AmonGCarter0.842                NA       NA 244
21      <NA>  AmonGCarter0.914           NA       NA   AmonGCarter0.958                NA       NA  36
22      <NA>  AmonGCarter0.958           NA       NA   AmonGCarter1.035                NA       NA 417
23      <NA>  AmonGCarter0.976           NA       NA   AmonGCarter1.075                NA       NA  58
24      <NA>  AmonGCarter1.018           NA       NA    AmonGCarter0.92                NA       NA 603
25      <NA>  AmonGCarter1.035           NA       NA   AmonGCarter0.918                NA       NA 194


> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry", 
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA, 
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82", 
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009", 
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", 
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", 
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878, 
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA, 
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966, 
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708", 
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958", 
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885", 
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075", 
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333, 
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681, 
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708, 
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA, 
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L, 
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))

I am trying to merge this data frame with a data frame that looks like this:

  surface_area Lake_name   percent2 Lake_name_percent
         <dbl> <chr>          <dbl> <chr>            
1        1159. AmonGCarter    0.724 AmonGCarter0.724 
2        1176. AmonGCarter    0.74  AmonGCarter0.74  
3        1240. AmonGCarter    0.8   AmonGCarter0.8   
4        1329. AmonGCarter    0.885 AmonGCarter0.885 
5        1360. AmonGCarter    0.914 AmonGCarter0.914 
6        1407. AmonGCarter    0.958 AmonGCarter0.958 
7        1426. AmonGCarter    0.976 AmonGCarter0.976 
8        1468  AmonGCarter    1.02  AmonGCarter1.018 
9        1484. AmonGCarter    1.03  AmonGCarter1.035 

> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714, 
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter", 
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", 
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724, 
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724", 
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", 
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724, 
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
    1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

When I merge the data sets like this:

merge(data,AmonGCarter.sa, all.y = T)

My data looks like this without the AlanHenry data:

> merge(data,AmonGCarter.sa, all.y = T)
    Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3  X
1 AmonGCarter  AmonGCarter0.724     1159.100    0.724               <NA>                NA       NA NA
2 AmonGCarter   AmonGCarter0.74     1175.800    0.740               <NA>                NA       NA NA
3 AmonGCarter    AmonGCarter0.8     1239.657    0.800               <NA>                NA       NA NA
4 AmonGCarter  AmonGCarter0.885     1329.200    0.885               <NA>                NA       NA NA
5 AmonGCarter  AmonGCarter0.914     1359.600    0.914               <NA>                NA       NA NA
6 AmonGCarter  AmonGCarter0.958     1406.704    0.958               <NA>                NA       NA NA
7 AmonGCarter  AmonGCarter0.976     1426.200    0.976               <NA>                NA       NA NA
8 AmonGCarter  AmonGCarter1.018     1468.000    1.018               <NA>                NA       NA NA
9 AmonGCarter  AmonGCarter1.035     1484.200    1.035               <NA>                NA       NA NA


> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter", 
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", 
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74", 
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958", 
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"), 
    surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2, 
    1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724, 
    0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_
    ), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), row.names = c(NA, -9L), class = "data.frame")

How do I go about adding the data from AmonGCarter into a merged dataframe that preserves the previous data from AlanHenry?

CodePudding user response:

You can use full_join in tidyverse. This creates additional columns (but we control the names with a suffix), but then we can use coalesce to combine like columns. Then, we can drop the extra columns.

library(tidyverse)

data %>%
  full_join(AmonGCarter.sa, by = 'Lake_name_percent', suffix = c('_x', '_y')) %>%
  mutate(across(ends_with('_x'), ~ coalesce(., get(
    sub('_x', '_y', cur_column())
  )),
  .names = '{sub("_x", "", {.col})}')) %>%
  select(!ends_with('_x') & !ends_with('_y')) %>%
  select(names(data))

Another easy way is to make use of natural_join from the rquery package. It will replace the NA values in data with the values from AmonGCarter.sa. Then, to get the columns in the original order as data, you can use select from dplyr.

library(rquery)
library(rqdatatable)
library(dplyr)

rquery::natural_join(data, AmonGCarter.sa, by = "Lake_name_percent", jointype = "FULL") %>%
  dplyr::select(names(data))

Output

     Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3   X
1    AlanHenry    AlanHenry0.705     2109.350    0.705     AlanHenry0.708          2116.203    0.708 277
2    AlanHenry     AlanHenry0.82     2354.878    0.820     AlanHenry0.821          2355.950    0.821 303
3    AlanHenry     AlanHenry0.82     2354.878    0.820     AlanHenry0.856          2426.508    0.856 341
4    AlanHenry    AlanHenry0.912     2553.133    0.912     AlanHenry0.886          2505.336    0.886 243
5    AlanHenry    AlanHenry0.966     2660.238    0.966     AlanHenry0.958          2637.476    0.958 211
6    AlanHenry    AlanHenry1.009     2779.621    1.009     AlanHenry0.989          2694.901    0.989 173
7  AmonGCarter  AmonGCarter0.724     1159.100    0.724    AmonGCarter0.67                NA       NA 155
8  AmonGCarter   AmonGCarter0.74     1175.800    0.740   AmonGCarter0.629                NA       NA 568
9  AmonGCarter    AmonGCarter0.8     1239.657    0.800   AmonGCarter0.885                NA       NA 456
10 AmonGCarter  AmonGCarter0.885     1329.200    0.885   AmonGCarter0.842                NA       NA 244
11 AmonGCarter  AmonGCarter0.914     1359.600    0.914   AmonGCarter0.958                NA       NA  36
12 AmonGCarter  AmonGCarter0.958     1406.704    0.958   AmonGCarter1.035                NA       NA 417
13 AmonGCarter  AmonGCarter0.976     1426.200    0.976   AmonGCarter1.075                NA       NA  58
14 AmonGCarter  AmonGCarter1.018     1468.000    1.018    AmonGCarter0.92                NA       NA 603
15 AmonGCarter  AmonGCarter1.035     1484.200    1.035   AmonGCarter0.918                NA       NA 194
  • Related