I have a data frame that looks like this:
> data
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
17 <NA> AmonGCarter0.724 NA NA AmonGCarter0.67 NA NA 155
18 <NA> AmonGCarter0.74 NA NA AmonGCarter0.629 NA NA 568
19 <NA> AmonGCarter0.8 NA NA AmonGCarter0.885 NA NA 456
20 <NA> AmonGCarter0.885 NA NA AmonGCarter0.842 NA NA 244
21 <NA> AmonGCarter0.914 NA NA AmonGCarter0.958 NA NA 36
22 <NA> AmonGCarter0.958 NA NA AmonGCarter1.035 NA NA 417
23 <NA> AmonGCarter0.976 NA NA AmonGCarter1.075 NA NA 58
24 <NA> AmonGCarter1.018 NA NA AmonGCarter0.92 NA NA 603
25 <NA> AmonGCarter1.035 NA NA AmonGCarter0.918 NA NA 194
> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry",
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA,
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82",
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009",
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885",
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018",
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878,
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA,
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966,
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708",
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958",
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885",
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075",
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333,
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681,
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708,
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L,
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))
I am trying to merge this data frame with a data frame that looks like this:
surface_area Lake_name percent2 Lake_name_percent
<dbl> <chr> <dbl> <chr>
1 1159. AmonGCarter 0.724 AmonGCarter0.724
2 1176. AmonGCarter 0.74 AmonGCarter0.74
3 1240. AmonGCarter 0.8 AmonGCarter0.8
4 1329. AmonGCarter 0.885 AmonGCarter0.885
5 1360. AmonGCarter 0.914 AmonGCarter0.914
6 1407. AmonGCarter 0.958 AmonGCarter0.958
7 1426. AmonGCarter 0.976 AmonGCarter0.976
8 1468 AmonGCarter 1.02 AmonGCarter1.018
9 1484. AmonGCarter 1.03 AmonGCarter1.035
> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714,
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724",
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914",
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
When I merge the data sets like this:
merge(data,AmonGCarter.sa, all.y = T)
My data looks like this without the AlanHenry data:
> merge(data,AmonGCarter.sa, all.y = T)
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AmonGCarter AmonGCarter0.724 1159.100 0.724 <NA> NA NA NA
2 AmonGCarter AmonGCarter0.74 1175.800 0.740 <NA> NA NA NA
3 AmonGCarter AmonGCarter0.8 1239.657 0.800 <NA> NA NA NA
4 AmonGCarter AmonGCarter0.885 1329.200 0.885 <NA> NA NA NA
5 AmonGCarter AmonGCarter0.914 1359.600 0.914 <NA> NA NA NA
6 AmonGCarter AmonGCarter0.958 1406.704 0.958 <NA> NA NA NA
7 AmonGCarter AmonGCarter0.976 1426.200 0.976 <NA> NA NA NA
8 AmonGCarter AmonGCarter1.018 1468.000 1.018 <NA> NA NA NA
9 AmonGCarter AmonGCarter1.035 1484.200 1.035 <NA> NA NA NA
> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74",
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958",
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"),
surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2,
1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), row.names = c(NA, -9L), class = "data.frame")
How do I go about adding the data from AmonGCarter
into a merged dataframe that preserves the previous data from AlanHenry?
CodePudding user response:
You can use full_join
in tidyverse
. This creates additional columns (but we control the names with a suffix), but then we can use coalesce
to combine like columns. Then, we can drop the extra columns.
library(tidyverse)
data %>%
full_join(AmonGCarter.sa, by = 'Lake_name_percent', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~ coalesce(., get(
sub('_x', '_y', cur_column())
)),
.names = '{sub("_x", "", {.col})}')) %>%
select(!ends_with('_x') & !ends_with('_y')) %>%
select(names(data))
Another easy way is to make use of natural_join
from the rquery
package. It will replace the NA
values in data
with the values from AmonGCarter.sa
. Then, to get the columns in the original order as data
, you can use select
from dplyr
.
library(rquery)
library(rqdatatable)
library(dplyr)
rquery::natural_join(data, AmonGCarter.sa, by = "Lake_name_percent", jointype = "FULL") %>%
dplyr::select(names(data))
Output
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
7 AmonGCarter AmonGCarter0.724 1159.100 0.724 AmonGCarter0.67 NA NA 155
8 AmonGCarter AmonGCarter0.74 1175.800 0.740 AmonGCarter0.629 NA NA 568
9 AmonGCarter AmonGCarter0.8 1239.657 0.800 AmonGCarter0.885 NA NA 456
10 AmonGCarter AmonGCarter0.885 1329.200 0.885 AmonGCarter0.842 NA NA 244
11 AmonGCarter AmonGCarter0.914 1359.600 0.914 AmonGCarter0.958 NA NA 36
12 AmonGCarter AmonGCarter0.958 1406.704 0.958 AmonGCarter1.035 NA NA 417
13 AmonGCarter AmonGCarter0.976 1426.200 0.976 AmonGCarter1.075 NA NA 58
14 AmonGCarter AmonGCarter1.018 1468.000 1.018 AmonGCarter0.92 NA NA 603
15 AmonGCarter AmonGCarter1.035 1484.200 1.035 AmonGCarter0.918 NA NA 194