I have the data frame below:
rp<-structure(list(ProductFamily = c("PF_1", "PF_10", "PF_10", "PF_100",
"PF_100", "PF_101", "PF_102", "PF_102", "PF_102", "PF_102", "PF_103",
"PF_103", "PF_104", "PF_105", "PF_106", "PF_106", "PF_106", "PF_106",
"PF_107", "PF_108", "PF_109", "PF_11", "PF_110", "PF_110", "PF_111"
), TreatmenArea = c("TA_7", "TA_2", "TA_2", "TA_6", "TA_6", "TA_2",
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2",
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_7",
"TA_2", "TA_2", "TA_7"), Subarea = c("SA_16", "SA_5", "SA_5",
"SA_15", "SA_15", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6",
"SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6",
"SA_6", "SA_6", "SA_17", "SA_6", "SA_6", "SA_22"), Ind = c("volume_1",
"volume_5", "volume_6", "volume_52", "volume_3", "volume_16",
"volume_16", "volume_12", "volume_3", "volume_11", "volume_20",
"volume_3", "volume_12", "volume_12", "volume_79", "volume_52",
"volume_1", "volume_24", "volume_16", "volume_16", "volume_16",
"volume_7", "volume_80", "volume_1", "volume_16"), Country_1 = c(NA,
NA, NA, NA, NA, 92.6961421759861, NA, NA, NA, NA, 78.3001808318264,
NA, NA, NA, 106.832963501416, 0.613496932515337, 104.21011973735,
NA, NA, NA, NA, NA, 99.5238622522423, NA, NA), Country_2 = c(NA,
NA, 161.55950752394, NA, NA, NA, NA, NA, 59.1346153846154, NA,
NA, NA, NA, 128.113063407181, 93.3812839543959, NA, NA, NA, 137.724550898204,
NA, NA, NA, NA, 90.1602849510241, 37.3939722071828)), row.names = c(NA,
25L), class = "data.frame")
and I want to replace the NAs
with the values of a second dataframe rp2
(if exist) by same Country_
and Subarea
. For example if we see that Country_1
for Subarea SA16
is NA
in rp
we go to rp2
and replace it with 108.35096
rp2<-structure(list(TreatmenArea = c("TA_1", "TA_1", "TA_1", "TA_1",
"TA_2", "TA_2", "TA_2", "TA_3", "TA_4", "TA_4", "TA_5", "TA_5",
"TA_5", "TA_6", "TA_6", "TA_7", "TA_7", "TA_7", "TA_7", "TA_7",
"TA_7", "TA_7", "TA_7", "TA_8", "TA_9"), Subarea = c("SA_1",
"SA_2", "SA_3", "SA_4", "SA_5", "SA_6", "SA_7", "SA_8", "SA_10",
"SA_9", "SA_11", "SA_12", "SA_13", "SA_14", "SA_15", "SA_16",
"SA_17", "SA_18", "SA_19", "SA_20", "SA_21", "SA_22", "SA_23",
"SA_24", "SA_25"), Country_1 = c(101.37519256645, 105.268942332558,
100.49933368058, 104.531597221684, NA, 83.4404308144341, 86.2833044714836,
81.808967345926, 79.6786979951661, 77.6863475527052, NA, 78.3001808318264,
112.499238782021, 113.526674294436, NA, 108.350959378962, NA,
NA, 102.243471199266, NA, 104.323270355678, NA, NA, NA, 100),
Country_2 = c(98.7267717862572, 83.9572019653478, 97.164068306148,
103.654771613923, 161.55950752394, 75.4091957339533, 96.5255996196344,
99.8317785594128, 88.1477193135348, NA, NA, NA, 151.411687458963,
107.652477161141, NA, NA, NA, NA, 92.4695215620261, NA, 88.9890571623243,
39.1847909816402, 87.0642912470953, NA, NA)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -25L), groups = structure(list(
TreatmenArea = c("TA_1", "TA_2", "TA_3", "TA_4", "TA_5",
"TA_6", "TA_7", "TA_8", "TA_9"), .rows = structure(list(1:4,
5:7, 8L, 9:10, 11:13, 14:15, 16:23, 24L, 25L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))
I achieve it with:
library(dplyr)
left_join(rp, rp2, by = "Subarea", suffix = c("", ".y")) %>%
mutate(Country_1 = coalesce(Country_1, Country_1.y),
Country_2 = coalesce(Country_2, Country_2.y)) %>%
select(!ends_with(".y"))
But my actual dataset is going to be dynamic and instead of Country_1,Country_2
etc is going to have a numerous number of countries like England,France
etc. so I want to adapt the code above to read the dataset names whatever they be instead of Country_1
etc
CodePudding user response:
By bringing the data (i.e. rp
and rp2
) into long form (with respect to the countries) we can avoid having to specify the countries individually. But since there are multiple entries in rp
for the same value in Subarea
we have to create a unique ID (rowID
) first, so that we can bring the data into the original form at the end.
I am assuming that rp2
has only one row per unique value for Subarea
(as in the example you provided).
library(dplyr)
library(tidyr)
rp %>%
mutate(rowID = row_number()) %>%
pivot_longer(col = -c(rowID, ProductFamily, TreatmenArea, Subarea, Ind ), names_to = "Country", values_to = "Value_1") %>%
left_join(rp2 %>% pivot_longer(col = -c(TreatmenArea, Subarea), names_to = "Country", values_to = "Value_2")) %>%
mutate(value = coalesce(Value_1, Value_2)) %>%
select(-Value_1, -Value_2) %>%
pivot_wider(names_from = Country, values_from = value) %>%
select(-rowID)