Adapt code to work with dynamic number and name of dataframe columns instead of static ones-CodePudding

I have the data frame below:

# Example data `rp`: a plain data.frame with 25 rows. The first four columns
# (ProductFamily, TreatmenArea, Subarea, Ind) are character identifiers;
# Country_1 and Country_2 are numeric and mostly NA. Subarea values repeat
# across rows (e.g. "SA_6"), so Subarea alone does not identify a row.
rp<-structure(list(ProductFamily = c("PF_1", "PF_10", "PF_10", "PF_100", 
"PF_100", "PF_101", "PF_102", "PF_102", "PF_102", "PF_102", "PF_103", 
"PF_103", "PF_104", "PF_105", "PF_106", "PF_106", "PF_106", "PF_106", 
"PF_107", "PF_108", "PF_109", "PF_11", "PF_110", "PF_110", "PF_111"
), TreatmenArea = c("TA_7", "TA_2", "TA_2", "TA_6", "TA_6", "TA_2", 
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", 
"TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_2", "TA_7", 
"TA_2", "TA_2", "TA_7"), Subarea = c("SA_16", "SA_5", "SA_5", 
"SA_15", "SA_15", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", 
"SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", "SA_6", 
"SA_6", "SA_6", "SA_17", "SA_6", "SA_6", "SA_22"), Ind = c("volume_1", 
"volume_5", "volume_6", "volume_52", "volume_3", "volume_16", 
"volume_16", "volume_12", "volume_3", "volume_11", "volume_20", 
"volume_3", "volume_12", "volume_12", "volume_79", "volume_52", 
"volume_1", "volume_24", "volume_16", "volume_16", "volume_16", 
"volume_7", "volume_80", "volume_1", "volume_16"), Country_1 = c(NA, 
NA, NA, NA, NA, 92.6961421759861, NA, NA, NA, NA, 78.3001808318264, 
NA, NA, NA, 106.832963501416, 0.613496932515337, 104.21011973735, 
NA, NA, NA, NA, NA, 99.5238622522423, NA, NA), Country_2 = c(NA, 
NA, 161.55950752394, NA, NA, NA, NA, NA, 59.1346153846154, NA, 
NA, NA, NA, 128.113063407181, 93.3812839543959, NA, NA, NA, 137.724550898204, 
NA, NA, NA, NA, 90.1602849510241, 37.3939722071828)), row.names = c(NA, 
25L), class = "data.frame")

and I want to replace the NAs with the values from a second data frame, rp2 (where they exist), matching on the same country column and Subarea. For example, if Country_1 for Subarea SA_16 is NA in rp, we look it up in rp2 and replace it with 108.35096.

# Lookup data `rp2`: a tibble with 25 rows, grouped by TreatmenArea (class
# "grouped_df"). Each Subarea value appears exactly once. Country_1 and
# Country_2 are numeric and may themselves be NA for some Subareas.
rp2<-structure(list(TreatmenArea = c("TA_1", "TA_1", "TA_1", "TA_1", 
"TA_2", "TA_2", "TA_2", "TA_3", "TA_4", "TA_4", "TA_5", "TA_5", 
"TA_5", "TA_6", "TA_6", "TA_7", "TA_7", "TA_7", "TA_7", "TA_7", 
"TA_7", "TA_7", "TA_7", "TA_8", "TA_9"), Subarea = c("SA_1", 
"SA_2", "SA_3", "SA_4", "SA_5", "SA_6", "SA_7", "SA_8", "SA_10", 
"SA_9", "SA_11", "SA_12", "SA_13", "SA_14", "SA_15", "SA_16", 
"SA_17", "SA_18", "SA_19", "SA_20", "SA_21", "SA_22", "SA_23", 
"SA_24", "SA_25"), Country_1 = c(101.37519256645, 105.268942332558, 
100.49933368058, 104.531597221684, NA, 83.4404308144341, 86.2833044714836, 
81.808967345926, 79.6786979951661, 77.6863475527052, NA, 78.3001808318264, 
112.499238782021, 113.526674294436, NA, 108.350959378962, NA, 
NA, 102.243471199266, NA, 104.323270355678, NA, NA, NA, 100), 
    Country_2 = c(98.7267717862572, 83.9572019653478, 97.164068306148, 
    103.654771613923, 161.55950752394, 75.4091957339533, 96.5255996196344, 
    99.8317785594128, 88.1477193135348, NA, NA, NA, 151.411687458963, 
    107.652477161141, NA, NA, NA, NA, 92.4695215620261, NA, 88.9890571623243, 
    39.1847909816402, 87.0642912470953, NA, NA)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -25L), groups = structure(list(
    TreatmenArea = c("TA_1", "TA_2", "TA_3", "TA_4", "TA_5", 
    "TA_6", "TA_7", "TA_8", "TA_9"), .rows = structure(list(1:4, 
        5:7, 8L, 9:10, 11:13, 14:15, 16:23, 24L, 25L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))

I achieve it with:

library(dplyr)

# Attach rp2's columns to rp by Subarea; rp2's clashing columns get a ".y"
# suffix while rp's keep their original names.
rp %>%
  left_join(rp2, by = "Subarea", suffix = c("", ".y")) %>%
  # Keep rp's value where present, otherwise fall back to rp2's value.
  mutate(
    Country_1 = coalesce(Country_1, Country_1.y),
    Country_2 = coalesce(Country_2, Country_2.y)
  ) %>%
  # Discard the helper columns that came from rp2.
  select(-ends_with(".y"))

But my actual dataset is going to be dynamic: instead of Country_1, Country_2, etc., it will have an arbitrary number of country columns with names like England, France, etc. So I want to adapt the code above to work with whatever the country column names are, instead of hard-coding Country_1 etc.

CodePudding user response：

By bringing the data (i.e. rp and rp2) into long form (with respect to the countries) we can avoid having to specify the countries individually. But since there are multiple entries in rp for the same value in Subarea we have to create a unique ID (rowID) first, so that we can bring the data into the original form at the end.

I am assuming that rp2 has only one row per unique value for Subarea (as in the example you provided).

library(dplyr)
library(tidyr)
# Fill NAs in rp's country columns from rp2, matching on Subarea and country
# name, without naming any country column explicitly.
rp %>%
  # Subarea repeats in rp, so tag every row with an ID to be able to restore
  # the original wide layout at the end.
  mutate(rowID = row_number()) %>%
  # Every column that is not an identifier is treated as a country column.
  pivot_longer(
    cols = -c(rowID, ProductFamily, TreatmenArea, Subarea, Ind),
    names_to = "Country",
    values_to = "Value_1"
  ) %>%
  left_join(
    rp2 %>%
      # rp2 is grouped by TreatmenArea; ungroup so the column can be dropped.
      ungroup() %>%
      # Drop TreatmenArea so the lookup is keyed by Subarea (and Country) only.
      select(-TreatmenArea) %>%
      pivot_longer(
        cols = -Subarea,
        names_to = "Country",
        values_to = "Value_2"
      ),
    # Explicit keys: avoids the implicit natural join (which also matched on
    # TreatmenArea) and the "Joining with `by = ...`" message.
    by = c("Subarea", "Country")
  ) %>%
  # Prefer rp's own value; use rp2's only where rp has NA.
  mutate(value = coalesce(Value_1, Value_2)) %>%
  select(-Value_1, -Value_2) %>%
  pivot_wider(names_from = Country, values_from = value) %>%
  select(-rowID)