Home > Back-end >  Dynamically subset large dataframe in R based on unique values in two columns into smaller uniquely
Dynamically subset large dataframe in R based on unique values in two columns into smaller uniquely

Time:01-24

I have a dataframe and I would like find a way of creating a number of smaller dataframes based on the combination of unique values in my original data frame.

In the example data below, I would like to find all the unique combinations of "SalesType" and "LeadSale" (in this example it is "Retail and Jessica", "Retail and Mark", "Market and Jessica" and "Market and Mark") and then create new dataframes with only that subsetted information (expected output below).

Input:

structure(list(BusinessDate = structure(c(1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800, 1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800, 1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800, 1672318800, 1672318800
), class = c("POSIXct", "POSIXt"), tzone = ""), ReportType = c("Sales", 
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales", 
"Sales", "Sales", "Sales", "Sales", "Sales", "Sales", "Sales", 
"Sales", "Sales", "Sales", "Sales", "Sales"), SalesType = c("Retail", 
"Retail", "Online", "Retail", "Online", "Online", 
"Market", "Market", "Retail", "Online", "Market", 
"Online", "Retail", "Retail", "Retail", "Retail", 
"Market", "Market", "Online", "Online"),  Currency = c("USD", "USD", "USD", "USD", "USD", "USD", 
"USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", "USD", 
"USD", "USD", "USD", "USD", "USD"), LeadSale = c("Mark", "Mark", 
"Mark", "Mark", "Jessica", "Jessica", "Jessica", "Mark", "Jessica", 
"Jessica", "Jessica", "Jessica", "Mark", "Mark", "Mark", "Mark", 
"Mark", "Mark", "Mark", "Mark"), Value = c(19, 189, 0, 
0, 236, 36, 81, 19, 
34, 12, 12, 12, 
45.5, 45.5, 45.5, 45.5, 
45.5, 45.5, 45.5, 0)), row.names = c(NA, 
20L), class = "data.frame")

Expected Output:

output_market_jessica <-
structure(list(BusinessDate = structure(c(1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800), tzone = "", class = c("POSIXct", 
"POSIXt")), ReportType = c("Sales", "Sales", "Sales", "Sales", 
"Sales", "Sales"), SalesType = c("Market", "Market", "Market", 
"Market", "Market", "Market"), Currency = c("USD", "USD", "USD", 
"USD", "USD", "USD"), LeadSale = c("Jessica", "Jessica", "Jessica", 
"Jessica", "Jessica", "Jessica"), Value = c(236, 36, 81, 12, 
12, 12)), row.names = c(NA, -6L), class = "data.frame")

output_market_mark <-
structure(list(BusinessDate = structure(c(1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800), tzone = "", class = c("POSIXct", 
"POSIXt")), ReportType = c("Sales", "Sales", "Sales", "Sales", 
"Sales", "Sales"), SalesType = c("Market", "Market", "Market", 
"Market", "Market", "Market"), Currency = c("USD", "USD", "USD", 
"USD", "USD", "USD"), LeadSale = c("Mark", "Mark", "Mark", "Mark", 
"Mark", "Mark"), Value = c(0, 19, 45.5, 45.5, 45.5, 0)), row.names = c(NA, 
-6L), class = "data.frame")

output_retail_jessica <-
structure(list(BusinessDate = structure(1672318800, tzone = "", class = c("POSIXct", 
"POSIXt")), ReportType = "Sales", SalesType = "Retail", Currency = "USD", 
    LeadSale = "Jessica", Value = 34), row.names = c(NA, -1L), class = "data.frame")

output_retail_mark <-
structure(list(BusinessDate = structure(c(1672318800, 1672318800, 
1672318800, 1672318800, 1672318800, 1672318800, 1672318800), tzone = "", class = c("POSIXct", 
"POSIXt")), ReportType = c("Sales", "Sales", "Sales", "Sales", 
"Sales", "Sales", "Sales"), SalesType = c("Retail", "Retail", 
"Retail", "Retail", "Retail", "Retail", "Retail"), Currency = c("USD", 
"USD", "USD", "USD", "USD", "USD", "USD"), LeadSale = c("Mark", 
"Mark", "Mark", "Mark", "Mark", "Mark", "Mark"), Value = c(19, 
189, 0, 45.5, 45.5, 45.5, 45.5)), row.names = c(NA, -7L), class = "data.frame")

CodePudding user response:

One approach. This is a generalized function (except subset_name) for any datframe and by-groups.

Note: The new dataframe are assigned to global environment.

get_subset_df <- function(df,by){
  keys <- df%>%
    group_by_at(by)%>%
    group_keys
  
  for(i in 1:nrow(keys)){
    subset_key <- keys[i,]
    subset_name <- paste(c("Output",keys[i,]),collapse="_")
    
    assign(subset_name,df%>%right_join(subset_key),envir=.GlobalEnv)
  }
}

get_subset_df(df,c("SalesType","LeadSale"))

ls()
[1] "df"                    "get_subset_df"         "Output_Market_Jessica"
[4] "Output_Market_Mark"    "Output_Online_Jessica" "Output_Online_Mark"   
[7] "Output_Retail_Jessica" "Output_Retail_Mark"   
  • Related