I have 100K stores that have different column & my task is to find the best sales from each store. My function is like this:
library(qcc)
library(dplyr)
library(purrr)
best.sales <- function(x){
y <- x$sales
best.sales.detection <- qcc(as.ts(y), type = "xbar.one", plot = F)
best.sales.detection.data <- data.frame(ind = best.sales.detection$violations$beyond.limits, sales = y[best.sales.detection$violations$beyond.limits])
best.sales.detection.conf <- best.sales.detection$limits[2]
best.sales.detection.indeks <- subset(best.sales.detection.data, sales > best.sales.detection.conf)$ind
best.sales.detection.result <- x[best.sales.detection.indeks,]
looping <- best.sales.detection.result
}
I use 2 stores as sample for this question.
- Store 1
# Store 1
datex <- rep("2021-11-01", 48)
hourx <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
employee <- c("john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt")
country <- c("kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore")
product <- c("shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant")
sales <- c(233,163,685,183,5115,169,1276,239,278,393,3685,176,185,113,5967,320,115,136,1139,259,2960,561,300,2755,307,192,128,173,224,4973,673,5337,500,799,852,774,445,2103,577,3817,452,437,1811,170,740,1551,300,234)
store1 <- data.frame(datex, hourx, employee, country, product, sales)
head(store1)
# datex hourx employee country product sales
#1 2021-11-01 0 john kuala lumpur shirt 233
#2 2021-11-01 1 john kuala lumpur shirt 163
#3 2021-11-01 2 john kuala lumpur shirt 685
#4 2021-11-01 3 john kuala lumpur shirt 183
#5 2021-11-01 4 john kuala lumpur shirt 5115
#6 2021-11-01 5 john kuala lumpur shirt 169
Now we find the best sales using this function:
# store sales solution
store.sales <- function(mydata){
mydata.sales <- na.omit(mydata)
mydata.sales %>%
group_by(across(employee:product)) %>%
ungroup -> mydata.sales2
mydata.sales2 %>%
group_split(across(employee:product)) %>%
map_df(best.sales) -> myresult.sales
myresult.sales
}
mystore1 <- store.sales(store1)
mystore1
# A tibble: 1 × 6
# datex hourx employee country product sales
# <chr> <dbl> <chr> <chr> <chr> <dbl>
#1 2021-11-01 5 john kuala lumpur pant 4973
- Store 2
datex <- rep("2021-11-01", 24)
hourx <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
employee <- c("max","max","max","max","max","max","coco","coco","coco","coco","coco","coco","max","max","max","max","max","max","coco","coco","coco","coco","coco","coco")
country <- c("kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore")
sales <- c(350,245,1028,275,1673,254,2914,359,417,590,2528,264,278,170,2951,480,173,204,709,389,440,842,450,7133)
store2 <- data.frame(datex, hourx, employee, country, sales)
head(store2)
# datex hourx employee country sales
#1 2021-11-01 0 max kuala lumpur 350
#2 2021-11-01 1 max kuala lumpur 245
#3 2021-11-01 2 max kuala lumpur 1028
#4 2021-11-01 3 max kuala lumpur 275
#5 2021-11-01 4 max kuala lumpur 1673
#6 2021-11-01 5 max kuala lumpur 254
Now we find the best sales using this function:
store.sales <- function(mydata){
mydata.sales <- na.omit(mydata)
mydata.sales %>% # Cleansing
group_by(across(employee:country)) %>%
ungroup -> mydata.sales2
mydata.sales2 %>% # Running
group_split(across(employee:country)) %>%
map_df(best.sales) -> myresult.sales
myresult.sales
}
mystore2 <- store.sales(store2)
mystore2
# A tibble: 1 × 5
# datex hourx employee country sales
# <chr> <dbl> <chr> <chr> <dbl>
#1 2021-11-01 5 coco singapore 7133
As you can see, beside column datex, hourx & sales, Store1 has 3 columns (employee, country & product) but Store2 only has 2 columns (employee & country). Every store that i have is having different columns, sometime has 10 columns. Now, how do i manipulate line across(the first column: the last column) in store.sales function so it can be used for all store that i have?. Thanks in Advanced.
CodePudding user response:
Solution 1:
If you know what columns in every data frame you don't want to include, you can exclude them as follows.
store.sales_test1 <- function(mydata){
mydata.sales <- na.omit(mydata)
mydata.sales %>% # Cleansing
group_by(across(!c(datex, hourx, sales))) %>%
ungroup -> mydata.sales2
mydata.sales2 %>% # Running
group_split(across(!c(datex, hourx, sales))) %>%
map_df(best.sales) -> myresult.sales
myresult.sales
}
store.sales_test1(store1)
# # A tibble: 1 x 6
# datex hourx employee country product sales
# <chr> <dbl> <chr> <chr> <chr> <dbl>
# 1 2021-11-01 5 john kuala lumpur pant 4973
store.sales_test1(store2)
# # A tibble: 1 x 5
# datex hourx employee country sales
# <chr> <dbl> <chr> <chr> <dbl>
# 1 2021-11-01 5 coco singapore 7133
Solution 2
If you know the first and last column name you want to include, you can include them as follows.
store.sales_test2 <- function(mydata, col1, col2){
mydata.sales <- na.omit(mydata)
cols <- names(mydata)
cols_select <- tidyselect::vars_select(cols, col1:col2)
mydata.sales %>% # Cleansing
group_by(across(all_of(cols_select))) %>%
ungroup -> mydata.sales2
mydata.sales2 %>% # Running
group_split(across(all_of(cols_select))) %>%
map_df(best.sales) -> myresult.sales
myresult.sales
}
store.sales_test2(store1, col1 = "employee", col2 = "product")
# # A tibble: 1 x 6
# datex hourx employee country product sales
# <chr> <dbl> <chr> <chr> <chr> <dbl>
# 1 2021-11-01 5 john kuala lumpur pant 4973
store.sales_test2(store2, col1 = "employee", col2 = "country")
# # A tibble: 1 x 5
# datex hourx employee country sales
# <chr> <dbl> <chr> <chr> <dbl>
# 1 2021-11-01 5 coco singapore 7133
Solution 3
If you know the first and last column index you want to include, you can include them as follows.
store.sales_test3 <- function(mydata, firstcol = 3, lastcol = 1){
mydata.sales <- na.omit(mydata)
mydata.sales %>% # Cleansing
group_by(across(firstcol:last_col(lastcol))) %>%
ungroup -> mydata.sales2
mydata.sales2 %>% # Running
group_split(across(firstcol:last_col(lastcol))) %>%
map_df(best.sales) -> myresult.sales
myresult.sales
}
store.sales_test3(store1)
# # A tibble: 1 x 6
# datex hourx employee country product sales
# <chr> <dbl> <chr> <chr> <chr> <dbl>
# 1 2021-11-01 5 john kuala lumpur pant 4973
store.sales_test3(store2)
# # A tibble: 1 x 5
# datex hourx employee country sales
# <chr> <dbl> <chr> <chr> <dbl>
# 1 2021-11-01 5 coco singapore 7133