Manipulate function that can be used for any store in R-CodePudding

I have 100K stores that have different column & my task is to find the best sales from each store. My function is like this:

library(qcc)
library(dplyr)
library(purrr)
best.sales <- function(x){
    y <- x$sales
    best.sales.detection <- qcc(as.ts(y), type = "xbar.one", plot = F)
    best.sales.detection.data <- data.frame(ind = best.sales.detection$violations$beyond.limits, sales = y[best.sales.detection$violations$beyond.limits])
    best.sales.detection.conf <- best.sales.detection$limits[2]
    best.sales.detection.indeks <- subset(best.sales.detection.data, sales > best.sales.detection.conf)$ind
    best.sales.detection.result <- x[best.sales.detection.indeks,]
    looping <- best.sales.detection.result
}

I use 2 stores as sample for this question.

Store 1

# Store 1
datex <- rep("2021-11-01", 48)
hourx <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
employee <- c("john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt","john","john","john","john","john","john","matt","matt","matt","matt","matt","matt")
country <- c("kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore")
product <- c("shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","shirt","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant","pant")
sales <- c(233,163,685,183,5115,169,1276,239,278,393,3685,176,185,113,5967,320,115,136,1139,259,2960,561,300,2755,307,192,128,173,224,4973,673,5337,500,799,852,774,445,2103,577,3817,452,437,1811,170,740,1551,300,234)
store1 <- data.frame(datex, hourx, employee, country, product, sales)

head(store1)
#       datex hourx employee      country product sales
#1 2021-11-01     0     john kuala lumpur   shirt   233
#2 2021-11-01     1     john kuala lumpur   shirt   163
#3 2021-11-01     2     john kuala lumpur   shirt   685
#4 2021-11-01     3     john kuala lumpur   shirt   183
#5 2021-11-01     4     john kuala lumpur   shirt  5115
#6 2021-11-01     5     john kuala lumpur   shirt   169

Now we find the best sales using this function:

# store sales solution
store.sales <- function(mydata){
  mydata.sales <- na.omit(mydata)
  mydata.sales %>%
    group_by(across(employee:product)) %>%
    ungroup -> mydata.sales2
  mydata.sales2 %>%
    group_split(across(employee:product)) %>%
    map_df(best.sales) -> myresult.sales
  myresult.sales
}
mystore1 <- store.sales(store1)
mystore1

# A tibble: 1 × 6
#  datex      hourx employee country      product sales
#  <chr>      <dbl> <chr>    <chr>        <chr>   <dbl>
#1 2021-11-01     5 john     kuala lumpur pant     4973

Store 2

datex <- rep("2021-11-01", 24)
hourx <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
employee <- c("max","max","max","max","max","max","coco","coco","coco","coco","coco","coco","max","max","max","max","max","max","coco","coco","coco","coco","coco","coco")
country <- c("kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","kuala lumpur","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore","singapore")
sales <- c(350,245,1028,275,1673,254,2914,359,417,590,2528,264,278,170,2951,480,173,204,709,389,440,842,450,7133)
store2 <- data.frame(datex, hourx, employee, country, sales)

head(store2)
#       datex hourx employee      country sales
#1 2021-11-01     0      max kuala lumpur   350
#2 2021-11-01     1      max kuala lumpur   245
#3 2021-11-01     2      max kuala lumpur  1028
#4 2021-11-01     3      max kuala lumpur   275
#5 2021-11-01     4      max kuala lumpur  1673
#6 2021-11-01     5      max kuala lumpur   254

Now we find the best sales using this function:

store.sales <- function(mydata){
  mydata.sales <- na.omit(mydata)
  mydata.sales %>% # Cleansing
    group_by(across(employee:country)) %>%
    ungroup -> mydata.sales2
  mydata.sales2 %>% # Running
    group_split(across(employee:country)) %>%
    map_df(best.sales) -> myresult.sales
  myresult.sales
}
mystore2 <- store.sales(store2)
mystore2

# A tibble: 1 × 5
#  datex      hourx employee country   sales
#  <chr>      <dbl> <chr>    <chr>     <dbl>
#1 2021-11-01     5 coco     singapore  7133

As you can see, beside column datex, hourx & sales, Store1 has 3 columns (employee, country & product) but Store2 only has 2 columns (employee & country). Every store that i have is having different columns, sometime has 10 columns. Now, how do i manipulate line across(the first column: the last column) in store.sales function so it can be used for all store that i have?. Thanks in Advanced.

CodePudding user response：

Solution 1:

If you know what columns in every data frame you don't want to include, you can exclude them as follows.

store.sales_test1 <- function(mydata){
  mydata.sales <- na.omit(mydata)
  mydata.sales %>% # Cleansing
    group_by(across(!c(datex, hourx, sales))) %>%
    ungroup -> mydata.sales2
  mydata.sales2 %>% # Running
    group_split(across(!c(datex, hourx, sales))) %>%
    map_df(best.sales) -> myresult.sales
  myresult.sales
}

store.sales_test1(store1)
# # A tibble: 1 x 6
#   datex      hourx employee country      product sales
#   <chr>      <dbl> <chr>    <chr>        <chr>   <dbl>
# 1 2021-11-01     5 john     kuala lumpur pant     4973

store.sales_test1(store2)
# # A tibble: 1 x 5
#   datex      hourx employee country   sales
#   <chr>      <dbl> <chr>    <chr>     <dbl>
# 1 2021-11-01     5 coco     singapore  7133

Solution 2

If you know the first and last column name you want to include, you can include them as follows.

store.sales_test2 <- function(mydata, col1, col2){
  mydata.sales <- na.omit(mydata)
  cols <- names(mydata)
  cols_select <- tidyselect::vars_select(cols, col1:col2)
  mydata.sales %>% # Cleansing
    group_by(across(all_of(cols_select))) %>%
    ungroup -> mydata.sales2
  mydata.sales2 %>% # Running
    group_split(across(all_of(cols_select))) %>%
    map_df(best.sales) -> myresult.sales
  myresult.sales
}

store.sales_test2(store1, col1 = "employee", col2 = "product")
# # A tibble: 1 x 6
#   datex      hourx employee country      product sales
#   <chr>      <dbl> <chr>    <chr>        <chr>   <dbl>
# 1 2021-11-01     5 john     kuala lumpur pant     4973

store.sales_test2(store2, col1 = "employee", col2 = "country")
# # A tibble: 1 x 5
#   datex      hourx employee country   sales
#   <chr>      <dbl> <chr>    <chr>     <dbl>
# 1 2021-11-01     5 coco     singapore  7133

Solution 3

If you know the first and last column index you want to include, you can include them as follows.

store.sales_test3 <- function(mydata, firstcol = 3, lastcol = 1){
  mydata.sales <- na.omit(mydata)
  mydata.sales %>% # Cleansing
    group_by(across(firstcol:last_col(lastcol))) %>%
    ungroup -> mydata.sales2
  mydata.sales2 %>% # Running
    group_split(across(firstcol:last_col(lastcol))) %>%
    map_df(best.sales) -> myresult.sales
  myresult.sales
}

store.sales_test3(store1)
# # A tibble: 1 x 6
#   datex      hourx employee country      product sales
#   <chr>      <dbl> <chr>    <chr>        <chr>   <dbl>
# 1 2021-11-01     5 john     kuala lumpur pant     4973

store.sales_test3(store2)
# # A tibble: 1 x 5
#   datex      hourx employee country   sales
#   <chr>      <dbl> <chr>    <chr>     <dbl>
# 1 2021-11-01     5 coco     singapore  7133