Home > other >  How do I take a custom function and use it with dplyr group_by in r?
How do I take a custom function and use it with dplyr group_by in r?

Time:03-09

Sorry for asking the same type of question, but I just can't wrap my head around how to take my custom function and iterate it over grouped data. So bonus points if anyone can point me to some in-depth resources.

This works as intended:

library(dplyr)

a_df <- tibble(team = "A",
             cluster = list(
               c(0.01, 0.01, 0.09, 0.03, 0.14, 0.28, 0.09, 0.25, 0.18, 0.17, 0.54, 0.41, 0.16, 0.18, 0.25, 0.02, 0.2, 0.69, 0.02, 0.01, 0.02, 0.07, 0.07, 0.21)),
             tot_matches = 20,
             tot_ck = 121)

#Function that will take a list of xG and return the probability of n goals being scored.Normalized
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
  
  #set seed only for stackoverflow question!!!!
  set.seed(123)
  
  #Start goal total at 0
  goals_total <- 0
  
  #Function to take an xG number and simulate whether it was a goal or not
  xg_to_goals_sim <- function(shot_xg){

    #Start goal count at 0
    Goals <- 0

    #For each shot, if it goes in, add a goal
    for (shot in shot_xg){
      if (runif(1)<=shot){
        Goals <- Goals   1
      }
    }

    #Finally, return the number of goals
    return(Goals)

  }
  
  #Run xG calculator 10000 times to test winner %
  sim_goal_list <- c()
  # Create a for statement to populate the list
  for (i in 1:10000) {
    #Run the above formula for xG list
    #But first we need to normalize the number of xG by randomly picking from xG
    
    if(n_matches == norm_matches){
      new_shot_xg_list <- shot_xg_list
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches < norm_matches){
      new_shot_xg_list <- c(shot_xg_list, 
                            sample(shot_xg_list, 
                                   round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
                                   replace=FALSE))
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches > norm_matches){
      
      new_shot_xg_list <- sample(shot_xg_list, 
                                 round(length(shot_xg_list)/n_matches*norm_matches,0),
                                 replace=FALSE)
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
  }
  
  
  sim_goal_dat <- data.frame(value = unlist(sim_goal_list))
  
  goal_prob <- sim_goal_dat %>% 
    count(value) %>% 
    summarise(goals = value,
              prob = round(n/1000*10,1)) %>% 
    arrange(goals)
  
  return(goal_prob)
  
  
}

#apply function to a single team dataframe (1 obs. of 4 variables)
calculate_goals_norm(shot_xg_list= unlist(a_df$cluster), n_matches = a_df$tot_matches, norm_matches = 20)

# A tibble: 12 x 2
   goals  prob
   <dbl> <dbl>
 1     0   0.6
 2     1   4  
 3     2  12.4
 4     3  20.6
 5     4  23.8
 6     5  19.1
 7     6  11.6
 8     7   5.6
 9     8   1.8
10     9   0.4
11    10   0.1
12    12   0 

Adding a second team to create a full_df with the intended use of the custom function:

full_df <- add_row(a_df, tibble(team = "B",
                                cluster = list(
                                  c(0.06, 0.01, 0.11, 0.18, 0.75, 0.04, 0.23, 0.07, 0.1, 0.05, 0.24, 0.12, 0.28, 0.02, 0.09, 0.16, 0.64, 0.03, 0.1, 0.19, 0.09, 0.01, 0.02, 0.12, 0.01, 0.11, 0.18, 0.05, 0.02, 0.8, 0.08)),
                                tot_matches = 19,
                                tot_ck = 83) )

#final function should look like this?
full_df %>% 
  group_by(team) %>% 
  calculate_goals_norm(shot_xg_list = unlist(cluster), n_matches = tot_matches, norm_matches = 20)

I'm not married to the custom function looking like the above example. I know apply()/sapply() is commonly used to iterate over dataframes, but again I'm not literate enough to know how to apply here.

Thanks for your help.

CodePudding user response:

We may use group_modify here

library(dplyr)
out <- full_df %>% 
  group_by(team) %>% 
  group_modify(~with(.x, 
    calculate_goals_norm(shot_xg_list = unlist(cluster), 
           n_matches = tot_matches, norm_matches = 20))) %>%
  ungroup

-output

> as.data.frame(out)
   team goals prob
1     A     0  0.6
2     A     1  4.0
3     A     2 12.4
4     A     3 20.6
5     A     4 23.8
6     A     5 19.0
7     A     6 11.6
8     A     7  5.6
9     A     8  1.8
10    A     9  0.4
11    A    10  0.1
12    A    12  0.0
13    B     0  0.0
14    B     1  0.8
15    B     2  3.7
16    B     3 11.1
17    B     4 18.6
18    B     5 22.4
19    B     6 19.6
20    B     7 12.4
21    B     8  6.9
22    B     9  2.9
23    B    10  1.1
24    B    11  0.3
25    B    12  0.1
26    B    14  0.0

Note that the function calculate_goals_norm is giving errors at the

sim_goal_list <- as_tibble(sim_goal_list)

So, it is modified to

sim_goal_dat <-  data.frame(value = unlist(sim_goal_list));

-full function


calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
  
  #set seed only for stackoverflow question!!!!
  set.seed(123)
  
  #Start goal total at 0
  goals_total <- 0
  
  #Function to take an xG number and simulate whether it was a goal or not
  xg_to_goals_sim <- function(shot_xg){

    #Start goal count at 0
    Goals <- 0

    #For each shot, if it goes in, add a goal
    for (shot in shot_xg){
      if (runif(1)<=shot){
        Goals <- Goals   1
      }
    }

    #Finally, return the number of goals
    return(Goals)

  }
  
  #Run xG calculator 10000 times to test winner %
  sim_goal_list <- c()
  # Create a for statement to populate the list
  for (i in 1:10000) {
    #Run the above formula for xG list
    #But first we need to normalize the number of xG by randomly picking from xG
    
    if(n_matches == norm_matches){
      new_shot_xg_list <- shot_xg_list
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches < norm_matches){
      new_shot_xg_list <- c(shot_xg_list, 
                            sample(shot_xg_list, 
                                   round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
                                   replace=FALSE))
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
    if(n_matches > norm_matches){
      
      new_shot_xg_list <- sample(shot_xg_list, 
                                 round(length(shot_xg_list)/n_matches*norm_matches,0),
                                 replace=FALSE)
      goals_total <- xg_to_goals_sim(new_shot_xg_list)
      sim_goal_list[[i]] <- goals_total
    }
    
  }
  
  sim_goal_dat <-  data.frame(value = unlist(sim_goal_list))
  
    goal_prob <- sim_goal_dat %>% 
      count(value) %>% 
      summarise(goals = value,
                prob = round(n/1000*10,1)) %>% 
      arrange(goals)
  
    return(goal_prob)
  
  
  
  
  
  
}
  • Related