Sorry for asking the same type of question, but I just can't wrap my head around how to take my custom function and iterate it over grouped data. So bonus points if anyone can point me to some in-depth resources.
This works as intended:
library(dplyr)
a_df <- tibble(team = "A",
cluster = list(
c(0.01, 0.01, 0.09, 0.03, 0.14, 0.28, 0.09, 0.25, 0.18, 0.17, 0.54, 0.41, 0.16, 0.18, 0.25, 0.02, 0.2, 0.69, 0.02, 0.01, 0.02, 0.07, 0.07, 0.21)),
tot_matches = 20,
tot_ck = 121)
#Function that will take a list of xG and return the probability of n goals being scored.Normalized
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
#set seed only for stackoverflow question!!!!
set.seed(123)
#Start goal total at 0
goals_total <- 0
#Function to take an xG number and simulate whether it was a goal or not
xg_to_goals_sim <- function(shot_xg){
#Start goal count at 0
Goals <- 0
#For each shot, if it goes in, add a goal
for (shot in shot_xg){
if (runif(1)<=shot){
Goals <- Goals 1
}
}
#Finally, return the number of goals
return(Goals)
}
#Run xG calculator 10000 times to test winner %
sim_goal_list <- c()
# Create a for statement to populate the list
for (i in 1:10000) {
#Run the above formula for xG list
#But first we need to normalize the number of xG by randomly picking from xG
if(n_matches == norm_matches){
new_shot_xg_list <- shot_xg_list
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
if(n_matches < norm_matches){
new_shot_xg_list <- c(shot_xg_list,
sample(shot_xg_list,
round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
replace=FALSE))
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
if(n_matches > norm_matches){
new_shot_xg_list <- sample(shot_xg_list,
round(length(shot_xg_list)/n_matches*norm_matches,0),
replace=FALSE)
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
}
sim_goal_dat <- data.frame(value = unlist(sim_goal_list))
goal_prob <- sim_goal_dat %>%
count(value) %>%
summarise(goals = value,
prob = round(n/1000*10,1)) %>%
arrange(goals)
return(goal_prob)
}
#apply function to a single team dataframe (1 obs. of 4 variables)
calculate_goals_norm(shot_xg_list= unlist(a_df$cluster), n_matches = a_df$tot_matches, norm_matches = 20)
# A tibble: 12 x 2
goals prob
<dbl> <dbl>
1 0 0.6
2 1 4
3 2 12.4
4 3 20.6
5 4 23.8
6 5 19.1
7 6 11.6
8 7 5.6
9 8 1.8
10 9 0.4
11 10 0.1
12 12 0
Adding a second team to create a full_df
with the intended use of the custom function:
full_df <- add_row(a_df, tibble(team = "B",
cluster = list(
c(0.06, 0.01, 0.11, 0.18, 0.75, 0.04, 0.23, 0.07, 0.1, 0.05, 0.24, 0.12, 0.28, 0.02, 0.09, 0.16, 0.64, 0.03, 0.1, 0.19, 0.09, 0.01, 0.02, 0.12, 0.01, 0.11, 0.18, 0.05, 0.02, 0.8, 0.08)),
tot_matches = 19,
tot_ck = 83) )
#final function should look like this?
full_df %>%
group_by(team) %>%
calculate_goals_norm(shot_xg_list = unlist(cluster), n_matches = tot_matches, norm_matches = 20)
I'm not married to the custom function looking like the above example. I know apply()/sapply() is commonly used to iterate over dataframes, but again I'm not literate enough to know how to apply here.
Thanks for your help.
CodePudding user response:
We may use group_modify
here
library(dplyr)
out <- full_df %>%
group_by(team) %>%
group_modify(~with(.x,
calculate_goals_norm(shot_xg_list = unlist(cluster),
n_matches = tot_matches, norm_matches = 20))) %>%
ungroup
-output
> as.data.frame(out)
team goals prob
1 A 0 0.6
2 A 1 4.0
3 A 2 12.4
4 A 3 20.6
5 A 4 23.8
6 A 5 19.0
7 A 6 11.6
8 A 7 5.6
9 A 8 1.8
10 A 9 0.4
11 A 10 0.1
12 A 12 0.0
13 B 0 0.0
14 B 1 0.8
15 B 2 3.7
16 B 3 11.1
17 B 4 18.6
18 B 5 22.4
19 B 6 19.6
20 B 7 12.4
21 B 8 6.9
22 B 9 2.9
23 B 10 1.1
24 B 11 0.3
25 B 12 0.1
26 B 14 0.0
Note that the function calculate_goals_norm
is giving errors at the
sim_goal_list <- as_tibble(sim_goal_list)
So, it is modified to
sim_goal_dat <- data.frame(value = unlist(sim_goal_list));
-full function
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches){
#set seed only for stackoverflow question!!!!
set.seed(123)
#Start goal total at 0
goals_total <- 0
#Function to take an xG number and simulate whether it was a goal or not
xg_to_goals_sim <- function(shot_xg){
#Start goal count at 0
Goals <- 0
#For each shot, if it goes in, add a goal
for (shot in shot_xg){
if (runif(1)<=shot){
Goals <- Goals 1
}
}
#Finally, return the number of goals
return(Goals)
}
#Run xG calculator 10000 times to test winner %
sim_goal_list <- c()
# Create a for statement to populate the list
for (i in 1:10000) {
#Run the above formula for xG list
#But first we need to normalize the number of xG by randomly picking from xG
if(n_matches == norm_matches){
new_shot_xg_list <- shot_xg_list
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
if(n_matches < norm_matches){
new_shot_xg_list <- c(shot_xg_list,
sample(shot_xg_list,
round(length(shot_xg_list)/n_matches*norm_matches,0)-length(shot_xg_list),
replace=FALSE))
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
if(n_matches > norm_matches){
new_shot_xg_list <- sample(shot_xg_list,
round(length(shot_xg_list)/n_matches*norm_matches,0),
replace=FALSE)
goals_total <- xg_to_goals_sim(new_shot_xg_list)
sim_goal_list[[i]] <- goals_total
}
}
sim_goal_dat <- data.frame(value = unlist(sim_goal_list))
goal_prob <- sim_goal_dat %>%
count(value) %>%
summarise(goals = value,
prob = round(n/1000*10,1)) %>%
arrange(goals)
return(goal_prob)
}