R: Creating a Function For Calculating Conditional Probabilities-CodePudding

I am working with the R programming language.

I have the following data - suppose this contains the "exam results" for different students (same ID corresponds to the same student) taken at different times:

id = sample.int(10000, 100000, replace = TRUE)
res = c("PASS", "FAIL")
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)

my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]

      id results date_exam_taken
43894  1    FAIL      2001-06-18
31309  1    FAIL      2001-10-21
1996   1    FAIL      2004-08-21
76256  1    PASS      2004-10-13
14043  1    PASS      2005-05-11
38423  1    FAIL      2006-06-10

I want to answer the following question - based on this data, given a that student failed their 3rd exam, what is the probability that a student will pass their 4th exam and what is the probability that this student will fail their 4th exam?

In other words - given the result of the nth exam, what is the probability of pass/fail their n 1 th exam?

I tried to answer this in the following way:

my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL

third_exam = my_data[which(my_data$exam_number == 3), ]
third_exam = third_exam[which(third_exam$results == "FAIL"), ]
fourth_exam = my_data[which(my_data$exam_number == 4), ]

merged = merge(x = third_exam, y = fourth_exam, by = "id", all = TRUE)
merged = na.omit(merged)

pass = merged[merged$results.x == 'FAIL' & merged$results.y  == "PASS", ]
fail = merged[merged$results.x == 'FAIL' & merged$results.y  == "FAIL", ]

pass_prob = nrow(pass)/(nrow(pass)   nrow(fail))
fail_prob = nrow(fail)/(nrow(pass)   nrow(fail))

I tried to make this into a function for the future:

my_function <- function(current_exam, next_exam, result_of_current_exam)
    
{

my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
    
    c_exam = my_data[which(my_data$exam_number == current_exam), ]
    c_exam = c_exam[which(c_exam$results == result_of_current_exam), ]
    n_exam = my_data[which(my_data$exam_number == next_exam), ]
    
    merged = merge(x = c_exam, y = n_exam, by = "id", all = TRUE)
    merged = na.omit(merged)
    
    pass = merged[merged$results.x == result_of_current_exam & merged$results.y  == "PASS", ]
    fail = merged[merged$results.x == result_of_current_exam & merged$results.y  == "FAIL", ]
    
    pass_prob = nrow(pass)/(nrow(pass)   nrow(fail))
    fail_prob = nrow(fail)/(nrow(pass)   nrow(fail))
    
    return(c(pass_prob, fail_prob))
    
}

Now to call the function - given a student passed the third exam, what are the probabilities of passing and failing the fourth exam?

> my_function("3","4", "PASS")
[1] 0.5126595 0.4873405

I am now trying to run this function for all consecutive combinations (e.g. probabilities for the results of 2nd exam given 1st, probabilities for the results of 3rd exam given 2nd, probabilities for the results of 4th exam given 3rd, etc.).
I would also be interested in extending this function - given the results of the first and the second exam (e.g. FAIL, FAIL), what are the probabilities for the results of the third exam?

Is there a quick way to apply my function (assuming I have written this function correctly) for all these combinations?

Can someone please show me how to do this correctly?

Thanks!

CodePudding user response：

I attempted your first query regarding running the function for all consecutive combinations. Hope this is useful

current<-unique(my_data$exam_number)
next_ex<-current[-1]
current<-current[-length(current)]

library(tidyverse)
pmap(list(.x=current,.y=next_ex),
     ~my_function(current_exam=.x,
                  next_exam=.y,
                  result_of_current_exam="PASS"))

pmap(list(.x=current,.y=next_ex),
     ~my_function(current_exam=.x,
                  next_exam=.y,
                  result_of_current_exam="FAIL"))

CodePudding user response：

Using data.table, we can simply shift the results and aggregate by test number:

library(data.table)

setkey(dt, id, date_exam_taken)[
  ,`:=`(next_result = shift(results, -1) == "PASS", exam_num = 1:.N), id
][
  ,.(prob_pass_next = mean(next_result, na.rm = TRUE), samples = sum(!is.na(next_result))), exam_num
]
#>     exam_num prob_pass_next samples
#>  1:        1      0.5023009    9996
#>  2:        2      0.4975942    9976
#>  3:        3      0.5000000    9888
#>  4:        4      0.5015445    9712
#>  5:        5      0.5084999    9353
#>  6:        6      0.5005727    8730
#>  7:        7      0.4981422    7805
#>  8:        8      0.4960108    6643
#>  9:        9      0.5122585    5384
#> 10:       10      0.5061283    4161
#> 11:       11      0.5117046    3033
#> 12:       12      0.4874396    2070
#> 13:       13      0.4886113    1361
#> 14:       14      0.4888628     853
#> 15:       15      0.4990215     511
#> 16:       16      0.5095057     263
#> 17:       17      0.4809160     131
#> 18:       18      0.6086957      69
#> 19:       19      0.5128205      39
#> 20:       20      0.6666667      15
#> 21:       21      1.0000000       3
#> 22:       22      0.5000000       2
#> 23:       23      0.0000000       2
#> 24:       24      0.0000000       1
#> 25:       25            NaN       0
#>     exam_num prob_pass_next samples

Data:

set.seed(1238818837)

dt <- data.table(
  id = sample.int(10000, 100000, replace = TRUE),
  results = sample(c("PASS", "FAIL"), 100000, replace = TRUE),
  date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
)