Home > Back-end >  aggregate data and calculate confidence intervals for sub cohort in R
aggregate data and calculate confidence intervals for sub cohort in R

Time:08-30

I have a dataset

testData <- structure(list(group = c("Group1", "Group1", "Group1", "Group1", 
                                 "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", 
                                 "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", 
                                 "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", "Group1", 
                                 "Group1", "Group1", "Group1", "Group1", "Group1", "Group2", "Group2", 
                                 "Group2", "Group2", "Group2", "Group2", "Group2", "Group2", "Group2", 
                                 "Group2", "Group2", "Group2", "Group2", "Group2", "Group2", "Group2", 
                                 "Group2", "Group2", "Group2", "Group2", "Group2", "Group2"), 
                       year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 
                                2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 
                                2016, 2016, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 
                                2017, 2017, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 
                                2016, 2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 
                                2017, 2017, 2017, 2017), category = c("cat1", "cat1", "cat1", 
                                                                      "cat1", "cat1", "cat2", "cat2", "cat2", "cat2", "cat2", "cat1", 
                                                                      "cat1", "cat1", "cat1", "cat1", "cat2", "cat2", "cat2", "cat2", 
                                                                      "cat2", "cat1", "cat1", "cat1", "cat1", "cat1", "cat2", "cat2", 
                                                                      "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", 
                                                                      "cat2", "cat2", "cat3", "cat3", "cat3", "cat3", "cat3", "cat3", 
                                                                      "cat3", "cat3", "cat3", "cat3", "cat3", "cat1", "cat1", "cat1", 
                                                                      "cat1"), value = c(30.1660205462388, 96.1649663179749, 183.691571800985, 
                                                                                         1.65328912643215, 9.30044741412784, 182.449748512614, 8.47095574122154, 
                                                                                         23.3081277048748, 53.1188233968077, 34.250829201039, 50.5445297997031, 
                                                                                         120.307165280983, 140.223343284331, 122.319359028798, 43.0193263100948, 
                                                                                         134.417238652291, 106.437343685401, 84.0446901587849, 69.7099679759042, 
                                                                                         132.101156129094, 27.8329259333861, 58.4953521410472, 100.379478360197, 
                                                                                         77.2357869871934, 200.464054913284, 47.6252352008202, 109.598360734847, 
                                                                                         18.1730751285375, 67.5769989539879, 26.7504753716622, 16.8630228114074, 
                                                                                         75.2053705357279, 39.7641860921024, 126.658782796637, 64.8507816634371, 
                                                                                         96.3471066298501, 61.4392604693245, 27.6801895514785, 181.599217867455, 
                                                                                         11.1036117561468, 68.1516849014302, 115.899355317842, 167.032368398535, 
                                                                                         116.634854779718, 144.080455202308, 186.627050299051, 72.3807151133032, 
                                                                                         37.6345953992576, 2.09517321452513, 58.3682650864716, 54.3590148062561, 
                                                                                         53.9884625670805)), row.names = c(NA, -52L), class = c("data.table", 
                                                                                                                                                "data.frame"))

and I want to aggregate the data on different levels, and calculate the confidence intervals for value at the corresponding aggregation levels. For example, I defined two versions of factor which should be used for the aggregation:

cohort1 = c("group" ,"category", "year")
cohort2 = c("group" ,"category")

and I have written a function to calculate the confidence intervals:

calculateCI <- function(value){
  
  avg <- mean(value)
  s <- sqrt(var(value))
  n <- length(value)
  
  error <- qnorm(0.975)*s/sqrt(n)
  
  lower <- avg - error
  upper <- avg   error 
  
  return(list(lowerCI = lower, 
              upperCI = upper))
  
}

How can I aggregate the data and calculate the confidence intervals?

I have tried unsing dplyr:

testData %>%
  group_by(cohort) %>%
  group_map(~ calculateCI(.x$value))

but it doesn't work with the vector cohort. How can I pass a vector as the argument for group_by

Also I would like to have the results in a form of a data.table, with a column for lower and upper confidence interval:

group category year sumValue lowerCi upperCi
 1: Group1     cat1 2015 320.9763     xxx     yyy
 2: Group1     cat2 2015 301.5985     xxx     yyy
 3: Group1     cat1 2016 476.4137     xxx     yyy
 4: Group1     cat2 2016 526.7104     xxx     yyy
 5: Group1     cat1 2017 464.4076     xxx     yyy
 6: Group1     cat2 2017 269.7241     xxx     yyy
 7: Group2     cat2 2016 481.1285     xxx     yyy
 8: Group2     cat3 2016 832.1817     xxx     yyy
 9: Group2     cat3 2017 296.6424     xxx     yyy
10: Group2     cat1 2017 168.8109     xxx     yyy

CodePudding user response:

You could calculate the mean and SD by groups:

tapply(testData$value, INDEX = list(testData$group), FUN = mean, na.rm = TRUE) 
tapply(testData$value, INDEX = list(testData$group), FUN = sd, na.rm = TRUE) 

or by more than one factor:

tapply(testData$value, INDEX = list(testData$category, testData$group), FUN = mean, na.rm = TRUE) 

and calculate the CIs

CodePudding user response:

library(dplyr)

testData %>%
group_by(group, category, year) %>%
summarise(across(.cols = value, .fns = c("n" = ~n(), "Mean" = mean, "StDev" = sd), .names = "{.fn}"), .groups = "drop") %>%
mutate("StE" = StDev / sqrt(n)) %>%
mutate("LowerCI95" = Mean - (1.96 * StE),
     "UpperCI95" = Mean   (1.96 * StE))
  • Related