How would you split a column based on name and compute the confidence intervals for each specific name?
I can't index by a fixed range because a different file is imported each time, and I can't hardcode the names.
Data Looks like:
Name | Score
Anna 90
Anna 90
Anna 30
Anna 60
Anna 60
Anna 60
Anna 60
Bob 80
Bob 70
Bob 10
Bob 80
Chad 10
Chad 10
Chad 40
Chad 30
Chad 90
How would you take the confidence intervals for
Anna | Bob | Chad
Tried splitting
#df[c('Name')] <- str_split_fixed(df, ' ', 1)
Tried tapply
CodePudding user response:
You were close with `tapply`. Either look in the textbook and code a `ci` function like this,
#' Two-sided Student-t confidence interval for the mean of a numeric vector.
#'
#' @param x Numeric vector of observations.
#' @param a Significance level; the interval has confidence level `1 - a`
#'   (default 0.05, i.e. a 95% interval).
#' @return Numeric vector of length 2: lower bound, then upper bound.
ci <- \(x, a = .05) {
  n <- length(x)
  # qt(a / 2, ...) is the (negative) lower-tail quantile; negate it to get the
  # positive critical value for n - 1 degrees of freedom.
  tt <- -qt(a / 2, n - 1)
  # mean -/+ critical value * standard error of the mean.
  mean(x) + tt * (sd(x) / sqrt(n)) * c(-1, 1)
}
# Apply ci() to the scores of each Name, then stack the per-name intervals
# into a matrix with one row per name.
do.call(rbind, with(dat, tapply(Score, Name, ci)))
# [,1] [,2]
# Anna 45.139588 83.43184
# Bob 6.431446 113.56855
# Chad -4.805243 76.80524
or extract the CIs from `t.test`s.
# Run a one-sample t.test() on the scores of each Name, pull the "conf.int"
# element out of every test result, then bind and transpose so that each row
# holds the interval for one name.
t(rbind.data.frame(
  lapply(with(dat, tapply(Score, Name, t.test)), `[[`, "conf.int")
))
# [,1] [,2]
# Anna 45.139588 83.43184
# Bob 6.431446 113.56855
# Chad -4.805243 76.80524
Data:
# Example data: one row per observation, integer scores grouped by name
# (7 rows for Anna, 4 for Bob, 5 for Chad).
dat <- data.frame(
  Name = rep(c("Anna", "Bob", "Chad"), times = c(7L, 4L, 5L)),
  Score = c(
    90L, 90L, 30L, 60L, 60L, 60L, 60L,
    80L, 70L, 10L, 80L,
    10L, 10L, 40L, 30L, 90L
  ),
  stringsAsFactors = FALSE
)
CodePudding user response:
Something like this based on split
# 95% Student-t confidence interval for the mean Score of each Name.
# split() breaks `df` into one data frame per distinct Name, so no names are
# hard-coded. The result is a numeric matrix with rows "lb"/"ub" and one
# column per name.
vapply(
  split(df, df$Name),
  function(x) {
    mn <- mean(x$Score)
    ln <- length(x$Score)
    # Standard error of the mean.
    sd_er <- sd(x$Score) / sqrt(ln)
    alpha <- 0.05
    dof <- ln - 1
    # Upper-tail critical value, hence lower.tail = FALSE.
    t_score <- qt(p = alpha / 2, df = dof, lower.tail = FALSE)
    mg_er <- t_score * sd_er
    c(lb = mn - mg_er, ub = mn + mg_er)
  },
  # Named template fixes both the result type and the row names.
  FUN.VALUE = c(lb = 0, ub = 0)
)
Anna Bob Chad
lb 45.13959 6.431446 -4.805243
ub 83.43184 113.568554 76.805243
With dplyr
library(dplyr)
# 95% Student-t confidence interval for the mean Score of each Name.
# All intermediate quantities are computed in a single grouped summarise()
# (one row per Name); select() then keeps only the name and the two bounds.
df %>%
  group_by(Name) %>%
  summarize(
    mn = mean(Score),
    ln = length(Score),
    sdd = sd(Score),
    sd_er = sdd / sqrt(ln),
    alpha = 0.05,
    dof = ln - 1,
    # Upper-tail critical value, hence lower.tail = FALSE.
    t_score = qt(p = alpha / 2, df = dof, lower.tail = FALSE),
    mg_er = t_score * sd_er,
    l_bound = mn - mg_er,
    u_bound = mn + mg_er
  ) %>%
  select(Name, l_bound, u_bound)
# A tibble: 3 × 3
Name l_bound u_bound
<chr> <dbl> <dbl>
1 Anna 45.1 83.4
2 Bob 6.43 114.
3 Chad -4.81 76.8
Data
# Example data: one row per observation, numeric (double) scores grouped by
# name (7 rows for Anna, 4 for Bob, 5 for Chad).
df <- data.frame(
  Name = rep(c("Anna", "Bob", "Chad"), times = c(7L, 4L, 5L)),
  Score = c(
    90, 90, 30, 60, 60, 60, 60,
    80, 70, 10, 80,
    10, 10, 40, 30, 90
  ),
  stringsAsFactors = FALSE
)