I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.
#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73",
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56",
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26",
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"),
fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976,
11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436,
28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932,
19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787,
20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608,
16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636,
23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208,
21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436,
22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847,
24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182,
14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361,
17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999,
-0.244843034968998, -0.298887493573998, -0.223614002904,
-1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999,
-0.216792667809003, -0.596428704428, -0.668238310155001,
-1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001,
-0.269967200167002, -0.524678463173998, -0.360871620761998,
0.214251247972999, -0.966030101111, -0.166404124383998),
fm_percchg = c(0.00052913132097943, -0.0217928016671462,
-0.0141722355981437, -0.0259313896588437, -0.00939355370091154,
-0.0656180855522784, 0.00795459423472242, -0.0286337438132355,
-0.0165832282022865, -0.00963877445980213, -0.0408260722701251,
-0.0338607155572751, -0.0328573534170568, -0.0262673392452288,
-0.028017619615079, -0.025419001203338, -0.0120940958619099,
-0.0355048596062299, -0.0168713805332318, 0.0126083147698213,
-0.0683277073949869, -0.00936051767758492)), row.names = c(NA,
-22L), class = "data.frame")
#my function:
summbygrp <- function(x) {
group_by(dexadf, group) %>%
summarise(
count = n(),
mean = mean({{x}}, na.rm = TRUE),
sd = sd({{x}}, na.rm = TRUE)
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean qt(1 - (0.05 / 2), 11 - 1) * se
)
}
#apply function to all numeric columns and print column names before output
coln = 1
for (col in dexadf) {
print(colnames(dexadf)[coln])
coln = coln 1
if(is.numeric(col)) {
print(summbygrp(col))
} else {next}
}
#output:
[1] "fm_bdc3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.6 5.48 1.65 16.9 24.3
2 e 11 20.6 5.48 1.65 16.9 24.3
[1] "fm_rec3"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 20.1 5.41 1.63 16.5 23.8
2 e 11 20.1 5.41 1.63 16.5 23.8
[1] "fm_chg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.450 0.406 0.122 -0.723 -0.178
2 e 11 -0.450 0.406 0.122 -0.723 -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
group count mean sd se lower.ci upper.ci
<fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 c 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
2 e 11 -0.0227 0.0198 0.00598 -0.0360 -0.00942
As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!
CodePudding user response:
So instead of using for-loops you can do better,
library(dplyr)
library(rlang)
library(purrr)
library(tibble)
dexadf <- data.frame(
stringsAsFactors = FALSE,
participant = c("pt04","pt75","pt21","pt73",
"pt27","pt39","pt43","pt52","pt69","pt49","pt50",
"pt56","pt62","pt68","pt22","pt64","pt54","pt79",
"pt36","pt26","pt65","pt38"),
fm_bdc3 = c(18.535199635968,23.52996574649,
17.276246451976,11.526088555461,23.805048656112,
23.08597823716,28.691020942436,28.968097858499,
23.378093165331,22.491725344661,14.609015054932,19.734914019306,
31.947412973684,25.152298171274,12.007356801787,
20.836128108938,22.322230884349,14.777652101515,
21.389572717608,16.992853675086,14.138189878472,17.777235203826),
fm_rec3 = c(18.545007190636,
23.017181869742,17.031403417007,11.227201061887,23.581434653208,
21.571120542136,28.919246372213,28.138632765662,
22.990408911436,22.274932676852,14.012586350504,19.066675709151,
30.897705534847,24.491614222412,11.670939246332,
20.306494543464,22.052263684182,14.252973638341,
21.028701096846,17.207104923059,13.172159777361,17.610831079442),
fm_chg = c(0.00980755466799721,
-0.512783876747999,-0.244843034968998,-0.298887493573998,
-0.223614002904,-1.514857695024,0.228225429777002,
-0.829465092836998,-0.387684253894999,-0.216792667809003,
-0.596428704428,-0.668238310155001,-1.049707438837,
-0.660683948862001,-0.336417555455,-0.529633565474001,
-0.269967200167002,-0.524678463173998,-0.360871620761998,
0.214251247972999,-0.966030101111,-0.166404124383998),
fm_percchg = c(0.00052913132097943,
-0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
-0.00939355370091154,-0.0656180855522784,
0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
-0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
-0.0328573534170568,-0.0262673392452288,-0.028017619615079,
-0.025419001203338,-0.0120940958619099,
-0.0355048596062299,-0.0168713805332318,0.0126083147698213,
-0.0683277073949869,-0.00936051767758492),
group = as.factor(c("c","e",
"e","c","c","e","c","e","c","e","e","c",
"e","c","c","e","e","c","e","c","e",
"c")),
sex = as.factor(c("f","m",
"m","m","m","m","m","f","m","f","f","f",
"f","f","f","f","m","f","m","m","f",
"m"))
)
dexadf <- as_tibble(dexadf)
# Note the use of .data pronoun, since columns will passed to this function as characters
summbygrp <- function(df, x) {
df %>%
group_by(group) %>%
summarise(
count = n(),
mean = mean(.data[[x]], na.rm = TRUE), # use of .data
sd = sd(.data[[x]], na.rm = TRUE) # use of .data
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean qt(1 - (0.05 / 2), 11 - 1) * se
)
}
# Here we extract the numerical columns of the dataset
cols <- dexadf %>%
select(where(is.numeric)) %>% colnames(.)
cols
#> [1] "fm_bdc3" "fm_rec3" "fm_chg" "fm_percchg"
# Then instead of for loops we can simply use this map function
map(.x = cols, ~ summbygrp(dexadf, .x))
#> [[1]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 e 11 21.9 5.40 1.63 18.2 25.5
#>
#> [[2]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 19.1 5.54 1.67 15.3 22.8
#> 2 e 11 21.2 5.31 1.60 17.7 24.8
#>
#> [[3]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 2 e 11 -0.645 0.407 0.123 -0.918 -0.371
#>
#> [[4]]
#> # A tibble: 2 × 7
#> group count mean sd se lower.ci upper.ci
#> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 2 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe
map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")
#> # A tibble: 8 × 8
#> vars group count mean sd se lower.ci upper.ci
#> <chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 c 11 19.3 5.49 1.66 15.6 23.0
#> 2 1 e 11 21.9 5.40 1.63 18.2 25.5
#> 3 2 c 11 19.1 5.54 1.67 15.3 22.8
#> 4 2 e 11 21.2 5.31 1.60 17.7 24.8
#> 5 3 c 11 -0.256 0.311 0.0938 -0.465 -0.0470
#> 6 3 e 11 -0.645 0.407 0.123 -0.918 -0.371
#> 7 4 c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
#> 8 4 e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
Created on 2022-07-09 by the reprex package (v2.0.1)
CodePudding user response:
out <- df %>%
pivot_longer(starts_with('fm')) %>%
group_by(name, group) %>%
summarise(
count = n(),
mean = mean(value, na.rm = TRUE),
sd = sd(value, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(se = sd / sqrt(11),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean qt(1 - (0.05 / 2), 11 - 1) * se
)
out
# A tibble: 8 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
3 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
4 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
5 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
6 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
7 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
8 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
if you need the list, just split it:
split(out, ~name)
$fm_bdc3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_bdc3 c 11 19.3 5.49 1.66 15.6 23.0
2 fm_bdc3 e 11 21.9 5.40 1.63 18.2 25.5
$fm_chg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_chg c 11 -0.256 0.311 0.0938 -0.465 -0.0470
2 fm_chg e 11 -0.645 0.407 0.123 -0.918 -0.371
$fm_percchg
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_percchg c 11 -0.0149 0.0167 0.00503 -0.0261 -0.00368
2 fm_percchg e 11 -0.0306 0.0203 0.00611 -0.0442 -0.0170
$fm_rec3
# A tibble: 2 x 8
name group count mean sd se lower.ci upper.ci
<chr> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 fm_rec3 c 11 19.1 5.54 1.67 15.3 22.8
2 fm_rec3 e 11 21.2 5.31 1.60 17.7 24.8
CodePudding user response:
A similar answer to the above, but combining across
and summarise
:
df |>
group_by(group) |>
summarise(
across(
where(is.numeric),
list(
mean = ~mean(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE),
n = ~n()
),
.names = "{.col}.{.fn}"
)
) |>
pivot_longer(
-group,
names_to = c("measure", "stat"),
names_sep = "\\."
) |>
pivot_wider(
names_from = stat,
values_from = value
) |>
mutate(
se = sd / sqrt(n),
lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
upper.ci = mean qt(1 - (0.05 / 2), 11 - 1) * se
) |>
arrange(measure)