Home > database >  How to perform for loop to apply custom function with grouping
How to perform for loop to apply custom function with grouping

Time:07-09

I'm trying to perform a forloop to apply a custom summarise function to all the numeric columns in the dataframe. The forloop output seems to ignore the grouping factor- however, if I perform the function alone on a single column (without the for loop), it provides the correct output.

#sample df
structure(list(participant = c("pt04", "pt75", "pt21", "pt73", 
"pt27", "pt39", "pt43", "pt52", "pt69", "pt49", "pt50", "pt56", 
"pt62", "pt68", "pt22", "pt64", "pt54", "pt79", "pt36", "pt26", 
"pt65", "pt38"), group = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L
), .Label = c("c", "e"), class = "factor"), sex = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 
2L, 1L, 1L, 2L, 1L), .Label = c("m", "f"), class = "factor"), 
    fm_bdc3 = c(18.535199635968, 23.52996574649, 17.276246451976, 
    11.526088555461, 23.805048656112, 23.08597823716, 28.691020942436, 
    28.968097858499, 23.378093165331, 22.491725344661, 14.609015054932, 
    19.734914019306, 31.947412973684, 25.152298171274, 12.007356801787, 
    20.836128108938, 22.322230884349, 14.777652101515, 21.389572717608, 
    16.992853675086, 14.138189878472, 17.777235203826), fm_rec3 = c(18.545007190636, 
    23.017181869742, 17.031403417007, 11.227201061887, 23.581434653208, 
    21.571120542136, 28.919246372213, 28.138632765662, 22.990408911436, 
    22.274932676852, 14.012586350504, 19.066675709151, 30.897705534847, 
    24.491614222412, 11.670939246332, 20.306494543464, 22.052263684182, 
    14.252973638341, 21.028701096846, 17.207104923059, 13.172159777361, 
    17.610831079442), fm_chg = c(0.00980755466799721, -0.512783876747999, 
    -0.244843034968998, -0.298887493573998, -0.223614002904, 
    -1.514857695024, 0.228225429777002, -0.829465092836998, -0.387684253894999, 
    -0.216792667809003, -0.596428704428, -0.668238310155001, 
    -1.049707438837, -0.660683948862001, -0.336417555455, -0.529633565474001, 
    -0.269967200167002, -0.524678463173998, -0.360871620761998, 
    0.214251247972999, -0.966030101111, -0.166404124383998), 
    fm_percchg = c(0.00052913132097943, -0.0217928016671462, 
    -0.0141722355981437, -0.0259313896588437, -0.00939355370091154, 
    -0.0656180855522784, 0.00795459423472242, -0.0286337438132355, 
    -0.0165832282022865, -0.00963877445980213, -0.0408260722701251, 
    -0.0338607155572751, -0.0328573534170568, -0.0262673392452288, 
    -0.028017619615079, -0.025419001203338, -0.0120940958619099, 
    -0.0355048596062299, -0.0168713805332318, 0.0126083147698213, 
    -0.0683277073949869, -0.00936051767758492)), row.names = c(NA, 
-22L), class = "data.frame")

#my function:
 summbygrp <- function(x) {
  group_by(dexadf, group) %>%
        summarise(
          count = n(),
          mean = mean({{x}}, na.rm = TRUE),
          sd = sd({{x}}, na.rm = TRUE)
        ) %>%
        mutate(se = sd / sqrt(11),
               lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
               upper.ci = mean   qt(1 - (0.05 / 2), 11 - 1) * se
              ) 
} 

#apply function to all numeric columns and print column names before output
coln = 1

for (col in dexadf) {
  print(colnames(dexadf)[coln])
  coln = coln   1
  if(is.numeric(col)) {
    print(summbygrp(col))
  } else {next}
}

#output: 
[1] "fm_bdc3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.6  5.48  1.65     16.9     24.3
2 e        11  20.6  5.48  1.65     16.9     24.3
[1] "fm_rec3"
# A tibble: 2 × 7
  group count  mean    sd    se lower.ci upper.ci
  <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11  20.1  5.41  1.63     16.5     23.8
2 e        11  20.1  5.41  1.63     16.5     23.8
[1] "fm_chg"
# A tibble: 2 × 7
  group count   mean    sd    se lower.ci upper.ci
  <fct> <int>  <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 c        11 -0.450 0.406 0.122   -0.723   -0.178
2 e        11 -0.450 0.406 0.122   -0.723   -0.178
[1] "fm_percchg"
# A tibble: 2 × 7
  group count    mean     sd      se lower.ci upper.ci
  <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 c        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942
2 e        11 -0.0227 0.0198 0.00598  -0.0360 -0.00942

As you can see, all the means for both groups are the same, and I know this shouldn't be true. Could someone identify the error in the code? Thank you!

CodePudding user response:

So instead of using for-loops you can do better,

library(dplyr)
library(rlang)
library(purrr)
library(tibble)


dexadf <- data.frame(
  stringsAsFactors = FALSE,
  participant = c("pt04","pt75","pt21","pt73",
                  "pt27","pt39","pt43","pt52","pt69","pt49","pt50",
                  "pt56","pt62","pt68","pt22","pt64","pt54","pt79",
                  "pt36","pt26","pt65","pt38"),
  fm_bdc3 = c(18.535199635968,23.52996574649,
              17.276246451976,11.526088555461,23.805048656112,
              23.08597823716,28.691020942436,28.968097858499,
              23.378093165331,22.491725344661,14.609015054932,19.734914019306,
              31.947412973684,25.152298171274,12.007356801787,
              20.836128108938,22.322230884349,14.777652101515,
              21.389572717608,16.992853675086,14.138189878472,17.777235203826),
  fm_rec3 = c(18.545007190636,
              23.017181869742,17.031403417007,11.227201061887,23.581434653208,
              21.571120542136,28.919246372213,28.138632765662,
              22.990408911436,22.274932676852,14.012586350504,19.066675709151,
              30.897705534847,24.491614222412,11.670939246332,
              20.306494543464,22.052263684182,14.252973638341,
              21.028701096846,17.207104923059,13.172159777361,17.610831079442),
  fm_chg = c(0.00980755466799721,
             -0.512783876747999,-0.244843034968998,-0.298887493573998,
             -0.223614002904,-1.514857695024,0.228225429777002,
             -0.829465092836998,-0.387684253894999,-0.216792667809003,
             -0.596428704428,-0.668238310155001,-1.049707438837,
             -0.660683948862001,-0.336417555455,-0.529633565474001,
             -0.269967200167002,-0.524678463173998,-0.360871620761998,
             0.214251247972999,-0.966030101111,-0.166404124383998),
  fm_percchg = c(0.00052913132097943,
                 -0.0217928016671462,-0.0141722355981437,-0.0259313896588437,
                 -0.00939355370091154,-0.0656180855522784,
                 0.00795459423472242,-0.0286337438132355,-0.0165832282022865,
                 -0.00963877445980213,-0.0408260722701251,-0.0338607155572751,
                 -0.0328573534170568,-0.0262673392452288,-0.028017619615079,
                 -0.025419001203338,-0.0120940958619099,
                 -0.0355048596062299,-0.0168713805332318,0.0126083147698213,
                 -0.0683277073949869,-0.00936051767758492),
  group = as.factor(c("c","e",
                      "e","c","c","e","c","e","c","e","e","c",
                      "e","c","c","e","e","c","e","c","e",
                      "c")),
  sex = as.factor(c("f","m",
                    "m","m","m","m","m","f","m","f","f","f",
                    "f","f","f","f","m","f","m","m","f",
                    "m"))
)

dexadf <- as_tibble(dexadf)

# Note the use of .data pronoun, since columns will passed to this function as characters

summbygrp <- function(df, x) {
  df %>% 
    group_by(group) %>%
    summarise(
      count = n(),
      mean = mean(.data[[x]], na.rm = TRUE), # use of .data
      sd = sd(.data[[x]], na.rm = TRUE) # use of .data
    ) %>%
    mutate(se = sd / sqrt(11),
           lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
           upper.ci = mean   qt(1 - (0.05 / 2), 11 - 1) * se
    )
}

# Here we extract the numerical columns of the dataset
cols <- dexadf %>% 
  select(where(is.numeric)) %>% colnames(.)

cols 
#> [1] "fm_bdc3"    "fm_rec3"    "fm_chg"     "fm_percchg"

# Then instead of for loops we can simply use this map function
map(.x = cols, ~ summbygrp(dexadf, .x))

#> [[1]]
#> # A tibble: 2 × 7
#>   group count  mean    sd    se lower.ci upper.ci
#>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 c        11  19.3  5.49  1.66     15.6     23.0
#> 2 e        11  21.9  5.40  1.63     18.2     25.5
#> 
#> [[2]]
#> # A tibble: 2 × 7
#>   group count  mean    sd    se lower.ci upper.ci
#>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 c        11  19.1  5.54  1.67     15.3     22.8
#> 2 e        11  21.2  5.31  1.60     17.7     24.8
#> 
#> [[3]]
#> # A tibble: 2 × 7
#>   group count   mean    sd     se lower.ci upper.ci
#>   <fct> <int>  <dbl> <dbl>  <dbl>    <dbl>    <dbl>
#> 1 c        11 -0.256 0.311 0.0938   -0.465  -0.0470
#> 2 e        11 -0.645 0.407 0.123    -0.918  -0.371 
#> 
#> [[4]]
#> # A tibble: 2 × 7
#>   group count    mean     sd      se lower.ci upper.ci
#>   <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
#> 1 c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
#> 2 e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170

# -------------------------------------------------------------------
# we can also bind all the output results (dataframes) in a single dataframe
map_dfr(.x = cols, ~ summbygrp(dexadf, .x), .id = "vars")

#> # A tibble: 8 × 8
#>   vars  group count    mean     sd      se lower.ci upper.ci
#>   <chr> <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
#> 1 1     c        11 19.3    5.49   1.66     15.6    23.0    
#> 2 1     e        11 21.9    5.40   1.63     18.2    25.5    
#> 3 2     c        11 19.1    5.54   1.67     15.3    22.8    
#> 4 2     e        11 21.2    5.31   1.60     17.7    24.8    
#> 5 3     c        11 -0.256  0.311  0.0938   -0.465  -0.0470 
#> 6 3     e        11 -0.645  0.407  0.123    -0.918  -0.371  
#> 7 4     c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
#> 8 4     e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170

Created on 2022-07-09 by the reprex package (v2.0.1)

CodePudding user response:

out <- df %>%
  pivot_longer(starts_with('fm')) %>%
  group_by(name, group) %>%
  summarise(
    count = n(),
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(se = sd / sqrt(11),
         lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
         upper.ci = mean   qt(1 - (0.05 / 2), 11 - 1) * se
  ) 
out

# A tibble: 8 x 8
  name       group count    mean     sd      se lower.ci upper.ci
  <chr>      <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 fm_bdc3    c        11 19.3    5.49   1.66     15.6    23.0    
2 fm_bdc3    e        11 21.9    5.40   1.63     18.2    25.5    
3 fm_chg     c        11 -0.256  0.311  0.0938   -0.465  -0.0470 
4 fm_chg     e        11 -0.645  0.407  0.123    -0.918  -0.371  
5 fm_percchg c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
6 fm_percchg e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170 
7 fm_rec3    c        11 19.1    5.54   1.67     15.3    22.8    
8 fm_rec3    e        11 21.2    5.31   1.60     17.7    24.8 

if you need the list, just split it:

split(out, ~name)
$fm_bdc3
# A tibble: 2 x 8
  name    group count  mean    sd    se lower.ci upper.ci
  <chr>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 fm_bdc3 c        11  19.3  5.49  1.66     15.6     23.0
2 fm_bdc3 e        11  21.9  5.40  1.63     18.2     25.5

$fm_chg
# A tibble: 2 x 8
  name   group count   mean    sd     se lower.ci upper.ci
  <chr>  <fct> <int>  <dbl> <dbl>  <dbl>    <dbl>    <dbl>
1 fm_chg c        11 -0.256 0.311 0.0938   -0.465  -0.0470
2 fm_chg e        11 -0.645 0.407 0.123    -0.918  -0.371 

$fm_percchg
# A tibble: 2 x 8
  name       group count    mean     sd      se lower.ci upper.ci
  <chr>      <fct> <int>   <dbl>  <dbl>   <dbl>    <dbl>    <dbl>
1 fm_percchg c        11 -0.0149 0.0167 0.00503  -0.0261 -0.00368
2 fm_percchg e        11 -0.0306 0.0203 0.00611  -0.0442 -0.0170 

$fm_rec3
# A tibble: 2 x 8
  name    group count  mean    sd    se lower.ci upper.ci
  <chr>   <fct> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
1 fm_rec3 c        11  19.1  5.54  1.67     15.3     22.8
2 fm_rec3 e        11  21.2  5.31  1.60     17.7     24.8

CodePudding user response:

A similar answer to the above, but combining across and summarise:

df |> 
  group_by(group) |> 
  summarise(
    across(
      where(is.numeric), 
      list(
        mean = ~mean(.x, na.rm = TRUE), 
        sd = ~sd(.x, na.rm = TRUE), 
        n = ~n()
      ), 
      .names = "{.col}.{.fn}"
    )
  ) |> 
  pivot_longer(
    -group,
    names_to = c("measure", "stat"), 
    names_sep = "\\."
  ) |> 
  pivot_wider(
    names_from = stat,
    values_from = value
  ) |> 
  mutate(
    se = sd / sqrt(n), 
    lower.ci = mean - qt(1 - (0.05 / 2), 11 - 1) * se,
    upper.ci = mean   qt(1 - (0.05 / 2), 11 - 1) * se
  ) |> 
  arrange(measure)
  • Related