Home > front end >  How to calculate means and standard deviations for multiple grouped variables?
How to calculate means and standard deviations for multiple grouped variables?

Time:03-25

I have a dataframe with multiple variables that could be grouped in terms of what they measure (e.g., temperature, humidity, precipitation etc.) but with measurements taken at different points (e.g., P1, P2, P3, P4). For each variable, I want to calculate the mean and SD for each site (i.e., across the points).

I've produced a reproducible example below:

# Reproducible example

# Generate random site IDs
digits <- 0:9
createRandString<- function() {
  v <- c(sample(LETTERS, 5, replace = TRUE),
        sample(digits, 4, replace = TRUE),
        sample(LETTERS, 1, replace = TRUE))
  return(paste0(v,collapse = ""))
}
SiteID <- rep(createRandString(), 10)
  
# Generate random data for 12 variables 
# 3 groups of variables beginning with the same string
# The end of the string indicates the point of sampling and varies from 1-3 for each group
AirTempP1 <- runif(10, min=10, max=30)
AirTempP2 <- runif(10, min=10, max=30)
AirTempP3 <- runif(10, min=10, max=30)
AirHumidityP1 <- runif(10, min=10, max=100)
AirHumidityP2 <- runif(10, min=10, max=100)
AirHumidityP3 <- runif(10, min=10, max=100)
PrecipitationP1 <- runif(10, min=1, max=100)
PrecipitationP2 <- runif(10, min=1, max=100)
PrecipitationP3 <- runif(10, min=1, max=100) 

# Collate variables into a dataframe
df <- data.frame(SiteID, AirTempP1, AirTempP2, AirTempP3,
                 AirHumidityP1, AirHumidityP2, AirHumidityP3,
                 PrecipitationP1, PrecipitationP2, PrecipitationP3)

The actual dataframe has many more variables and sampling points but they all follow a similar pattern, so I'd like to create a code that can group similar variables (e.g., those that start with the same string) automatically.

And an expected outcome here:

> print(ExpectedDf)
       SiteID AirTempP1 AirTempP2 AirTempP3 AirHumidityP1 AirHumidityP2 AirHumidityP3 PrecipitationP1
1  IGHQY0356F  28.02461  28.88378  27.03425      62.24659      85.55429      80.88448        9.637548
2  IGHQY0356F  21.70965  23.23151  12.69549      76.20806      63.19107      50.79254       23.059624
3  IGHQY0356F  23.86685  20.06503  20.89402      81.24686      17.64601      70.33094       32.757375
4  IGHQY0356F  11.70800  25.94723  25.08843      60.57461      57.82563      81.31534       17.261870
5  IGHQY0356F  17.16265  21.84697  13.96995      44.51160      33.46379      93.39282       25.930814
6  IGHQY0356F  19.70643  26.95216  19.00345      39.35188      72.06986      35.35161       54.050623
7  IGHQY0356F  29.26167  14.66030  18.23532      44.68163      97.37606      65.56126       35.184882
8  IGHQY0356F  20.65157  22.62985  24.78574      14.31477      75.90861      79.04664       41.442555
9  IGHQY0356F  23.43913  17.53188  16.15400      12.93507      93.61767      22.52553       27.179809
10 IGHQY0356F  27.50141  15.79779  28.74214      89.08728      90.05281      66.03134       51.503836
   PrecipitationP2 PrecipitationP3 AirTempMean AirTempSD AirHumidityMean AirHumiditySD PrecipitationMean
1        30.533399        22.23755    27.98088 0.9255407        76.22845      20.80283          20.80283
2        98.275429        74.97568    19.21222 5.6947159        63.39722      65.43691          65.43691
3        83.541316        75.37026    21.60864 1.9991155        56.40793      63.88965          63.88965
4        67.788465        72.12119    20.91455 7.9846636        66.57186      52.39051          52.39051
5        41.544996        34.19381    17.65986 3.9619820        57.12274      33.88987          33.88987
6         5.002055        51.37920    21.88735 4.4003187        48.92445      36.81063          36.81063
7        12.638292        52.33821    20.71910 7.6109679        69.20632      33.38713          33.38713
8        37.117511        73.60906    22.68905 2.0677247        56.42334      50.72304          50.72304
9        27.691037        66.32341    19.04167 3.8701284        43.02609      40.39809          40.39809
10       59.951075        95.64219    24.01378 7.1422502        81.72381      69.03237          69.03237
   PrecipitationSD
1        10.521548
2        38.504486
3        27.269130
4        30.499330
5         7.811527
6        27.579395
7        19.910924
8        19.937499
9        22.453448
10       23.428636

CodePudding user response:

If it is only three sets of columns, can use across with rowMeans or rowSds

library(dplyr)
library(matrixStats)
df %>%
    mutate(AirTempMean = rowMeans(across(starts_with("AirTemp")), na.rm = TRUE), 
    AirTempSD = rowSds(as.matrix(across(starts_with("AirTemp")))),
    AirHumidityMean = rowMeans(across(starts_with("AirHumidity")), na.rm = TRUE), 
    AirHumiditySD = rowSds(as.matrix(across(starts_with("AirHumidity")))),
    PrecipitationMean = rowMeans(across(starts_with("Precipitation")), na.rm = TRUE), 
    PrecipitationSD = rowSds(as.matrix(across(starts_with("Precipitation"))))

 )

If there are many sets of columns, an option is to reshape to 'long' format and then do the mean/sd by row and then bind with the original dataset

library(tidyr)
df %>%
    select(-SiteID) %>% 
    mutate(rn = row_number()) %>%
    pivot_longer(cols= -rn, names_to = c(".value", "pval"), 
       names_pattern = "(.*)(P\\d $)") %>%
    group_by(rn) %>% 
    summarise(across(where(is.numeric),
       list(Mean = ~ mean(., na.rm = TRUE), 
         SD = ~ sd(., na.rm = TRUE)))) %>%
    select(-rn) %>% 
    bind_cols(df, .)

-output

  SiteID AirTempP1 AirTempP2 AirTempP3 AirHumidityP1 AirHumidityP2 AirHumidityP3 PrecipitationP1 PrecipitationP2
1  KIIXB6808G  21.73691  24.96523  10.93523      12.84111      92.93506      82.79740        64.92664       70.736212
2  KIIXB6808G  29.42160  25.14421  16.28761      45.63732      56.82373      78.49595        92.42202       58.547199
3  KIIXB6808G  14.43153  17.56969  13.03869      33.29144      90.66400      43.75959        32.32953       96.171349
4  KIIXB6808G  24.35017  17.85779  11.20442      93.95450      18.58702      39.93221        26.69628       81.723180
5  KIIXB6808G  14.80084  29.38776  29.19315      94.70336      95.89065      25.89645        26.95639       28.048125
6  KIIXB6808G  27.88281  14.29717  10.24926      54.96979      83.53267      78.09418        76.69248       72.712109
7  KIIXB6808G  12.77562  22.11161  28.05708      33.02382      54.44677      20.95251        72.94213       93.959692
8  KIIXB6808G  14.85165  20.22299  10.78721      66.59833      31.77392      26.85253        95.13469       54.235009
9  KIIXB6808G  14.42898  27.83384  17.09562      53.95661      52.25697      71.31224        85.97124       23.399866
10 KIIXB6808G  12.87398  18.36380  20.59257      12.67498      53.06563      17.63772        50.60992        6.751882
   PrecipitationP3 AirTemp_Mean AirTemp_SD AirHumidity_Mean AirHumidity_SD Precipitation_Mean Precipitation_SD
1         88.71080     19.21246   7.347780         62.85786       43.61134           74.79122         12.39975
2         81.26882     23.61781   6.698725         60.31900       16.70584           77.41268         17.26350
3         83.31252     15.01330   2.320849         55.90501       30.55382           70.60447         33.76486
4         31.51854     17.80413   6.573037         50.82457       38.84645           46.64600         30.47327
5         60.96926     24.46059   8.366151         72.16349       40.07283           38.65792         19.32989
6         42.94284     17.47641   9.236680         72.19888       15.16659           64.11581         18.44402
7         19.50466     20.98143   7.703164         36.14103       16.96332           62.13549         38.38587
8         34.00365     15.28728   4.732951         41.74159       21.66675           61.12445         31.14241
9         52.38518     19.78615   7.095897         59.17527       10.54522           53.91876         31.31386
10        91.18074     17.27678   3.972451         27.79278       22.02714           49.51418         42.22509
  •  Tags:  
  • r
  • Related