I have a dataframe with multiple variables that could be grouped in terms of what they measure (e.g., temperature, humidity, precipitation etc.) but with measurements taken at different points (e.g., P1, P2, P3, P4). For each variable, I want to calculate the mean and SD for each site (i.e., across the points).
I've produced a reproducible example below:
# Reproducible example
# Generate random site IDs
digits <- 0:9
createRandString<- function() {
v <- c(sample(LETTERS, 5, replace = TRUE),
sample(digits, 4, replace = TRUE),
sample(LETTERS, 1, replace = TRUE))
return(paste0(v,collapse = ""))
}
SiteID <- rep(createRandString(), 10)
# Generate random data for 12 variables
# 3 groups of variables beginning with the same string
# The end of the string indicates the point of sampling and varies from 1-3 for each group
AirTempP1 <- runif(10, min=10, max=30)
AirTempP2 <- runif(10, min=10, max=30)
AirTempP3 <- runif(10, min=10, max=30)
AirHumidityP1 <- runif(10, min=10, max=100)
AirHumidityP2 <- runif(10, min=10, max=100)
AirHumidityP3 <- runif(10, min=10, max=100)
PrecipitationP1 <- runif(10, min=1, max=100)
PrecipitationP2 <- runif(10, min=1, max=100)
PrecipitationP3 <- runif(10, min=1, max=100)
# Collate variables into a dataframe
df <- data.frame(SiteID, AirTempP1, AirTempP2, AirTempP3,
AirHumidityP1, AirHumidityP2, AirHumidityP3,
PrecipitationP1, PrecipitationP2, PrecipitationP3)
The actual dataframe has many more variables and sampling points but they all follow a similar pattern, so I'd like to create a code that can group similar variables (e.g., those that start with the same string) automatically.
And an expected outcome here:
> print(ExpectedDf)
SiteID AirTempP1 AirTempP2 AirTempP3 AirHumidityP1 AirHumidityP2 AirHumidityP3 PrecipitationP1
1 IGHQY0356F 28.02461 28.88378 27.03425 62.24659 85.55429 80.88448 9.637548
2 IGHQY0356F 21.70965 23.23151 12.69549 76.20806 63.19107 50.79254 23.059624
3 IGHQY0356F 23.86685 20.06503 20.89402 81.24686 17.64601 70.33094 32.757375
4 IGHQY0356F 11.70800 25.94723 25.08843 60.57461 57.82563 81.31534 17.261870
5 IGHQY0356F 17.16265 21.84697 13.96995 44.51160 33.46379 93.39282 25.930814
6 IGHQY0356F 19.70643 26.95216 19.00345 39.35188 72.06986 35.35161 54.050623
7 IGHQY0356F 29.26167 14.66030 18.23532 44.68163 97.37606 65.56126 35.184882
8 IGHQY0356F 20.65157 22.62985 24.78574 14.31477 75.90861 79.04664 41.442555
9 IGHQY0356F 23.43913 17.53188 16.15400 12.93507 93.61767 22.52553 27.179809
10 IGHQY0356F 27.50141 15.79779 28.74214 89.08728 90.05281 66.03134 51.503836
PrecipitationP2 PrecipitationP3 AirTempMean AirTempSD AirHumidityMean AirHumiditySD PrecipitationMean
1 30.533399 22.23755 27.98088 0.9255407 76.22845 20.80283 20.80283
2 98.275429 74.97568 19.21222 5.6947159 63.39722 65.43691 65.43691
3 83.541316 75.37026 21.60864 1.9991155 56.40793 63.88965 63.88965
4 67.788465 72.12119 20.91455 7.9846636 66.57186 52.39051 52.39051
5 41.544996 34.19381 17.65986 3.9619820 57.12274 33.88987 33.88987
6 5.002055 51.37920 21.88735 4.4003187 48.92445 36.81063 36.81063
7 12.638292 52.33821 20.71910 7.6109679 69.20632 33.38713 33.38713
8 37.117511 73.60906 22.68905 2.0677247 56.42334 50.72304 50.72304
9 27.691037 66.32341 19.04167 3.8701284 43.02609 40.39809 40.39809
10 59.951075 95.64219 24.01378 7.1422502 81.72381 69.03237 69.03237
PrecipitationSD
1 10.521548
2 38.504486
3 27.269130
4 30.499330
5 7.811527
6 27.579395
7 19.910924
8 19.937499
9 22.453448
10 23.428636
CodePudding user response:
If it is only three sets of columns, can use across
with rowMeans
or rowSds
library(dplyr)
library(matrixStats)
df %>%
mutate(AirTempMean = rowMeans(across(starts_with("AirTemp")), na.rm = TRUE),
AirTempSD = rowSds(as.matrix(across(starts_with("AirTemp")))),
AirHumidityMean = rowMeans(across(starts_with("AirHumidity")), na.rm = TRUE),
AirHumiditySD = rowSds(as.matrix(across(starts_with("AirHumidity")))),
PrecipitationMean = rowMeans(across(starts_with("Precipitation")), na.rm = TRUE),
PrecipitationSD = rowSds(as.matrix(across(starts_with("Precipitation"))))
)
If there are many sets of columns, an option is to reshape to 'long' format and then do the mean/sd
by row
and then bind with the original dataset
library(tidyr)
df %>%
select(-SiteID) %>%
mutate(rn = row_number()) %>%
pivot_longer(cols= -rn, names_to = c(".value", "pval"),
names_pattern = "(.*)(P\\d $)") %>%
group_by(rn) %>%
summarise(across(where(is.numeric),
list(Mean = ~ mean(., na.rm = TRUE),
SD = ~ sd(., na.rm = TRUE)))) %>%
select(-rn) %>%
bind_cols(df, .)
-output
SiteID AirTempP1 AirTempP2 AirTempP3 AirHumidityP1 AirHumidityP2 AirHumidityP3 PrecipitationP1 PrecipitationP2
1 KIIXB6808G 21.73691 24.96523 10.93523 12.84111 92.93506 82.79740 64.92664 70.736212
2 KIIXB6808G 29.42160 25.14421 16.28761 45.63732 56.82373 78.49595 92.42202 58.547199
3 KIIXB6808G 14.43153 17.56969 13.03869 33.29144 90.66400 43.75959 32.32953 96.171349
4 KIIXB6808G 24.35017 17.85779 11.20442 93.95450 18.58702 39.93221 26.69628 81.723180
5 KIIXB6808G 14.80084 29.38776 29.19315 94.70336 95.89065 25.89645 26.95639 28.048125
6 KIIXB6808G 27.88281 14.29717 10.24926 54.96979 83.53267 78.09418 76.69248 72.712109
7 KIIXB6808G 12.77562 22.11161 28.05708 33.02382 54.44677 20.95251 72.94213 93.959692
8 KIIXB6808G 14.85165 20.22299 10.78721 66.59833 31.77392 26.85253 95.13469 54.235009
9 KIIXB6808G 14.42898 27.83384 17.09562 53.95661 52.25697 71.31224 85.97124 23.399866
10 KIIXB6808G 12.87398 18.36380 20.59257 12.67498 53.06563 17.63772 50.60992 6.751882
PrecipitationP3 AirTemp_Mean AirTemp_SD AirHumidity_Mean AirHumidity_SD Precipitation_Mean Precipitation_SD
1 88.71080 19.21246 7.347780 62.85786 43.61134 74.79122 12.39975
2 81.26882 23.61781 6.698725 60.31900 16.70584 77.41268 17.26350
3 83.31252 15.01330 2.320849 55.90501 30.55382 70.60447 33.76486
4 31.51854 17.80413 6.573037 50.82457 38.84645 46.64600 30.47327
5 60.96926 24.46059 8.366151 72.16349 40.07283 38.65792 19.32989
6 42.94284 17.47641 9.236680 72.19888 15.16659 64.11581 18.44402
7 19.50466 20.98143 7.703164 36.14103 16.96332 62.13549 38.38587
8 34.00365 15.28728 4.732951 41.74159 21.66675 61.12445 31.14241
9 52.38518 19.78615 7.095897 59.17527 10.54522 53.91876 31.31386
10 91.18074 17.27678 3.972451 27.79278 22.02714 49.51418 42.22509