Home > Blockchain >  Calculating statistics based on group_by and then pivoting
Calculating statistics based on group_by and then pivoting

Time:08-03

I have data that is organized by ID value. Each ID value has up to 1-6 rows of data. I want to calculate the total Volume and the average Volume of each ID value, and then force it all into one row.

Here's the data:

> dput(head(COMPLETED_DATASET_allsizes, n = 150))
structure(list(PVC = c("6,056,589", "6,056,589", "6,056,589", 
"6,013,925", "6,013,925", "6,013,925", "6,034,050", "6,034,050", 
"6,034,050", "6070852", "6070852", "6070852", "6070862", "6014535", 
"6014326", "6013176", "6013176", "6013176", "AF6", "AF6", "AF6", 
"6019049", "6019049", "6008710", "6008710", "6013198", "601321_", 
"AF600", "AF600", "AF600", "6020617", "6020617", "6020617", "6008806", 
"6008806", "6021296", "6021296", "6021296", "6014829", "6014829", 
"6013907", "6013907", "6013907", "601404_", "601404_", "6013766", 
"6013766", "6013766", "6034421", "6034421", "6034421", "20F", 
"ZHD", "ZHD", "X11", "X11", "28R", "28R", "37V", "UVX", "37V", 
"HY3", "HY3", "6,014,837", "ZWJ", "ZWJ", "ZWJ", "BX4", "BX4", 
"BX4", "BHD", "BHD", "BHD", "ACX", "ACX", "ACX", "XXS", "XXS", 
"XXS", "6,005,684", "6,005,684", "6,005,684", "BHX", "BHX", "BHX", 
"SP3", "SP3", "SP3", "B1A", "B1A", "B1A", "B5F", "B5F", "B5F", 
"BJN", "BJN", "BJN", "AW5", "AW5", "AW5", "HNU", "HNU", "HNU", 
"BSP", "BSP", "BSP", "FW2", "FW2", "ANB", "ANB", "ANB", "SJM", 
"SJM", "FSR", "FSR", "FSR", "UYF", "UYF", "2HZ", "2HZ", "F1X", 
"F1X", "FFR", "FFR", "FFR", "HX1", "HX1", "SUL", "SUL", "SF0", 
"SF0", "SF0", "SHJ", "SHJ", "SHJ", "HRJ", "HRJ", "AP8", "AP8", 
"J4H", "J4H", "XLV", "XLV", "XLV", "H94", "H94", "S2Y", "S2Y", 
"S2Y", "UU9"), Egg_order = c("1", "2", "3", "1", "2", "3", "1", 
"2", "3", "9", "9", "9", "9", "9", "9", "1", "2", "3", "1", "2", 
"3", "1", "2", "9", "9", "1", "1", "9", "9", "9", "1", "2", "3", 
"9", "9", "9", "9", "9", "1", "2", "9", "9", "9", "1", "2", "9", 
"9", "9", "9", "9", "9", "1", "9", "9", "9", "9", "9", "9", "9", 
"9", "9", "9", "9", "9", "9", "9", "9", "1", "2", "3", "3", "2", 
"1", "9", "9", "3", "9", "3", "9", "1", "2", "3", "9", "9", "9", 
"9", "9", "3", "3", "9", "9", "9", "9", "9", "9", "9", "3", "1", 
"2", "3", "3", "9", "9", "9", "9", "9", "2", "1", "9", "9", "9", 
"1", "2", "9", "9", "3", "9", "9", "1", "2", "2", "1", "9", "9", 
"9", "9", "9", "1", "2", "9", "9", "3", "2", "3", "1", "9", "9", 
"1", "2", "2", "1", "1", "2", "3", "9", "9", "9", "9", "3", "3"
), Volume = c(62.00468911, 63.19034545, 64.16343369, 62.77290704, 
62.64955122, 56.63460056, 49.49860059, 57.88686491, 55.4345925, 
57.89358963, 58.12847589, 51.49938302, 58.49927302, 57.34171155, 
54.93179064, 64.56208923, 62.27948266, 56.38218371, 60.06696521, 
62.21206032, 62.90496759, 59.13580929, 57.13564917, 59.70944308, 
55.19527717, 64.84888743, 67.34476853, 65.0132184, 59.66593229, 
62.01691353, 55.27413286, 54.68206904, 54.3717803, 57.36582528, 
60.15748095, 55.06099115, 59.07830469, 56.82857152, 62.76590186, 
55.36801363, 71.70516332, 65.4342034, 59.83429871, 59.18484744, 
57.34744503, 56.11336458, 65.73279116, 57.95890035, 55.09798577, 
58.94565679, 59.97265877, 58.93118052, 67.62983812, 62.45908065, 
61.78370028, 60.67968894, 61.49240694, 60.85435534, 57.82379732, 
52.64023837, 53.7619782, 65.19952241, 62.84622159, 66.40863935, 
57.62326609, 53.89348305, 47.2993185, 61.48633419, 66.30863861, 
58.42476707, 53.26130145, 63.20292711, 62.49620272, 61.32741787, 
62.72989825, 51.11998856, 62.26934629, 58.56945328, 60.3546269, 
55.01982539, 62.3489285, 53.77593518, 56.49758306, 63.53920939, 
64.94851437, 62.25274976, 64.39375777, 54.22321067, 51.28730416, 
58.31060124, 55.25169993, 58.25732223, 57.18221296, 56.44838126, 
53.77395184, 57.43319075, 52.95671938, 61.9705976, 57.10131146, 
54.57651784, 59.66268577, 62.44376038, 59.27393481, 55.46520431, 
54.98693888, 64.79370574, 55.48500646, 54.60260849, 64.98274004, 
63.29243525, 61.18560498, 26.13699345, 23.31596147, 64.26879934, 
65.30612915, 57.39204739, 57.32271224, 53.93674759, 60.36058271, 
55.62742343, 57.91773772, 61.03729104, 54.03165992, 60.84743965, 
59.86017015, 59.90175385, 56.49922666, 63.50802185, 65.16800489, 
64.94391206, 67.2387067, 59.05588275, 57.91108456, 54.43705009, 
59.39818285, 63.9228549, 56.78342653, 54.58236138, 60.47677836, 
60.82957858, 60.51827256, 58.34245187, 57.28814624, 54.81098523, 
54.26852328, 52.43400015, 57.57770424, 60.09658763, 56.22640525, 
59.19068945), Clutch = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 3L, 
3L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 2L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 3L), Year = c(1996L, 
1996L, 1996L, 1996L, 1996L, 1996L, 1996L, 1996L, 1996L, 1997L, 
1997L, 1997L, 1998L, 1998L, 1998L, 1999L, 1999L, 1999L, 1999L, 
1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 
1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 
1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 1999L, 
1999L, 1999L, 1999L, 1999L, 1999L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 
2001L, 2001L, 2001L, 2001L, 2001L), egg_status = c("complete", 
"complete", "complete", "complete", "complete", "complete", "complete", 
"complete", "complete", "none", "none", "none", "none", "none", 
"none", "complete", "complete", "complete", "complete", "complete", 
"complete", "complete", "complete", "none", "none", "complete", 
"complete", "none", "none", "none", "complete", "complete", "complete", 
"none", "none", "none", "none", "none", "complete", "complete", 
"none", "none", "none", "complete", "complete", "none", "none", 
"none", "none", "none", "none", "complete", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "complete", "complete", "complete", 
"complete", "complete", "complete", "estimated", "estimated", 
"estimated", "estimated", "estimated", "estimated", "complete", 
"complete", "complete", "none", "none", "none", "estimated", 
"estimated", "estimated", "estimated", "estimated", "estimated", 
"none", "none", "none", "estimated", "estimated", "estimated", 
"complete", "complete", "complete", "estimated", "estimated", 
"estimated", "none", "none", "none", "complete", "complete", 
"none", "none", "none", "complete", "complete", "estimated", 
"estimated", "estimated", "none", "none", "complete", "complete", 
"complete", "complete", "none", "none", "none", "none", "none", 
"complete", "complete", "estimated", "estimated", "estimated", 
"complete", "complete", "complete", "none", "none", "complete", 
"complete", "complete", "complete", "complete", "complete", "complete", 
"none", "none", "estimated", "estimated", "estimated", "complete"
), estimated_Egg_order = c("1", "2", "3", "1", "2", "3", "1", 
"2", "3", "9", "9", "9", "9", "9", "9", "1", "2", "3", "1", "2", 
"3", "1", "2", "9", "9", "1", "1", "9", "9", "9", "1", "2", "3", 
"9", "9", "9", "9", "9", "1", "2", "9", "9", "9", "1", "2", "9", 
"9", "9", "9", "9", "9", "1", "9", "9", "9", "9", "9", "9", "9", 
"9", "9", "9", "9", "9", "9", "9", "9", "1", "2", "3", "3", "2", 
"1", "1", "2", "3", "1", "3", "2", "1", "2", "3", "9", "9", "9", 
"1", "2", "3", "3", "1", "2", "9", "9", "9", "1", "2", "3", "1", 
"2", "3", "3", "1", "2", "9", "9", "9", "2", "1", "9", "9", "9", 
"1", "2", "1", "2", "3", "9", "9", "1", "2", "2", "1", "9", "9", 
"9", "9", "9", "1", "2", "1", "2", "3", "2", "3", "1", "9", "9", 
"1", "2", "2", "1", "1", "2", "3", "9", "9", "1", "2", "3", "3"
), Edat = c(7L, 7L, 7L, 9L, 9L, 9L, 4L, 4L, 4L, 6L, 6L, 6L, 7L, 
2L, 1L, 13L, 13L, 13L, 11L, 11L, 11L, 12L, 12L, 14L, 14L, 13L, 
13L, 11L, 11L, 11L, 12L, 12L, 12L, 14L, 14L, 9L, 9L, 9L, 13L, 
13L, 12L, 12L, 12L, 13L, 13L, 12L, 12L, 12L, 9L, 9L, 9L, 4L, 
5L, 5L, 6L, 6L, 4L, 4L, 3L, 7L, 3L, 9L, 9L, 15L, 5L, 5L, 5L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 5L, 5L, 5L, 16L, 
16L, 16L, 13L, 13L, 13L, 8L, 8L, 8L, 13L, 13L, 13L, 9L, 9L, 9L, 
13L, 13L, 13L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 13L, 
13L, 13L, 8L, 8L, 10L, 10L, 10L, 7L, 7L, 4L, 4L, 13L, 13L, 9L, 
9L, 9L, 12L, 12L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 11L, 
11L, 11L, 11L, 6L, 6L, 6L, 10L, 10L, 8L, 8L, 8L, 7L), ID = c(378L, 
378L, 378L, 362L, 362L, 362L, 370L, 370L, 370L, 383L, 383L, 383L, 
434L, 426L, 416L, 511L, 511L, 511L, 499L, 499L, 499L, 459L, 459L, 
458L, 458L, 521L, 532L, 501L, 501L, 501L, 470L, 470L, 470L, 500L, 
500L, 481L, 481L, 481L, 576L, 576L, 554L, 554L, 554L, 565L, 565L, 
543L, 543L, 543L, 492L, 492L, 492L, 695L, 722L, 722L, 706L, 706L, 
800L, 800L, 734L, 700L, 734L, 802L, 802L, 737L, 727L, 727L, 727L, 
771L, 771L, 771L, 763L, 763L, 763L, 742L, 742L, 742L, 715L, 715L, 
715L, 735L, 735L, 735L, 764L, 764L, 764L, 815L, 815L, 815L, 754L, 
754L, 754L, 759L, 759L, 759L, 765L, 765L, 765L, 752L, 752L, 752L, 
791L, 791L, 791L, 769L, 769L, 769L, 785L, 785L, 748L, 748L, 748L, 
814L, 814L, 781L, 781L, 781L, 702L, 702L, 732L, 732L, 774L, 774L, 
777L, 777L, 777L, 799L, 799L, 817L, 817L, 810L, 810L, 810L, 813L, 
813L, 813L, 795L, 795L, 749L, 749L, 803L, 803L, 712L, 712L, 712L, 
787L, 787L, 807L, 807L, 807L, 697L)), row.names = c(NA, 150L), class = "data.frame")

Here is a picture of what I hope it will look like. This photo doesn't have the ID value but image if each row was a PVC value.

enter image description here

Thanks for the help!

CodePudding user response:

A dplyr solution

library(tidyverse)

df %>% 
  group_by(PVC, Year, ID) %>%  
  summarise(totalV = sum(Volume), 
            averageV = mean(Volume, na.rm = TRUE), 
            Clutch = last(Clutch), 
            Edat = last(Edat), .groups = "drop")

# A tibble: 63 x 7
   PVC        Year    ID totalV averageV Clutch  Edat
   <chr>     <int> <int>  <dbl>    <dbl>  <int> <int>
 1 20F        2001   695   58.9     58.9      1     4
 2 28R        2001   800  122.      61.2      2     4
 3 2HZ        2001   732  116.      58.0      2     4
 4 37V        2001   734  112.      55.8      2     3
 5 6,005,684  2001   735  171.      57.0      3    16
 6 6,013,925  1996   362  182.      60.7      3     9
 7 6,014,837  2001   737   66.4     66.4      1    15
 8 6,034,050  1996   370  163.      54.3      3     4
 9 6,056,589  1996   378  189.      63.1      3     7
10 6008710    1999   458  115.      57.5      2    14
# ... with 53 more rows

CodePudding user response:

We can do this in one line with data.table, grouping by PVC and getting either the sum of Volume or Average. I'm assuming for Year, Clutch, and Edat all of the same PVC rows have the same values so we can use max or min there.

library(data.table)
dt <- data.table(COMPLETED_DATASET_allsizes)
dt1 <- dt[,.(totalV = sum(Volume), averageV = mean(Volume), Year = max(Year), Clutch = max(Clutch), Edat = max(Edat)), by = PVC]

> head(dt1)
         PVC    totalV averageV Year Clutch Edat
1: 6,056,589 189.35847 63.11949 1996      3    7
2: 6,013,925 182.05706 60.68569 1996      3    9
3: 6,034,050 162.82006 54.27335 1996      3    4
4:   6070852 167.52145 55.84048 1997      3    6
5:   6070862  58.49927 58.49927 1998      1    7
6:   6014535  57.34171 57.34171 1998      1    2

CodePudding user response:

Dplyr solution. I grouped by ID, PVC, Year, Clutch and Edat Since those were in your new data.frame. I assume ID is the identifier for this combination?

#group by and summarise
dt %>% group_by(ID, PVC, Year, Clutch, Edat) %>% 
  summarise(averageV = mean((Volume)),
            totalV = sum(Volume)) %>% ungroup() |> head()

     ID PVC        Year Clutch  Edat averageV totalV
  <int> <chr>     <int>  <int> <int>    <dbl>  <dbl>
1   362 6,013,925  1996      3     9     60.7  182. 
2   370 6,034,050  1996      3     4     54.3  163. 
3   378 6,056,589  1996      3     7     63.1  189. 
4   383 6070852    1997      3     6     55.8  168. 
5   416 6014326    1998      1     1     54.9   54.9
6   426 6014535    1998      1     2     57.3   57.3
  •  Tags:  
  • r
  • Related