Home > OS >  R:dplyr summarise data by group with nth() call with variable n calculated during aggregation
R:dplyr summarise data by group with nth() call with variable n calculated during aggregation

Time:11-11

I'm aggregating data with variable bin sizes (see previous question here: R: aggregate every n rows with variable n depending on sum(n) of second column). In addition to calculating sums and means over groups of variable ranges, I need to pull out single-value covariates at the midpoint of each group range. When I try to do this on the fly, I only get a value for the first group and NAs for the remaining.

df.summary<-as.data.frame(df %>% 
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d)-1)%/% 100   1) %>%
  summarise(x=mean(x, na.rm = TRUE), d=sum(d, na.rm=T), ,i.start=first(rn), i.end=last(rn), y=nth(y, round(first(rn) (last(rn)-first(rn))/2-1))))

head(df.summary)
  grp          x         d i.start i.end        y
1   1 0.07458317  88.99342       1     4 19.78992
2   2 0.07594546  97.62130       5     8       NA
3   3 0.05353308 104.69683       9    12       NA
4   4 0.06498291 106.23468      13    16       NA
5   5 0.08601759  98.24939      17    20       NA
6   6 0.06262427  84.43745      21    23       NA

sample data:

structure(list(x = c(0.10000112377193, 0.110742170350877, 0.0300274304561404, 
0.0575619395964912, 0.109060465438596, 0.0595491225614035, 0.0539270264912281, 
0.0812452063859649, 0.0341699389122807, 0.0391744879122807, 0.0411787485614035, 
0.0996091644385965, 0.0970479474912281, 0.0595715843684211, 0.0483489989122807, 
0.0549631194561404, 0.0705080555964912, 0.080437472631579, 0.105883664631579, 
0.0872411613684211, 0.103236660631579, 0.0381296894912281, 0.0465064491578947, 
0.0936565184561403, 0.0410095752631579, 0.0311180032105263, 0.0257758157894737, 
0.0354721928947368, 0.0584999394736842, 0.0241286060175439, 0.112053376666667, 
0.0769823868596491, 0.0558137530526316, 0.0374491000701754, 0.0419279142631579, 
0.0260257506842105, 0.0544360374561404, 0.107411071842105, 0.103873468, 
0.0419322114035088, 0.0483912961052632, 0.0328373653157895, 0.0866868717719298, 
0.063990467245614, 0.0799280314035088, 0.123490407070175, 0.145676836280702, 
0.0292878782807018, 0.0432093036666667, 0.0203547443684211), 
    d = c(22.2483512600033, 22.2483529247042, 22.2483545865809, 
    22.2483562542823, 22.24835791863, 25.1243105415557, 25.1243148759953, 
    25.1243192107884, 25.1243235416981, 25.1243278750792, 27.2240858553058, 
    27.2240943134697, 27.2241027638674, 27.224111222031, 27.2241196741942, 
    24.5623431981188, 24.5623453409221, 24.5623474809012, 24.562349626705, 
    24.5623517696847, 28.1458125837154, 28.1458157376341, 28.1458188889053, 
    28.1458220452951, 28.1458251983314, 27.8293318542146, 27.8293366652115, 
    27.8293414829159, 27.829346292148, 27.8293511094993, 27.5271773325046, 
    27.5271834011289, 27.5271894694002, 27.5271955369655, 27.5272016048837, 
    28.0376097925214, 28.0376146410729, 28.0376194959786, 28.0376243427651, 
    28.0376291969647, 26.8766095768196, 26.8766122563318, 26.8766149309023, 
    26.8766176123562, 26.8766202925746, 27.8736950101666, 27.8736960528853, 
    27.8736971017815, 27.8736981446767, 27.8736991932199), y = c(19.79001, 
    19.789922, 19.789834, 19.789746, 19.789658, 19.78957, 19.789468, 
    19.789366, 19.789264, 19.789162, 19.78906, 19.78896, 19.78886, 
    19.78876, 19.78866, 19.78856, 19.788458, 19.788356, 19.788254, 
    19.788152, 19.78805, 19.787948, 19.787846, 19.787744, 19.787642, 
    19.78754, 19.787442, 19.787344, 19.787246, 19.787148, 19.78705, 
    19.786956, 19.786862, 19.786768, 19.786674, 19.78658, 19.786486, 
    19.786392, 19.786298, 19.786204, 19.78611, 19.786016, 19.785922, 
    19.785828, 19.785734, 19.78564, 19.785544, 19.785448, 19.785352, 
    19.785256)), row.names = c(NA, 50L), class = "data.frame")

CodePudding user response:

Let's add variable z and n in summarise part. Those variables are defined as below.

df %>% 
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d)-1)%/% 100   1) %>%
  summarise(x=mean(x, na.rm = TRUE), 
            d=sum(d, na.rm=T), ,i.start=first(rn), 
            i.end=last(rn), 
            z = round(first(rn) (last(rn)-first(rn))/2-1),
            n = n())

     grp      x     d i.start i.end     z     n
   <dbl>  <dbl> <dbl>   <int> <int> <dbl> <int>
 1     1 0.0746  89.0       1     4     2     4
 2     2 0.0759  97.6       5     8     6     4
 3     3 0.0535 105.        9    12    10     4
 4     4 0.0650 106.       13    16    14     4
 5     5 0.0860  98.2      17    20    18     4
 6     6 0.0626  84.4      21    23    21     3
 7     7 0.0479 112.       24    27    24     4
 8     8 0.0394  83.5      28    30    28     3
 9     9 0.0706 110.       31    34    32     4
10    10 0.0575 112.       35    38    36     4
11    11 0.0647  83.0      39    41    39     3
12    12 0.0659 108.       42    45    42     4
13    13 0.0854 111.       46    49    46     4
14    14 0.0204  27.9      50    50    49     1

In dataframe above, n indicates sample size of each groups separated by grp. However, as you state group_by(grp), when you call nth(y, z), YOU WILL CALL Z-TH VALUE BY GROUP.

It means that for 5th group, although there exists only 4 values, you call 18th value of y. So it prints NA.

To get this easy, the most simple way I think is use n().

df %>% 
  mutate(rn = row_number()) %>%
  group_by(grp = (cumsum(d)-1)%/% 100   1) %>%
  summarise(x=mean(x, na.rm = TRUE), 
            d=sum(d, na.rm=T), ,i.start=first(rn), 
            i.end=last(rn), 
            y=nth(y, round(n()/2)))

     grp      x     d i.start i.end     y
   <dbl>  <dbl> <dbl>   <int> <int> <dbl>
 1     1 0.0746  89.0       1     4  19.8
 2     2 0.0759  97.6       5     8  19.8
 3     3 0.0535 105.        9    12  19.8
 4     4 0.0650 106.       13    16  19.8
 5     5 0.0860  98.2      17    20  19.8
 6     6 0.0626  84.4      21    23  19.8
 7     7 0.0479 112.       24    27  19.8
 8     8 0.0394  83.5      28    30  19.8
 9     9 0.0706 110.       31    34  19.8
10    10 0.0575 112.       35    38  19.8
11    11 0.0647  83.0      39    41  19.8
12    12 0.0659 108.       42    45  19.8
13    13 0.0854 111.       46    49  19.8
14    14 0.0204  27.9      50    50  NA  

You'll call floor(n/2)th y, which means y that locates middle of each group. Note that you can also try floor(n/2) 1.

CodePudding user response:

df %>% 
 mutate(rn = row_number()) %>%
 group_by(grp = (cumsum(d)-1)%/% 100   1) %>%
 summarise(x=mean(x, na.rm = TRUE),
           d = sum(d, na.rm=T),
           i.start=first(rn), 
           i.end=last(rn), 
           y = nth(y, floor(median(rn)) - i.start))
  • Related