How to count average from a certain range between two specific values?-CodePudding

I have a list of data frames and would like to count average and sd from a certain range between two values. For example:

          t     value                                                         Z
0  27.21666  0.473294                                                      <NA>
1  27.31775  0.484845 {"type":"M","msg":"FIRST","time":27.2498,"dist":0.410454}
2  27.41881  0.457187                                                      <NA>
3  27.51916  0.429909                                                      <NA>
4  27.62018  0.401957                                                      <NA>
5  27.72040  0.373661                                                      <NA>
6  27.82057  0.346553                                                      <NA>
7  27.92117  0.319149                                                      <NA>
8  28.02310  0.291235                                                      <NA>
9  28.12421  0.264531                                                      <NA>
10 28.22576  0.237448                                                      <NA>
11 28.32797  0.210861                                                      <NA>
12 28.42993  0.183152                                                      <NA>
13 28.53069  0.156354                                                      <NA>
14 28.63218  0.128166                                                      <NA>
15 28.73285  0.100545                                                      <NA>
16 28.83326  0.072625                                                      <NA>
17 28.93670  0.042560                                                      <NA>
18 29.03824  0.012583                                                      <NA>
19 29.13856 -0.018004                                                      <NA>
20 29.23870 -0.049970                                                      <NA>
21 29.33917 -0.081763                                                      <NA>
22 29.44034 -0.113793                                                      <NA>
23 29.54193 -0.148269                                                      <NA>
24 29.64338 -0.182154                                                      <NA>
25 29.74523 -0.216842                                                      <NA>
26 29.84533 -0.252688                                                      <NA>
27 29.94582 -0.289105                                                      <NA>
28 30.04596 -0.325591                                                      <NA>
29 30.14973 -0.365179                                                      <NA>
30 30.24985 -0.403439                                                      <NA>
31 30.35072 -0.444497                                                      <NA>
32 30.45133 -0.485201                                                      <NA>
33 30.55292 -0.528480                                                      <NA>
34 30.65318 -0.572731                                                      <NA>
35 30.75435 -0.619512                                                      <NA>
36 30.85619 -0.667557                                                      <NA>
37 30.95747 -0.717658                                                      <NA>
38 31.06031 -0.769489                                                      <NA>
39 31.16094 -0.823372                                                      <NA>
40 31.26099 -0.876394 {"type":"M","msg":"LAST0","time":31.2458,"dist":0.521487}
41 31.36135 -0.932901                                                      <NA>
42 31.46322 -0.991948                                                      <NA>
43 31.56358 -1.049800                                                      <NA>
44 31.66395 -1.109068                                                      <NA>
45 31.76431 -1.166309                                                      <NA>
46 31.86441 -1.219613                                                      <NA>
47 31.96495 -1.266242                                                      <NA>
48 32.06650 -1.304891                                                      <NA>
49 32.16776 -1.331460                                                      <NA>
50 32.26796 -1.344302                                                      <NA>

I'd like to get the average and sd of the column "value" from all values between the start point (= the point in which the Z column has a value of "FIRST" included) and the endpoint (= the point in which the Z column has a value of "LAST0" included). How could I do this in R? Also, how could I just simply delete/filter out all the rows with "NA" before the value "FIRST" and after the value "LAST0" in the Z column?

As a bonus, how could I do this for all different instances if there were multiple similar sequences of values "FIRST" and "LAST0" introduced in one data frame..?

EDIT: Added a dput() below

> dput (tbl[[1]])
structure(list(t = c(27.216666, 27.317755, 27.418812, 27.519156, 27.620184, 
27.720402, 27.820574, 27.921171, 28.023102, 28.124214, 28.225763, 
28.327967, 28.429934, 28.530693, 28.632181, 28.732851, 28.83326, 
28.936697, 29.038239, 29.138559, 29.238697, 29.339165, 29.440336, 
29.541931, 29.643377, 29.745228, 29.845327, 29.945824, 30.045958, 
30.149729, 30.249853, 30.350716, 30.451326, 30.552917, 30.653179, 
30.754349, 30.856194, 30.957468, 31.06031, 31.16094, 31.260986, 
31.361355, 31.463219, 31.563583, 31.663948, 31.764309, 31.864408, 
31.964945, 32.066498, 32.167763, 32.267956, 32.368561, 32.469727, 
32.569916, 32.669949, 32.773418, 32.874977, 32.976883, 33.078552, 
33.181709, 33.282593, 33.385487, 33.486103, 33.588486, 33.690254, 
33.793388, 33.893703, 33.993759), value = c(0.473294, 0.484845, 0.457187, 
0.429909, 0.401957, 0.373661, 0.346553, 0.319149, 0.291235, 0.264531, 
0.237448, 0.210861, 0.183152, 0.156354, 0.128166, 0.100545, 0.072625, 
0.04256, 0.012583, -0.018004, -0.04997, -0.081763, -0.113793, 
-0.148269, -0.182154, -0.216842, -0.252688, -0.289105, -0.325591, 
-0.365179, -0.403439, -0.444497, -0.485201, -0.52848, -0.572731, 
-0.619512, -0.667557, -0.717658, -0.769489, -0.823372, -0.876394, 
-0.932901, -0.991948, -1.0498, -1.109068, -1.166309, -1.219613, 
-1.266242, -1.304891, -1.33146, -1.344302, -1.343445, -1.328416, 
-1.298611, -1.253834, -1.193182, -1.119025, -1.030697, -0.929913, 
-0.817493, -0.698029, -0.56889, -0.437583, -0.301298, -0.165372, 
-0.028239, 0.102254, 0.22927), Z = c(NA, "{\"type\":\"M\",\"msg\":\"FIRST\",\"time\":27.2498,\"dist\":0.410454}", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, "{\"type\":\"M\",\"msg\":\"LAST0\",\"time\":31.2458,\"dist\":0.521487}", 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
-68L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response：

Consider this dataframe:

          t     value           Z
0  27.21666  0.473294        <NA>
1  27.31775  0.484845 {msg:FIRST}
2  27.41881  0.457187        <NA>
3  27.51916  0.429909        <NA>
4  27.62018  0.401957        <NA>
5  27.72040  0.373661        <NA>
6  27.82057  0.346553        <NA>
7  27.92117  0.319149 {msg:LAST0}
32 30.45133 -0.485201        <NA>
33 30.55292 -0.528480        <NA>
34 30.65318 -0.572731        <NA>
35 30.75435 -0.619512        <NA>
36 30.85619 -0.667557 {msg:FIRST}
37 30.95747 -0.717658        <NA>
38 31.06031 -0.769489        <NA>
39 31.16094 -0.823372        <NA>
40 31.26099 -0.876394 {msg:LAST0}
41 31.36135 -0.932901        <NA>
50 32.26796 -1.344302        <NA>

With tidyverse functions, you can first use cumsum to identify groups between FIRST and LAST0, then filter out rows that are not between those two values, and then use mutate to compute the mean and sd.

library(dplyr)

df %>% 
  mutate(cum = cumsum(grepl("FIRST", Z)   lag(grepl("LAST0", Z), default = 0))) %>% 
  filter(cum %% 2 == 1) %>% 
  group_by(cum = as.numeric(as.factor(cum))) %>% 
  mutate(mean = mean(value),
            sd = sd(value))

# A tibble: 12 x 6
# Groups:   cum [2]
       t  value Z             cum   mean     sd
   <dbl>  <dbl> <chr>       <dbl>  <dbl>  <dbl>
 1  27.3  0.485 {msg:FIRST}     1  0.402 0.0598
 2  27.4  0.457 <NA>            1  0.402 0.0598
 3  27.5  0.430 <NA>            1  0.402 0.0598
 4  27.6  0.402 <NA>            1  0.402 0.0598
 5  27.7  0.374 <NA>            1  0.402 0.0598
 6  27.8  0.347 <NA>            1  0.402 0.0598
 7  27.9  0.319 {msg:LAST0}     1  0.402 0.0598
 8  30.9 -0.668 {msg:FIRST}     2 -0.771 0.0828
 9  31.0 -0.718 <NA>            2 -0.771 0.0828
10  31.1 -0.769 <NA>            2 -0.771 0.0828
11  31.2 -0.823 <NA>            2 -0.771 0.0828
12  31.3 -0.876 {msg:LAST0}     2 -0.771 0.0828

data

structure(list(t = c(27.21666, 27.31775, 27.41881, 27.51916, 
27.62018, 27.7204, 27.82057, 27.92117, 30.45133, 30.55292, 30.65318, 
30.75435, 30.85619, 30.95747, 31.06031, 31.16094, 31.26099, 31.36135, 
32.26796), value = c(0.473294, 0.484845, 0.457187, 0.429909, 
0.401957, 0.373661, 0.346553, 0.319149, -0.485201, -0.52848, 
-0.572731, -0.619512, -0.667557, -0.717658, -0.769489, -0.823372, 
-0.876394, -0.932901, -1.344302), Z = c("<NA>", "{msg:FIRST}", 
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "{msg:LAST0}", "<NA>", 
"<NA>", "<NA>", "<NA>", "{msg:FIRST}", "<NA>", "<NA>", "<NA>", 
"{msg:LAST0}", "<NA>", "<NA>")), class = "data.frame", row.names = c("0", 
"1", "2", "3", "4", "5", "6", "7", "32", "33", "34", "35", "36", 
"37", "38", "39", "40", "41", "50"))