Subset dataframe on time interval determined by pattern-CodePudding

I have this type of data:

# Example data: 17 rows (row names 129-145), each with an actor, an activity
# category, an activity label, and a start and end time.
df <- structure(
  list(
    Line = as.character(129:145),
    Actor = c(
      "R", "R", "R", "R", "R", "B", "R", "B", "B", "B",
      "M", "M", "M", "M", "M", "W", "M"
    ),
    Act_cat = c(
      "SpeechRec", "ver", "SpeechRec", "ges", "ges", "gaze", "ges",
      rep("gaze", 10)
    ),
    Activity = c(
      "hey", "dort drüben die sparrenburg", "schwert", "D-onset", "D-peak",
      "~", "D-retract", "@tum", "~", "@tum", "~", "@tum", "~", "@tum", "~",
      "~", "@tum"
    ),
    Starttime_ms = c(
      46616, 48825, 48865, 49220, 50080, 50730, 50900, 51009, 51191,
      51270, 51486, 51809, 52251, 52333, 53227, 53267, 53429
    ),
    Endtime_ms = c(
      47616, 53035, 49865, 50080, 50900, 51009, 52220, 51191, 51270,
      53474, 51808, 52250, 52332, 53226, 53428, 53524, 53606
    )
  ),
  row.names = 129:145,
  class = "data.frame"
)

What I need to do is slice/filter the subset of rows whose Starttime_ms is >= the Starttime_ms of the row matching the pattern sparrenburg in column Activity and whose Endtime_ms is <= the Endtime_ms of that same row.

I've tried these two subsetting methods but neither works correctly:

library(dplyr)
# Attempt 1 (does not work): tries to build a row range `from:to` for slice().
# Each which() call returns a whole vector of row positions, but `:` uses only
# the first element of each operand (with a warning), so the resulting range
# is not the intended interval.
# Note: str_detect() is a stringr function; it is only available here if
# stringr is attached elsewhere, since library(dplyr) alone does not provide it.
df %>% slice(which(Starttime_ms >= Starttime_ms[str_detect(Activity, "sparrenburg")])
             :
             which(Endtime_ms <= Endtime_ms[str_detect(Activity, "sparrenburg")]))

and:

# Attempt 2 (does not work): between(x, left, right) tests left <= x <= right.
# Here x is the character column Line, and both bounds are logical vectors
# (the results of the >= / <= comparisons) rather than the start and end times
# themselves, so the call does not express the intended interval condition.
df %>% filter(between(Line, 
                      Starttime_ms >= Starttime_ms[str_detect(Activity, "sparrenburg")], 
                      Endtime_ms <= Endtime_ms[str_detect(Activity, "sparrenburg")]))

How can I subset so that the result is this:

130  130     R       ver dort drüben die sparrenburg        48825      53035
131  131     R SpeechRec                     schwert        48865      49865
132  132     R       ges                     D-onset        49220      50080
133  133     R       ges                      D-peak        50080      50900
134  134     B      gaze                           ~        50730      51009
135  135     R       ges                   D-retract        50900      52220
136  136     B      gaze                        @tum        51009      51191
137  137     B      gaze                           ~        51191      51270
138  138     B      gaze                        @tum        51270      53474
139  139     M      gaze                           ~        51486      51808
140  140     M      gaze                        @tum        51809      52250
141  141     M      gaze                           ~        52251      52332
142  142     M      gaze                        @tum        52333      53226

CodePudding user response：

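You were close with the conditions you set up, but you need to provide them to dplyr::filter() connected with the logical and operator & to require both. Because you may have multiple rows that satisfy the condition str_detect(Activity, "sparrenburg"), you can just take the min() and max() to get the most extreme ones for the comparison. Note that this implements the condition exactly as you stated it, so rows 138 and 142 from your desired output are not returned: their Endtime_ms values (53474 and 53226) are greater than the sparrenburg row's Endtime_ms (53035).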

library(tidyverse)

# Reproducible copy of the example data from the question
# (17 rows, row names 129-145).
df <- structure(
  list(
    Line = c(
      "129", "130", "131", "132", "133", "134", "135", "136", "137",
      "138", "139", "140", "141", "142", "143", "144", "145"
    ),
    Actor = c(
      "R", "R", "R", "R", "R", "B", "R", "B", "B", "B",
      "M", "M", "M", "M", "M", "W", "M"
    ),
    Act_cat = c(
      "SpeechRec", "ver", "SpeechRec", "ges", "ges", "gaze", "ges",
      "gaze", "gaze", "gaze", "gaze", "gaze", "gaze", "gaze", "gaze",
      "gaze", "gaze"
    ),
    Activity = c(
      "hey", "dort drüben die sparrenburg", "schwert", "D-onset", "D-peak",
      "~", "D-retract", "@tum", "~", "@tum", "~", "@tum", "~", "@tum", "~",
      "~", "@tum"
    ),
    Starttime_ms = c(
      46616, 48825, 48865, 49220, 50080, 50730, 50900, 51009, 51191,
      51270, 51486, 51809, 52251, 52333, 53227, 53267, 53429
    ),
    Endtime_ms = c(
      47616, 53035, 49865, 50080, 50900, 51009, 52220, 51191, 51270,
      53474, 51808, 52250, 52332, 53226, 53428, 53524, 53606
    )
  ),
  row.names = 129:145,
  class = "data.frame"
)

# Keep only the rows whose [Starttime_ms, Endtime_ms] interval lies entirely
# inside the span covered by the row(s) whose Activity matches "sparrenburg".
df %>%
  filter(
    # Lower bound: the earliest start among the matching rows.
    Starttime_ms >= min(Starttime_ms[str_detect(Activity, "sparrenburg")], na.rm = TRUE) &
      # Upper bound: the latest end among the matching rows.
      Endtime_ms <= max(Endtime_ms[str_detect(Activity, "sparrenburg")], na.rm = TRUE)
  )
#>    Line Actor   Act_cat                    Activity Starttime_ms Endtime_ms
#> 1   130     R       ver dort drüben die sparrenburg        48825      53035
#> 2   131     R SpeechRec                     schwert        48865      49865
#> 3   132     R       ges                     D-onset        49220      50080
#> 4   133     R       ges                      D-peak        50080      50900
#> 5   134     B      gaze                           ~        50730      51009
#> 6   135     R       ges                   D-retract        50900      52220
#> 7   136     B      gaze                        @tum        51009      51191
#> 8   137     B      gaze                           ~        51191      51270
#> 9   139     M      gaze                           ~        51486      51808
#> 10  140     M      gaze                        @tum        51809      52250
#> 11  141     M      gaze                           ~        52251      52332

Created on 2022-02-16 by the reprex package (v2.0.1)

CodePudding user response：

You can do that with base indexing.

# Locate the row(s) whose Activity contains the pattern (matched once, reused).
is_target <- stringr::str_detect(df$Activity, "sparrenburg")

# Interval bounds. min()/max() collapse the bounds to single values, so the
# comparison below is not silently recycled if several rows match the pattern.
starttime <- min(df[["Starttime_ms"]][is_target], na.rm = TRUE)
stoptime <- max(df[["Endtime_ms"]][is_target], na.rm = TRUE)

# Compare plain vectors (`[[`) rather than one-column data frames (`[`), so the
# row index is an ordinary logical vector.
slice_df <- df[
  df[["Starttime_ms"]] >= starttime & df[["Endtime_ms"]] <= stoptime,
]

> slice_df
    Line Actor   Act_cat                    Activity Starttime_ms Endtime_ms
130  130     R       ver dort drüben die sparrenburg        48825      53035
131  131     R SpeechRec                     schwert        48865      49865
132  132     R       ges                     D-onset        49220      50080
133  133     R       ges                      D-peak        50080      50900
134  134     B      gaze                           ~        50730      51009
135  135     R       ges                   D-retract        50900      52220
136  136     B      gaze                        @tum        51009      51191
137  137     B      gaze                           ~        51191      51270
139  139     M      gaze                           ~        51486      51808
140  140     M      gaze                        @tum        51809      52250
141  141     M      gaze                           ~        52251      52332