Drop rows in a data frame that are in-between two integer values in R-CodePudding

I have a data frame recording one participant's behaviour in an episodic task. Each episode starts with the trigger 90 and finishes with a trigger somewhere in the 40s. Below is a sample data frame: one column holds the row number and the other holds the actual triggers.

# Sample data: ex1 is a running row counter, ex2 holds the recorded triggers
# (90 opens an episode, a value in the 40s closes it).
ex1 <- as.numeric(seq_len(20))
ex2 <- c(
  41, 1, 1, 90, 1, 1, 1, 44, 1, 90,
  1, 2, 42, 1, 1, 1, 1, 90, 1, 41
)
df <- data.frame(ex1 = ex1, ex2 = ex2)

> df
   ex1 ex2
1    1  41
2    2   1
3    3   1
4    4  90
5    5   1
6    6   1
7    7   1
8    8  44
9    9   1
10  10  90
11  11   1
12  12   2
13  13  42
14  14   1
15  15   1
16  16   1
17  17   1
18  18  90
19  19   1
20  20  41

Now, what I am trying to do is remove all the rows that are outside the beginning and the end of the episode, as they are recordings of typed behaviour that is not interesting as it falls outside of the episode. Therefore, I want to end up with a dataframe like this:

# Desired result: only the rows that belong to an episode remain.
ex1 <- c(1, 4, 5, 6, 7, 8, 10, 11, 12, 13, 18, 19, 20)
ex2 <- c(41, 90, 1, 1, 1, 44, 90, 1, 2, 42, 90, 1, 41)
df <- data.frame(ex1 = ex1, ex2 = ex2)

> df
   ex1 ex2
1    1  41
2    4  90
3    5   1
4    6   1
5    7   1
6    8  44
7   10  90
8   11   1
9   12   2
10  13  42
11  18  90
12  19   1
13  20  41

I have been trying to use `subset()`, but I cannot make it work when one boundary is a single value (90) and the other is a range (the 40s).

Thanks in advance!

CodePudding user response：

Setting the values:

# Rebuild the sample data from the question: ex1 is the row counter,
# ex2 the recorded trigger codes.
ex1 <- as.numeric(seq_len(20))
ex2 <- c(
  41, 1, 1, 90, 1, 1, 1, 44, 1, 90,
  1, 2, 42, 1, 1, 1, 1, 90, 1, 41
)
before <- data.frame(ex1 = ex1, ex2 = ex2)
before
   ex1 ex2
1    1  41
2    2   1
3    3   1
4    4  90
5    5   1
6    6   1
7    7   1
8    8  44
9    9   1
10  10  90
11  11   1
12  12   2
13  13  42
14  14   1
15  15   1
16  16   1
17  17   1
18  18  90
19  19   1
20  20  41

I have built a function that should do the job. It is based on my understanding of your problem, so it may not fit your setting perfectly; however, I believe you can get there by adjusting the function a little to suit your needs.

library(dplyr)
#' Keep only the rows that fall inside an episode
#'
#' An episode opens on a row whose `ex2` equals `start` and closes on the
#' first later row whose `ex2` lies in `end` to `end + 10` (inclusive).
#' Rows between an opening and its closing trigger (both included) are kept.
#' Closing triggers that appear before the first opening trigger are kept as
#' well, since they terminate an episode that began before the recording.
#' An episode that is never closed is kept through the last row of `data`.
#'
#' @param start Trigger value that opens an episode (default 90).
#' @param end Lower bound of the closing-trigger range (default 40, i.e.
#'   closing triggers are 40..50).
#' @param data A data frame with a column `ex1` (used to sort the result)
#'   and a numeric column `ex2` holding the trigger codes.
#' @return A data frame with the retained rows of `data`, sorted by `ex1`,
#'   with row names renumbered from 1.
episode <- function(start = 90, end = 40, data) {
  # Row positions of closing triggers (end .. end + 10) and opening triggers.
  end_idx <- which(data$ex2 >= end & data$ex2 <= end + 10)
  start_idx <- which(data$ex2 == start)

  # A logical mask (rather than rbind-ing slices) guarantees that a row is
  # returned at most once even if two starts share the same closing trigger.
  keep <- logical(nrow(data))

  # seq_along() skips the loop entirely when there is no opening trigger.
  for (i in seq_along(start_idx)) {
    later_ends <- end_idx[end_idx > start_idx[i]]
    # which() returns ascending positions, so the first element is the
    # nearest closing trigger; fall back to the last row if there is none.
    last_row <- if (length(later_ends) > 0) later_ends[1] else nrow(data)
    keep[start_idx[i]:last_row] <- TRUE
  }

  # Closing triggers recorded before the first opening trigger are retained.
  first_start <- if (length(start_idx) > 0) start_idx[1] else Inf
  keep[end_idx[end_idx < first_start]] <- TRUE

  clean.df <- data[keep, , drop = FALSE]

  # Sort first, then renumber, so the row names match the final order.
  clean.df <- clean.df[order(clean.df$ex1), , drop = FALSE]
  rownames(clean.df) <- NULL

  clean.df
}

Generating the after data set by using the episode function.

# Filter the sample data down to the rows inside an episode
# (opening trigger 90, closing triggers starting at 40).
after <- episode(data = before, start = 90, end = 40)
after
   ex1 ex2
1    1  41
2    4  90
3    5   1
4    6   1
5    7   1
6    8  44
7   10  90
8   11   1
9   12   2
10  13  42
11  18  90
12  19   1
13  20  41

CodePudding user response：

And base:

# Sample data from the question: ex1 is the row counter, ex2 the triggers.
ex1 <- as.numeric(seq_len(20))
ex2 <- c(
  41, 1, 1, 90, 1, 1, 1, 44, 1, 90,
  1, 2, 42, 1, 1, 1, 1, 90, 1, 41
)
df <- data.frame(ex1 = ex1, ex2 = ex2)

Index the start of each series (90). If the first start is not in row 1, drop the rows before it, since they belong to an incomplete episode:

# Positions of every episode start (trigger 90).
start_idx <- with(df, which(ex2 == 90))
# Keep everything from the first start onwards; earlier rows belong to an
# episode whose start was not recorded.
df <- df[seq(start_idx[1], nrow(df)), ]

Re-index the starts, and index the ends (values >= 40 and < 90):

# Positions are recomputed because trimming df shifted the row positions.
# Starts are the rows with trigger 90; ends are triggers in [40, 90).
start_idx <- with(df, which(ex2 == 90))
end_idx <- with(df, which(ex2 >= 40 & ex2 < 90))

make an empty list and for loop through, subsetting out start:end sections

# One list slot per episode start, allocated up front.
df_lst <- vector("list", length(start_idx))
# seq_along() gives an empty sequence when there are no starts, whereas
# 1:length(x) would iterate over c(1, 0).
for (k in seq_along(start_idx)) {
  # Close each episode at the first end trigger after its own start, so a
  # stray end trigger elsewhere cannot shift the start/end pairing.
  next_end <- end_idx[end_idx > start_idx[k]][1]
  df_lst[[k]] <- df[start_idx[k]:next_end, ]
}

bring them all together

# Stack the per-episode slices back into a single data frame; do.call()
# passes each list element to rbind() as a separate argument.
df2 <- do.call("rbind", df_lst)
df2
   ex1 ex2
4    4  90
5    5   1
6    6   1
7    7   1
8    8  44
10  10  90
11  11   1
12  12   2
13  13  42
18  18  90
19  19   1
20  20  41

Fairly compact.