I have a data frame recording a participant's behaviour in an episodic task. An episode starts with the trigger 90 and finishes with a trigger somewhere in the 40s (for example 41, 42 or 44). Below is a sample data frame with one column holding the row number and another holding the actual triggers.
# Row counter: one entry per recorded sample (1 through 20), stored as double.
ex1 <- as.numeric(seq_len(20))
# Trigger codes recorded for each sample.
ex2 <- c(
  41, 1, 1, 90, 1, 1, 1, 44, 1, 90,
  1, 2, 42, 1, 1, 1, 1, 90, 1, 41
)
# Combine both vectors into the sample data frame.
df <- data.frame(ex1 = ex1, ex2 = ex2)
> df
ex1 ex2
1 1 41
2 2 1
3 3 1
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
9 9 1
10 10 90
11 11 1
12 12 2
13 13 42
14 14 1
15 15 1
16 16 1
17 17 1
18 18 90
19 19 1
20 20 41
Now, what I am trying to do is remove all the rows that fall outside an episode, i.e. outside the span from a start trigger to the end trigger that follows it, because they record typed behaviour that is not of interest. Therefore, I want to end up with a data frame like this:
# Row numbers that should survive the clean-up (the leading 1 keeps this double).
ex1 <- c(1, 4:8, 10:13, 18:20)
# Trigger codes of the surviving rows.
ex2 <- c(
  41,
  90, 1, 1, 1, 44,
  90, 1, 2, 42,
  90, 1, 41
)
# The desired result as a data frame.
df <- data.frame(ex1 = ex1, ex2 = ex2)
> df
ex1 ex2
1 1 41
2 4 90
3 5 1
4 6 1
5 7 1
6 8 44
7 10 90
8 11 1
9 12 2
10 13 42
11 18 90
12 19 1
13 20 41
I have been trying to use `subset()`, but I cannot make it work for rows lying between a single number (the start trigger) and a range of values (the end trigger).
Thanks in advance!
CodePudding user response:
Setting the values:
# Row counter for the 20 samples, stored as double.
ex1 <- as.numeric(1:20)
# Trigger code recorded for each sample.
ex2 <- c(
  41, 1, 1, 90, 1,
  1, 1, 44, 1, 90,
  1, 2, 42, 1, 1,
  1, 1, 90, 1, 41
)
# The untouched input data.
before <- data.frame(ex1 = ex1, ex2 = ex2)
before
ex1 ex2
1 1 41
2 2 1
3 3 1
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
9 9 1
10 10 90
11 11 1
12 12 2
13 13 42
14 14 1
15 15 1
16 16 1
17 17 1
18 18 90
19 19 1
20 20 41
I have built a function that should do the work. It is based on my understanding of your problem, so it may not fit your setting perfectly. However, I believe you can complete your task by adjusting the function slightly to your needs.
library(dplyr)
#' Keep only the rows of `data` that belong to an episode.
#'
#' An episode opens on a row whose `ex2` equals `start` and closes on the first
#' later row whose `ex2` lies in the closed range `[end, end + 10]`.
#'
#' @param start Trigger value that opens an episode (default 90).
#' @param end Lower bound of the end-trigger range (default 40); any value in
#'   `[end, end + 10]` counts as an end trigger.
#' @param data A data frame with a trigger column `ex2` and a column `ex1`
#'   that the result is sorted by.
#' @return A data frame holding the rows of every episode (start and end rows
#'   included), plus any end-trigger rows that occur before the first start,
#'   sorted by `ex1` with row names reset to 1..n. Each input row appears at
#'   most once. A data frame with zero rows is returned when nothing matches.
episode <- function(start = 90, end = 40, data) {
  triggers <- data$ex2
  n_rows <- nrow(data)

  # Row indices of the start triggers; which() returns them in ascending order.
  start_idx <- which(triggers == start)
  # Row indices of the end triggers, i.e. values in [end, end + 10].
  end_idx <- which(triggers >= end & triggers <= end + 10)

  # Collect rows in a logical mask so that overlapping episodes (a second start
  # before the first one is closed) can never duplicate a row.
  keep <- logical(n_rows)

  # End triggers seen before the first start are kept as single rows. With no
  # start at all, every end trigger counts as "before the first start".
  first_start <- if (length(start_idx) > 0) start_idx[1] else Inf
  keep[end_idx[end_idx < first_start]] <- TRUE

  for (s in start_idx) {
    later_ends <- end_idx[end_idx > s]
    # An episode without a closing trigger runs to the last recorded row
    # instead of failing on an empty min().
    stop_at <- if (length(later_ends) > 0) later_ends[1] else n_rows
    keep[s:stop_at] <- TRUE
  }

  clean.df <- data[keep, , drop = FALSE]
  # Sort by ex1; order() is stable for ties, so tied rows keep their input order.
  clean.df <- clean.df[order(clean.df$ex1), , drop = FALSE]
  # Reset the row names to 1..n.
  rownames(clean.df) <- NULL
  clean.df
}
Generating the `after` data set by using the `episode` function:
# Clean the sample data: 90 opens an episode, 40 is the base of the end range.
after <- episode(data = before, start = 90, end = 40)
after
ex1 ex2
1 1 41
2 4 90
3 5 1
4 6 1
5 7 1
6 8 44
7 10 90
8 11 1
9 12 2
10 13 42
11 18 90
12 19 1
13 20 41
CodePudding user response:
And a base R solution:
# Sample data: ex1 is the row counter, ex2 holds the trigger codes.
ex1 <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
ex2 <- c(41, 1, 1, 90, 1, 1, 1, 44, 1, 90, 1, 2, 42, 1, 1, 1, 1, 90, 1, 41)
df <- data.frame(ex1, ex2)

# Index the start of each series (trigger 90). If the first start is not in
# row 1, the rows before it belong to an incomplete series, so subset them out.
start_idx <- which(df$ex2 == 90)
if (length(start_idx) == 0) {
  stop("no episode start (90) found in df$ex2", call. = FALSE)
}
df <- df[start_idx[1]:nrow(df), ]

# Re-index the starts on the trimmed data and index the ends (>= 40 & < 90).
start_idx <- which(df$ex2 == 90)
end_idx <- which(df$ex2 >= 40 & df$ex2 < 90)

# Build one start:end section per start. Each start is paired with the first
# end that follows it, not with the end in the same position, so a stray
# end-range value cannot shift the pairing.
df_lst <- lapply(start_idx, function(s) {
  later_ends <- end_idx[end_idx > s]
  if (length(later_ends) == 0) {
    # No closing trigger: the series is incomplete, contribute no rows.
    return(df[0, ])
  }
  df[s:later_ends[1], ]
})

# Bring them all together; the original row names are preserved.
df2 <- do.call("rbind", df_lst)
df2
ex1 ex2
4 4 90
5 5 1
6 6 1
7 7 1
8 8 44
10 10 90
11 11 1
12 12 2
13 13 42
18 18 90
19 19 1
20 20 41
Fairly compact.