I am working with the R programming language.
I have this dataset in R:
factor <- c(1,2,3,4,5,6,7,8,9,10)
var_1 <- as.factor(sample(factor, 10000, replace=TRUE, prob=c(0.1,0.1,0.1,0.1,0.1, 0.1,0.1,0.1,0.1,0.1)))
var_2 <- as.factor(sample(factor, 10000, replace=TRUE, prob=c(0.1,0.1,0.1,0.1,0.1, 0.1,0.1,0.1,0.1,0.1)))
var_3 <- as.factor(sample(factor, 10000, replace=TRUE, prob=c(0.1,0.1,0.1,0.1,0.1, 0.1,0.1,0.1,0.1,0.1)))
var_4 <- as.factor(sample(factor, 10000, replace=TRUE, prob=c(0.1,0.1,0.1,0.1,0.1, 0.1,0.1,0.1,0.1,0.1)))
var_5 <- as.factor(sample(factor, 10000, replace=TRUE, prob=c(0.1,0.1,0.1,0.1,0.1, 0.1,0.1,0.1,0.1,0.1)))
my_data = data.frame(var_1, var_2, var_3, var_4, var_5)
head(my_data)
var_1 var_2 var_3 var_4 var_5
1 10 9 1 1 3
2 3 10 7 2 6
3 7 2 6 2 5
4 4 3 9 9 5
5 10 7 2 6 8
6 2 3 9 1 4
I also have another dataset in R containing "conditions":
conditions <- structure(list(iter = 1:5, n_1 = c(9L, 2L, 4L, 1L, 10L), n_2 = c(7L,
6L, 9L, 6L, 6L), n_3 = c(8L, 8L, 5L, 4L, 9L), n_4 = c(5L, 10L,
5L, 2L, 5L), n_5 = c(3L, 6L, 1L, 5L, 1L), cond_1 = list(c(5L,
10L, 8L, 1L, 9L, 2L, 7L, 3L, 4L), 7:8, c(5L, 8L, 3L, 1L), 8L,
c(10L, 2L, 9L, 3L, 5L, 8L, 4L, 6L, 7L, 1L)), cond_2 = list(
c(7L, 2L, 10L, 8L, 6L, 4L, 3L), c(5L, 1L, 7L, 8L, 6L, 10L
), c(5L, 1L, 4L, 3L, 6L, 10L, 7L, 8L, 9L), c(7L, 3L, 2L,
1L, 6L, 4L), c(1L, 9L, 3L, 7L, 8L, 4L)), cond_3 = list(c(9L,
6L, 8L, 4L, 5L, 10L, 3L, 1L), c(8L, 2L, 7L, 5L, 4L, 1L, 9L, 3L
), c(9L, 3L, 10L, 1L, 4L), c(3L, 1L, 2L, 5L), c(8L, 4L, 6L, 7L,
5L, 1L, 3L, 9L, 2L)), cond_4 = list(c(9L, 5L, 6L, 7L, 3L), c(9L,
5L, 3L, 6L, 2L, 1L, 7L, 10L, 8L, 4L), c(8L, 10L, 1L, 7L, 6L),
c(1L, 7L), c(7L, 6L, 5L, 8L, 10L)), cond_5 = list(c(2L, 5L,
9L), c(2L, 1L, 10L, 3L, 6L, 7L), 8L, c(10L, 6L, 2L, 8L, 1L),
6L)), class = "data.frame", row.names = c(NA, -5L))
head(conditions)
iter n_1 n_2 n_3 n_4 n_5 cond_1 cond_2 cond_3 cond_4 cond_5
1 1 9 7 8 5 3 5, 10, 8, 1, 9, 2, 7, 3, 4 7, 2, 10, 8, 6, 4, 3 9, 6, 8, 4, 5, 10, 3, 1 9, 5, 6, 7, 3 2, 5, 9
2 2 2 6 8 10 6 7, 8 5, 1, 7, 8, 6, 10 8, 2, 7, 5, 4, 1, 9, 3 9, 5, 3, 6, 2, 1, 7, 10, 8, 4 2, 1, 10, 3, 6, 7
3 3 4 9 5 5 1 5, 8, 3, 1 5, 1, 4, 3, 6, 10, 7, 8, 9 9, 3, 10, 1, 4 8, 10, 1, 7, 6 8
4 4 1 6 4 2 5 8 7, 3, 2, 1, 6, 4 3, 1, 2, 5 1, 7 10, 6, 2, 8, 1
5 5 10 6 9 5 1 10, 2, 9, 3, 5, 8, 4, 6, 7, 1 1, 9, 3, 7, 8, 4 8, 4, 6, 7, 5, 1, 3, 9, 2 7, 6, 5, 8, 10 6
Based on these two data sets, I would like to create the 5 following data sets:
data_1 = my_data[which(my_data$var_1 %in% conditions[1,7] & my_data$var_2 %in% conditions[1,8] & my_data$var_3 %in% conditions[1,9] & my_data$var_4 %in% conditions[1,10] & my_data$var_5 %in% conditions[1,11]), ]
data_2 = my_data[which(my_data$var_1 %in% conditions[2,7] & my_data$var_2 %in% conditions[2,8] & my_data$var_3 %in% conditions[2,9] & my_data$var_4 %in% conditions[2,10] & my_data$var_5 %in% conditions[2,11]), ]
data_3 = my_data[which(my_data$var_1 %in% conditions[3,7] & my_data$var_2 %in% conditions[3,8] & my_data$var_3 %in% conditions[3,9] & my_data$var_4 %in% conditions[3,10] & my_data$var_5 %in% conditions[3,11]), ]
data_4 = my_data[which(my_data$var_1 %in% conditions[4,7] & my_data$var_2 %in% conditions[4,8] & my_data$var_3 %in% conditions[4,9] & my_data$var_4 %in% conditions[4,10] & my_data$var_5 %in% conditions[4,11]), ]
data_5 = my_data[which(my_data$var_1 %in% conditions[5,7] & my_data$var_2 %in% conditions[5,8] & my_data$var_3 %in% conditions[5,9] & my_data$var_4 %in% conditions[5,10] & my_data$var_5 %in% conditions[5,11]), ]
My Problem: Each of these datasets is returning an empty dataset:
> data_1
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_2
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_3
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_4
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_5
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
Can someone please show me what I am doing wrong and what can I do to fix this problem?
For each of these datasets (data_1, data_2, data_3, data_4, data_5), I would like to select rows where :
- var_1 in any of the entries of the corresponding row in cond_1 AND
- var_2 in any of the entries of the corresponding row in cond_2 AND
- var_3 in any of the entries of the corresponding row in cond_3 AND
- var_4 in any of the entries of the corresponding row in cond_4 AND
- var_5 in any of the entries of the corresponding row in cond_5
Thanks!
Note 1: These are OR conditions e.g. Based on the random data from this example:
data_1 :
var_1 in (5 OR 10 OR 8 OR 1 OR 9 OR 2 OR 7 OR 3 OR 4) AND var_2 in (7 OR 2 OR 10 OR 8 OR 6 OR 4 OR 3 ) AND var_3 in (9 OR 6 OR 8 OR 4 OR 5 OR 10 OR 3 OR 1 ) AND var_4 in ( 9 OR 5 OR 6 OR 7 OR 3) AND var_5 in ( 2 OR 5 OR 9)
data_2 ... etc.
data_3 ... etc.
data_4 ... etc.
data_5 ... etc.
Note 2: If possible, I would like to keep as much of the code in Base R as possible.
CodePudding user response:
The problem is, that you have lists in your conditions.
class(conditions[1,7])
# [1] "list"
Solution: unlist
.
my_data[my_data$var_1 %in% unlist(conditions[1,7]) &
my_data$var_2 %in% unlist(conditions[1,8]) &
my_data$var_3 %in% unlist(conditions[1,9]) &
my_data$var_4 %in% unlist(conditions[1,10]) &
my_data$var_5 %in% unlist(conditions[1,11]), ]
# var_1 var_2 var_3 var_4 var_5
# 3 9 6 5 3 9
# 6 9 2 6 7 9
# 21 2 10 5 9 5
Consider this small example:
1:5
# [1] 1 2 3 4 5
list(2:4)
# [[1]]
# [1] 2 3 4
unlist(list(2:4))
# [1] 2 3 4
The conditions are one level deeper:
1:5 %in% list(2:4)
# [1] FALSE FALSE FALSE FALSE FALSE
1:5 %in% unlist(list(2:4))
# [1] FALSE TRUE TRUE TRUE FALSE