Home > Mobile >  subset dataset into 2 smaller ones
subset dataset into 2 smaller ones

Time:07-21

How can I subset the following dataset based on the fact that Numeric_variable includes 2 for some units (e.g. A), and 3 for some other units (e.g. B)?

Unit | Numeric_variable
A    | 1
A    | 1
A    | 2
A    | 2
A    | 2
A    | 2
C    | 1
C    | 1
C    | 2
C    | 2
C    | 2
C    | 2
B    | 1
B    | 1
B    | 3
B    | 3
B    | 3
B    | 3
D    | 1
D    | 1
D    | 3
D    | 3
D    | 3
D    | 3

Into:

df1:

Unit | Numeric_variable
A    | 1
A    | 1
A    | 2
A    | 2
A    | 2
A    | 2
C    | 1
C    | 1
C    | 2
C    | 2
C    | 2
C    | 2

df2:

Unit | Numeric_variable
B    | 1
B    | 1
B    | 3
B    | 3
B    | 3
B    | 3
D    | 1
D    | 1
D    | 3
D    | 3
D    | 3
D    | 3

By doing df1 <- df[which(df$Numeric_variable != 3),], I obtain the below which is not what I would like to get. I would like to get the above df1.

Unit | Numeric_variable
A    | 1
A    | 1
A    | 2
A    | 2
A    | 2
A    | 2
B    | 1
B    | 1
D    | 1
D    | 1

CodePudding user response:

Here is a possible approach:

library(dplyr)
library(tidyr)

df_all <- df %>% 
  mutate(x = case_when(Numeric_variable==2 ~ 2,
                       Numeric_variable==3 ~ 3,
                       TRUE ~ NA_real_)) %>% 
  fill(x, .direction = "up") 

df_2 <- df_all %>% 
  filter(x == 2) %>% 
  select(-x)

df_3 <- df_all %>% 
  filter(x == 3) %>% 
  select(-x)
> df_2
   Unit Numeric_variable
1     A                1
2     A                1
3     A                2
4     A                2
5     A                2
6     A                2
7     C                1
8     C                1
9     C                2
10    C                2
11    C                2
12    C                2
> df_3
   Unit Numeric_variable
1     B                1
2     B                1
3     B                3
4     B                3
5     B                3
6     B                3
7     D                1
8     D                1
9     D                3
10    D                3
11    D                3
12    D                3

CodePudding user response:

We can use split from base R

grp <- (with(df, match(Unit, unique(Unit)))-1) %/% 2   1
lst1 <- split(df, grp)
names(lst1) <- paste0("df", seq_along(lst1))

and then create objects in the global env from the list elements with list2env

list2env(lst1, .GlobalEnv)

Now, check for 'df1' and 'df2' in the global env

> df1
   Unit Numeric_variable
1     A                1
2     A                1
3     A                2
4     A                2
5     A                2
6     A                2
7     C                1
8     C                1
9     C                2
10    C                2
11    C                2
12    C                2
> df2
   Unit Numeric_variable
13    B                1
14    B                1
15    B                3
16    B                3
17    B                3
18    B                3
19    D                1
20    D                1
21    D                3
22    D                3
23    D                3
24    D                3

If it needs at least one 2 and one 3, then subset the data for 2, 3, create a new column 'grp' and then split after mergeing

tmp <- merge(df, transform(unique(subset(df, 
   Numeric_variable %in% 
   2:3)), grp = as.integer(factor(Numeric_variable)))[-2],
     all.x = TRUE, by = 'Unit')
lst1 <- split(tmp[-3], tmp$grp)
names(lst1) <- paste0("df", seq_along(lst1))
list2env(lst1, .GlobalEnv)

data

df <- structure(list(Unit = c("A", "A", "A", "A", "A", "A", "C", "C", 
"C", "C", "C", "C", "B", "B", "B", "B", "B", "B", "D", "D", "D", 
"D", "D", "D"), Numeric_variable = c(1L, 1L, 2L, 2L, 2L, 2L, 
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 
3L, 3L)), class = "data.frame", row.names = c(NA, -24L))
  • Related