Home > Mobile >  subset dataset into 2 smaller ones while keeping common units
subset dataset into 2 smaller ones while keeping common units

Time:07-21

How can I subset the following dataset based on the fact that Numeric_variable includes 2 for some units (e.g. A,C), and 3 for some other units (e.g. B,D)? I would like to keep all units that have 1,1,1,1,1,1 (e.g. E,F) in each group. Data provided below.

Unit | Numeric_variable
A    | 1
A    | 1
A    | 2
A    | 2
A    | 2
A    | 2
C    | 1
C    | 1
C    | 2
C    | 2
C    | 2
C    | 2
B    | 1
B    | 1
B    | 3
B    | 3
B    | 3
B    | 3
D    | 1
D    | 1
D    | 3
D    | 3
D    | 3
D    | 3
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1

Into:

df1:

Unit | Numeric_variable
A    | 1
A    | 1
A    | 2
A    | 2
A    | 2
A    | 2
C    | 1
C    | 1
C    | 2
C    | 2
C    | 2
C    | 2
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1

df2:

Unit | Numeric_variable
B    | 1
B    | 1
B    | 3
B    | 3
B    | 3
B    | 3
D    | 1
D    | 1
D    | 3
D    | 3
D    | 3
D    | 3
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
E    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1
F    | 1

data

Unit <- c("A", "A", "A", "A", "A", "A", "C", "C", 
"C", "C", "C", "C", "B", "B", "B", "B", "B", "B", "D", "D", "D", 
"D", "D", "D","E", "E", "E", "E", "E", "E","F", "F", "F", "F", "F", "F")
Numeric_variable <- c(1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 3, 3, 3, 3L, 1, 1, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
df <- data.frame(Unit, Numeric_variable)

CodePudding user response:

We could this almost with the same code. Just and | is.na(x) to filter:

library(dplyr)
library(tidyr)

df_all <- df %>% 
  mutate(x = case_when(Numeric_variable==2 ~ 2,
                       Numeric_variable==3 ~ 3,
                       TRUE ~ NA_real_)) %>% 
  fill(x, .direction = "up") 

df_2 <- df_all %>% 
  filter(x == 2 | is.na(x)) %>% 
  select(-x)

df_3 <- df_all %>% 
  filter(x == 3 | is.na(x)) %>% 
  select(-x)
df_2 
   Unit Numeric_variable
1     A                1
2     A                1
3     A                2
4     A                2
5     A                2
6     A                2
7     C                1
8     C                1
9     C                2
10    C                2
11    C                2
12    C                2
13    E                1
14    E                1
15    E                1
16    E                1
17    E                1
18    E                1
19    F                1
20    F                1
21    F                1
22    F                1
23    F                1
24    F                1

> df_3
   Unit Numeric_variable
1     B                1
2     B                1
3     B                3
4     B                3
5     B                3
6     B                3
7     D                1
8     D                1
9     D                3
10    D                3
11    D                3
12    D                3
13    E                1
14    E                1
15    E                1
16    E                1
17    E                1
18    E                1
19    F                1
20    F                1
21    F                1
22    F                1
23    F                1
24    F                1

CodePudding user response:

We could use subset - create a logical vector where the 'Numeric_variable' is 2 (or 3 for df2), subset the 'Unit' based on the logical vector, concatenate with 'E', 'F' and create another logical vector with %in% on the original 'Unit' column

df1 <- subset(df, Unit %in% c(Unit[Numeric_variable == 2], c("E", "F")))
df2 <- subset(df, Unit %in% c(Unit[Numeric_variable == 3], c("E", "F")))

-output

> df1
   Unit Numeric_variable
1     A                1
2     A                1
3     A                2
4     A                2
5     A                2
6     A                2
7     C                1
8     C                1
9     C                2
10    C                2
11    C                2
12    C                2
25    E                1
26    E                1
27    E                1
28    E                1
29    E                1
30    E                1
31    F                1
32    F                1
33    F                1
34    F                1
35    F                1
36    F                1
> df2
   Unit Numeric_variable
13    B                1
14    B                1
15    B                3
16    B                3
17    B                3
18    B                3
19    D                1
20    D                1
21    D                3
22    D                3
23    D                3
24    D                3
25    E                1
26    E                1
27    E                1
28    E                1
29    E                1
30    E                1
31    F                1
32    F                1
33    F                1
34    F                1
35    F                1
36    F                1

If the 'E', 'F' cases needs to be determined programmatically

df1 <- subset(df, !Unit %in% Unit[Numeric_variable != 1]|
     Unit %in% Unit[Numeric_variable == 2])
df2 <- subset(df, !Unit %in% Unit[Numeric_variable != 1]|
     Unit %in% Unit[Numeric_variable == 3])

-output

> df1
   Unit Numeric_variable
1     A                1
2     A                1
3     A                2
4     A                2
5     A                2
6     A                2
7     C                1
8     C                1
9     C                2
10    C                2
11    C                2
12    C                2
25    E                1
26    E                1
27    E                1
28    E                1
29    E                1
30    E                1
31    F                1
32    F                1
33    F                1
34    F                1
35    F                1
36    F                1
> df2
   Unit Numeric_variable
13    B                1
14    B                1
15    B                3
16    B                3
17    B                3
18    B                3
19    D                1
20    D                1
21    D                3
22    D                3
23    D                3
24    D                3
25    E                1
26    E                1
27    E                1
28    E                1
29    E                1
30    E                1
31    F                1
32    F                1
33    F                1
34    F                1
35    F                1
36    F                1

CodePudding user response:

With data.table:

library(data.table)

dt <- data.table(
  Unit = rep(LETTERS[c(1,3,2,4:6)], each = 6),
  Numeric_variable = c(1,1,2,2,2,2,1,1,2,2,2,2,1,1,3,3,3,3,1,1,3,3,3,3,1,1,1,1,1,1,1,1,1,1,1,1)
)

dt1 <- dt[, .(Numeric_variable = if(any(Numeric_variable == 3)) Numeric_variable[0] else Numeric_variable), Unit]
dt2 <- dt[, .(Numeric_variable = if(any(Numeric_variable == 2)) Numeric_variable[0] else Numeric_variable), Unit]

dt1
#>     Unit Numeric_variable
#>  1:    A                1
#>  2:    A                1
#>  3:    A                2
#>  4:    A                2
#>  5:    A                2
#>  6:    A                2
#>  7:    C                1
#>  8:    C                1
#>  9:    C                2
#> 10:    C                2
#> 11:    C                2
#> 12:    C                2
#> 13:    E                1
#> 14:    E                1
#> 15:    E                1
#> 16:    E                1
#> 17:    E                1
#> 18:    E                1
#> 19:    F                1
#> 20:    F                1
#> 21:    F                1
#> 22:    F                1
#> 23:    F                1
#> 24:    F                1
#>     Unit Numeric_variable
dt2
#>     Unit Numeric_variable
#>  1:    B                1
#>  2:    B                1
#>  3:    B                3
#>  4:    B                3
#>  5:    B                3
#>  6:    B                3
#>  7:    D                1
#>  8:    D                1
#>  9:    D                3
#> 10:    D                3
#> 11:    D                3
#> 12:    D                3
#> 13:    E                1
#> 14:    E                1
#> 15:    E                1
#> 16:    E                1
#> 17:    E                1
#> 18:    E                1
#> 19:    F                1
#> 20:    F                1
#> 21:    F                1
#> 22:    F                1
#> 23:    F                1
#> 24:    F                1
  • Related