I have a column of multiple categories in a comma separated pattern. something Like that
id | categories |
---|---|
1 | A, B, C, D |
2 | A, F, X, G |
3 | B, Y, X, D |
How can I produce two columns with the possible occurrences of two categories, something like that
id | category 1 | category 2 |
---|---|---|
1 | A | B |
1 | A | C |
1 | A | D |
1 | B | C |
1 | B | D |
1 | C | D |
2 | A | F |
2 | A | X |
2 | A | G |
And so on.
Thanks in advance!
CodePudding user response:
You can split and use combn
, i.e.
do.call(rbind, lapply(strsplit(df$categories, ', '), function(i)data.frame(t(combn(i, 2)))))
# X1 X2
#1 A B
#2 A C
#3 A D
#4 B C
#5 B D
#6 C D
#7 A F
#8 A X
#9 A G
#10 F X
#11 F G
#12 X G
#13 B Y
#14 B X
#15 B D
#16 Y X
#17 Y D
#18 X D
CodePudding user response:
Another base R
do.call(
rbind,
lapply(
split(df,df$id),
function(x){
cbind(
x$id,
t(combn(strsplit(x$categories,", ")[[1]],2))
)
}
)
)
[,1] [,2] [,3]
[1,] "1" "A" "B"
[2,] "1" "A" "C"
[3,] "1" "A" "D"
[4,] "1" "B" "C"
[5,] "1" "B" "D"
[6,] "1" "C" "D"
[7,] "2" "A" "F"
[8,] "2" "A" "X"
[9,] "2" "A" "G"
[10,] "2" "F" "X"
[11,] "2" "F" "G"
[12,] "2" "X" "G"
[13,] "3" "B" "Y"
[14,] "3" "B" "X"
[15,] "3" "B" "D"
[16,] "3" "Y" "X"
[17,] "3" "Y" "D"
[18,] "3" "X" "D"
CodePudding user response:
A solution using tidyverse:
- Use
strsplit()
(orstringr::str_split()
to obtain each of the categories from the original data. - Split the data by
id
and then generate a sub-dataframe for that id with each of the possible combinations. - Join the tables back together (this step can conveniently with the same function as step 2 using
purrr::map_df()
).
library(tidyverse)
data %>%
mutate(all = str_split(categories, ", ")) %>%
split(.$id) %>%
map_df(function(df) {
combs = t(combn(unlist(df$all), m = 2))
tibble(id = df$id, cat_1 = combs[, 1], cat_2 = combs[, 2])
})
Output
# A tibble: 18 x 3
id cat_1 cat_2
<dbl> <chr> <chr>
1 1 A B
2 1 A C
3 1 A D
4 1 B C
5 1 B D
6 1 C D
7 2 A F
8 2 A X
9 2 A G
10 2 F X
11 2 F G
12 2 X G
13 3 B Y
14 3 B X
15 3 B D
16 3 Y X
17 3 Y D
18 3 X D
CodePudding user response:
A data.table
option
> setDT(df)[, data.table(t(combn(scan(text = categories, what = "character", sep = ","), 2))), id]
Read 4 items
Read 4 items
Read 4 items
id V1 V2
1: 1 A B
2: 1 A C
3: 1 A D
4: 1 B C
5: 1 B D
6: 1 C D
7: 2 A F
8: 2 A X
9: 2 A G
10: 2 F X
11: 2 F G
12: 2 X G
13: 3 B Y
14: 3 B X
15: 3 B D
16: 3 Y X
17: 3 Y D
18: 3 X D
Or, we can use dplyr
pipeline like below
df %>%
group_by(id) %>%
mutate(categories = list(data.frame(t(combn(unlist(strsplit(categories, ", ")), 2))))) %>%
unnest(categories) %>%
ungroup()
which gives
id X1 X2
<int> <chr> <chr>
1 1 A B
2 1 A C
3 1 A D
4 1 B C
5 1 B D
6 1 C D
7 2 A F
8 2 A X
9 2 A G
10 2 F X
11 2 F G
12 2 X G
13 3 B Y
14 3 B X
15 3 B D
16 3 Y X
17 3 Y D
Data
> dput(df)
structure(list(id = 1:3, categories = c("A, B, C, D", "A, F, X, G",
"B, Y, X, D")), class = "data.frame", row.names = c(NA, -3L))
CodePudding user response:
Using gregexpr
.
z <- dat$categories
t(do.call(cbind, lapply(regmatches(z, gregexpr(z, pa='\\w')), combn, 2)))
# [,1] [,2]
# [1,] "A" "B"
# [2,] "A" "C"
# [3,] "A" "D"
# [4,] "B" "C"
# [5,] "B" "D"
# [6,] "C" "D"
# [7,] "A" "F"
# [8,] "A" "X"
# [9,] "A" "G"
# [10,] "F" "X"
# [11,] "F" "G"
# [12,] "X" "G"
# [13,] "B" "Y"
# [14,] "B" "X"
# [15,] "B" "D"
# [16,] "Y" "X"
# [17,] "Y" "D"
# [18,] "X" "D"
By ID's
do.call(rbind.data.frame, by(dat, dat$categories, \(x) {
z <- x$categories
cbind(id=x$id,
t(do.call(cbind, lapply(regmatches(z, gregexpr(z, pa='\\w')), combn, 2))))
}))
# id X1 X2
# A, B, C, D.1 1 A B
# A, B, C, D.2 1 A C
# A, B, C, D.3 1 A D
# A, B, C, D.4 1 B C
# A, B, C, D.5 1 B D
# A, B, C, D.6 1 C D
# A, F, X, G.1 2 A F
# A, F, X, G.2 2 A X
# A, F, X, G.3 2 A G
# A, F, X, G.4 2 F X
# A, F, X, G.5 2 F G
# A, F, X, G.6 2 X G
# B, Y, X, D.1 3 B Y
# B, Y, X, D.2 3 B X
# B, Y, X, D.3 3 B D
# B, Y, X, D.4 3 Y X
# B, Y, X, D.5 3 Y D
# B, Y, X, D.6 3 X D
Note: "R version 4.1.2 (2021-11-01)"
Data:
dat <- structure(list(id = 1:3, categories = c("A, B, C, D", "A, F, X, G",
"B, Y, X, D")), class = "data.frame", row.names = c(NA, -3L))