I have a space separated file, similar to:
# 1_1 AAA ABA AAB BBA
# 1_2 ABA AAA ABB BAA
# 1_3 ABA BAA
# 1_4 AAA BBB ABB
However, the actual entries in each row (let's call them "words") are longer than in this example (around 20 characters each). The number of words per row varies (from fewer than 10 to dozens). I want to create an incidence matrix, using R, where each unique word becomes a column and the row names are kept as row names, like this:
# AAA AAB ABA ABB BAA BBA BBB
# 1_1 1 1 1 0 0 1 0
# 1_2 1 0 1 1 1 0 0
# 1_3 0 0 1 0 1 0 0
# 1_4 1 0 0 1 0 0 1
The column order is not relevant. Any advice is very much appreciated, and both base and package solutions are welcome. Thanks!
CodePudding user response:
tidyverse
library(tidyverse)
# Example input: one row per id, with that row's words spread across
# col_1..col_4. Rows holding fewer than four words are padded with NA.
df <- data.frame(
  id = c("1_1", "1_2", "1_3", "1_4"),
  col_1 = c("AAA", "ABA", "ABA", "AAA"),
  col_2 = c("ABA", "AAA", "BAA", "BBB"),
  col_3 = c("AAB", "ABB", NA, "ABB"),
  col_4 = c("BBA", "BAA", NA, NA),
  stringsAsFactors = FALSE
)
# Build a 0/1 incidence table: one row per id, one column per distinct word.
df %>%
  # Long form: one row per (id, word); the NA padding of short rows is dropped.
  pivot_longer(-id, values_drop_na = TRUE) %>%
  # Wide form: the words themselves become the column names.
  pivot_wider(
    id_cols = id,
    names_from = value,
    values_from = value,
    # Collapse every id/word cell to a presence flag. Using `length` here would
    # report a count (2, 3, ...) whenever a word is repeated within one row,
    # which is not an incidence matrix.
    values_fn = function(x) 1L,
    # id/word pairs that never occur are absent, i.e. 0.
    values_fill = 0
  )
#> # A tibble: 4 x 8
#> id AAA ABA AAB BBA ABB BAA BBB
#> <chr> <int> <int> <int> <int> <int> <int> <int>
#> 1 1_1 1 1 1 1 0 0 0
#> 2 1_2 1 1 0 0 1 1 0
#> 3 1_3 0 1 0 0 0 1 0
#> 4 1_4 1 0 0 0 1 0 1
Created on 2021-12-13 by the reprex package (v2.0.1)
data.table
library(data.table)
library(magrittr)
# Convert `df` to a data.table by reference (modifies `df` itself).
setDT(df)

# Reshape to long form (one row per id/word cell), drop the NA padding, then
# cast back to wide form with one column per distinct word.
melt(df, id.vars = "id") %>%
  na.omit() %>%
  dcast(
    formula = id ~ value,
    # Presence indicator instead of a count, so a word repeated within a row
    # still yields 1. When no `fill` is given, dcast() evaluates this function
    # on a zero-length vector to obtain the fill value for absent id/word
    # pairs, which gives 0 here.
    fun.aggregate = function(x) as.integer(length(x) > 0L)
  )
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1: 1_1 1 1 1 0 0 1 0
#> 2: 1_2 1 0 1 1 1 0 0
#> 3: 1_3 0 0 1 0 1 0 0
#> 4: 1_4 1 0 0 1 0 0 1
Created on 2021-12-13 by the reprex package (v2.0.1)
base
# Long form: stack() lays the selected word columns end to end (values/ind),
# so the ids must be repeated once per stacked column to line up with them.
df_stack <- stack(df, select = -id)
df_stack$id <- rep(df$id, times = ncol(df) - 1)

# Cross-tabulate ids against words; table() leaves out the NA padding.
incidence <- with(df_stack, table(id, values))
# table() counts occurrences, so a word repeated within one row would show up
# as 2 or more. Cap at 1 to get a true 0/1 incidence table (still a `table`).
incidence[incidence > 1L] <- 1L
incidence
#> values
#> id AAA AAB ABA ABB BAA BBA BBB
#> 1_1 1 1 1 0 0 1 0
#> 1_2 1 0 1 1 1 0 0
#> 1_3 0 0 1 0 1 0 0
#> 1_4 1 0 0 1 0 0 1
Created on 2021-12-13 by the reprex package (v2.0.1)
CodePudding user response:
Try the code below
# Label the word columns with the ids, transpose so each id becomes a column,
# then stack() and table() to cross-tabulate words against ids. `> 0` turns the
# counts into presence flags; the leading unary `+` converts TRUE/FALSE to 1/0
# (without it the result is a logical matrix, not the 0/1 matrix shown below).
+(t(table(stack(as.data.frame(t(`row.names<-`(df[-1], df$id)))))) > 0)
values
ind AAA AAB ABA ABB BAA BBA BBB
1_1 1 1 1 0 0 1 0
1_2 1 0 1 1 1 0 0
1_3 0 0 1 0 1 0 0
1_4 1 0 0 1 0 0 1
CodePudding user response:
An option with unite/cSplit_e
library(dplyr)
library(tidyr)
library(splitstackshape)
library(stringr)
# Glue each row's words into a single "_"-separated string (NA padding is
# skipped), expand that string into one indicator column per distinct word,
# then strip the "new_" prefix that cSplit_e() puts on the generated columns.
df %>%
  unite("new", starts_with("col"), na.rm = TRUE) %>%
  cSplit_e(
    split.col = "new",
    sep = "_",
    type = "character",
    fill = 0,
    drop = TRUE
  ) %>%
  rename_with(function(nm) str_remove(nm, "new_"), .cols = -id)
id AAA AAB ABA ABB BAA BBA BBB
1 1_1 1 1 1 0 0 1 0
2 1_2 1 0 1 1 1 0 0
3 1_3 0 0 1 0 1 0 0
4 1_4 1 0 0 1 0 0 1