I've got a dataframe like the one below (in the actual dataset the number of rows are a few thousands and I 've got more than 300 variables):
df <- data.frame (Gr = c("A","A","A","B","B","B","B","B","B"),
Var1 = c("a","b","c","e","a","a","c","e","b"),
Var2 = c("a","a","a","d","b","b","c","a","e"),
Var3 = c("e","a","b",NA,"a","b","c","d","a"),
Var4 = c("e",NA,"a","e","a","b","d","c",NA))
which returns:
Gr Var1 Var2 Var3 Var4
1 A a a e e
2 A b a a <NA>
3 A c a b a
4 B e d <NA> e
5 B a b a a
6 B a b b b
7 B c c c d
8 B e a d c
9 B b e a <NA>
and would like to get number of occurrences of each value (a,b,c,d,e, and NA) in each variable and in each group. Hence, the output should look something like the following:
df1 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
a = c(1,3,1,1),
b = c(1,0,1,0),
c = c(1,0,0,0),
d = c(0,0,0,0),
e = c(0,0,1,1),
na = c(0,0,0,1))
df2 <- data.frame(Vars = c("Var1","Var2","Var3","Var4"),
a = c(2,1,2,1),
b = c(0,2,1,1),
c = c(1,1,1,1),
d = c(0,1,1,1),
e = c(2,1,0,1),
na = c(0,0,1,1))
output <- list(df1,df2)
names(output) <- c("A","B")
which looks like:
$A
Vars a b c d e na
1 Var1 1 1 1 0 0 0
2 Var2 3 0 0 0 0 0
3 Var3 1 1 0 0 1 0
4 Var4 1 0 0 0 1 1
$B
Vars a b c d e na
1 Var1 2 0 1 0 2 0
2 Var2 1 2 1 1 1 0
3 Var3 2 1 1 1 0 1
4 Var4 1 1 1 1 1 1
I haven't been able to make any considerable progress so far, and a tidyverse solution is preferred.
CodePudding user response:
We may use mtabulate
after split
ing
library(qdapTools)
lapply(split(df[-1], df$Gr), mtabulate)
If we need the na
count as well
lapply(split(replace(df[-1], is.na(df[-1]), "na"), df$Gr), mtabulate)
-output
$A
a b c e na
Var1 1 1 1 0 0
Var2 3 0 0 0 0
Var3 1 1 0 1 0
Var4 1 0 0 1 1
$B
a b c d e na
Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1
Or using tidyverse
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -Gr, names_to = "Vars") %>%
pivot_wider(names_from = value, values_from = value,
values_fn = length, values_fill = 0) %>%
{split(.[-1], .$Gr)}
-output
$A
# A tibble: 4 × 7
Vars a e b `NA` c d
<chr> <int> <int> <int> <int> <int> <int>
1 Var1 1 0 1 0 1 0
2 Var2 3 0 0 0 0 0
3 Var3 1 1 1 0 0 0
4 Var4 1 1 0 1 0 0
$B
# A tibble: 4 × 7
Vars a e b `NA` c d
<chr> <int> <int> <int> <int> <int> <int>
1 Var1 2 2 1 0 1 0
2 Var2 1 1 2 0 1 1
3 Var3 2 0 1 1 1 1
4 Var4 1 1 1 1 1 1
CodePudding user response:
A NA
save base R approach using colSums
val <- sort(unique(unlist(df[-1])), na.last=T)
as.list(lapply(split(df[-1], df$Gr), function(dlist)
data.frame(sapply(val, function(x)
colSums(dlist == x | (is.na(dlist) & is.na(x)), na.rm=T)), check.names=F)))
$A
a b c d e NA
Var1 1 1 1 0 0 0
Var2 3 0 0 0 0 0
Var3 1 1 0 0 1 0
Var4 1 0 0 0 1 1
$B
a b c d e NA
Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1
CodePudding user response:
reshape2::recast(df,Gr variable~value,length,id.var = 'Gr')
Gr variable a b c d e NA
1 A Var1 1 1 1 0 0 0
2 A Var2 3 0 0 0 0 0
3 A Var3 1 1 0 0 1 0
4 A Var4 1 0 0 0 1 1
5 B Var1 2 1 1 0 2 0
6 B Var2 1 2 1 1 1 0
7 B Var3 2 1 1 1 0 1
If you must split them:
split(reshape2::recast(df,Gr variable~value,length,id.var = 'Gr'), ~Gr)
$A
Gr variable a b c d e NA
1 A Var1 1 1 1 0 0 0
2 A Var2 3 0 0 0 0 0
3 A Var3 1 1 0 0 1 0
4 A Var4 1 0 0 0 1 1
$B
Gr variable a b c d e NA
5 B Var1 2 1 1 0 2 0
6 B Var2 1 2 1 1 1 0
7 B Var3 2 1 1 1 0 1
8 B Var4 1 1 1 1 1 1
in base R:
ftable(cbind(df[1], stack(replace(df, is.na(df),'na'), -1)),col.vars = 2)
values a b c d e na
Gr ind
A Var1 1 1 1 0 0 0
Var2 3 0 0 0 0 0
Var3 1 1 0 0 1 0
Var4 1 0 0 0 1 1
B Var1 2 1 1 0 2 0
Var2 1 2 1 1 1 0
Var3 2 1 1 1 0 1
Var4 1 1 1 1 1 1