I wrote several commands to transform a dataframe but i would like to simplify the code that I wrote in four parts. Part 1,2 and 3 are to make calculation of column 1, 2 and 3 (count the number of time a value is repeated for each column and complete for missing number comprised between 0 and the max of value of the three column). The fourth part is to join the previous output.
I would like to simplify it in order to make the transformation of the 3 column in one block of code instead of 4. Is it possible to do it without using function ?
Thank you in advance.
set.seed(1234)
# Data
A=sample(0:10, 20, replace = TRUE)
B=sample(0:10, 20, replace = TRUE)
C=sample(0:10, 20, replace = TRUE)
df=data.frame(A,B,C)
A B C
1 9 2 0
2 5 3 5
3 4 9 7
4 8 4 2
5 4 1 5
6 5 7 0
7 3 10 0
8 1 3 8
9 6 2 7
10 5 6 9
11 9 8 0
12 5 2 10
13 3 5 7
14 7 3 9
15 3 7 5
16 3 9 2
17 4 10 8
18 7 1 2
19 3 4 5
20 7 5 8
# Count for A
df2=data.frame(A=0:max(max(df$A),max(df$B),max(df$C)))
df3_A= df %>%
select(A) %>%
group_by(A) %>%
mutate(A_number= n()) %>%
distinct(A_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_A$A_number[is.na(df3_A$A_number)]=0
# Count for B
df2=data.frame(B=0:max(max(df$A),max(df$B),max(df$C)))
df3_B= df %>%
select(B) %>%
group_by(B) %>%
mutate(B_number= n()) %>%
distinct(B_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_B$B_number[is.na(df3_B$B_number)]=0
# Count for C
df2=data.frame(C=0:max(max(df$A),max(df$B),max(df$C)))
df3_C= df %>%
select(C) %>%
group_by(C) %>%
mutate(C_number= n()) %>%
distinct(C_number, .keep_all = TRUE) %>%
ungroup() %>%
complete (df2)
df3_C$C_number[is.na(df3_C$C_number)]=0
# Join
df3= df3_A %>%
left_join(df3_B, by=c("A"="B")) %>%
left_join(df3_C, by=c("A"="C"))
A A_number B_number C_number
<int> <dbl> <dbl> <dbl>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
CodePudding user response:
Using base: stack and table:
as.data.frame.matrix(table(stack(df)))
# A B C
# 0 0 0 4
# 1 1 2 0
# 2 0 3 3
# 3 5 3 0
# 4 3 2 0
# 5 4 2 4
# 6 1 1 0
# 7 3 2 3
# 8 1 1 3
# 9 2 2 2
# 10 0 2 1
CodePudding user response:
You can reshape to long, count the values by variables, then reshape back to wide filling missings with zero:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything()) %>%
count(name, value) %>%
pivot_wider(values_from = n, values_fill = 0) %>%
arrange(value)
# A tibble: 11 × 4
value A B C
<int> <int> <int> <int>
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1
CodePudding user response:
You can use vctrs::vec_count
over the columns and then merge the data.frames altogether:
library(dplyr)
library(purrr)
df %>%
mutate(across(A:C, factor, levels = 0:10, ordered = TRUE)) %>%
map(vctrs::vec_count) %>%
imap(~ {name <- paste0("count", .y) %>%
rename_with(.x, ~ name, count)}) %>%
reduce(full_join, by = "key") %>%
replace(is.na(.), 0) %>%
arrange(key)
output
key countA countB countC
1 0 0 0 4
2 1 1 2 0
3 2 0 3 3
4 3 5 3 0
5 4 3 2 0
6 5 4 2 4
7 6 1 1 0
8 7 3 2 3
9 8 1 1 3
10 9 2 2 2
11 10 0 2 1