transformation of dataframe applied on 3 variables-CodePudding

I wrote several commands to transform a dataframe but i would like to simplify the code that I wrote in four parts. Part 1,2 and 3 are to make calculation of column 1, 2 and 3 (count the number of time a value is repeated for each column and complete for missing number comprised between 0 and the max of value of the three column). The fourth part is to join the previous output.

I would like to simplify it in order to make the transformation of the 3 column in one block of code instead of 4. Is it possible to do it without using function ?

Thank you in advance.

set.seed(1234)

# Data

A=sample(0:10, 20, replace = TRUE)
B=sample(0:10, 20, replace = TRUE)
C=sample(0:10, 20, replace = TRUE)

df=data.frame(A,B,C)

  A  B  C
1  9  2  0
2  5  3  5
3  4  9  7
4  8  4  2
5  4  1  5
6  5  7  0
7  3 10  0
8  1  3  8
9  6  2  7
10 5  6  9
11 9  8  0
12 5  2 10
13 3  5  7
14 7  3  9
15 3  7  5
16 3  9  2
17 4 10  8
18 7  1  2
19 3  4  5
20 7  5  8

# Count for A
df2=data.frame(A=0:max(max(df$A),max(df$B),max(df$C)))

df3_A= df %>%
  select(A) %>%
  group_by(A) %>%
  mutate(A_number= n()) %>%
  distinct(A_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete (df2)

df3_A$A_number[is.na(df3_A$A_number)]=0

# Count for B

df2=data.frame(B=0:max(max(df$A),max(df$B),max(df$C)))

df3_B= df %>%
  select(B) %>%
  group_by(B) %>%
  mutate(B_number= n()) %>%
  distinct(B_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete (df2)

df3_B$B_number[is.na(df3_B$B_number)]=0

# Count for C

df2=data.frame(C=0:max(max(df$A),max(df$B),max(df$C)))

df3_C= df %>%
  select(C) %>%
  group_by(C) %>%
  mutate(C_number= n()) %>%
  distinct(C_number, .keep_all = TRUE) %>%
  ungroup() %>%
  complete (df2)

df3_C$C_number[is.na(df3_C$C_number)]=0

# Join

df3= df3_A %>%
  left_join(df3_B, by=c("A"="B")) %>%
  left_join(df3_C, by=c("A"="C"))

A A_number B_number C_number
   <int>    <dbl>    <dbl>    <dbl>
 1     0        0        0        4
 2     1        1        2        0
 3     2        0        3        3
 4     3        5        3        0
 5     4        3        2        0
 6     5        4        2        4
 7     6        1        1        0
 8     7        3        2        3
 9     8        1        1        3
10     9        2        2        2
11    10        0        2        1

CodePudding user response：

Using base: stack and table:

as.data.frame.matrix(table(stack(df)))
#    A B C
# 0  0 0 4
# 1  1 2 0
# 2  0 3 3
# 3  5 3 0
# 4  3 2 0
# 5  4 2 4
# 6  1 1 0
# 7  3 2 3
# 8  1 1 3
# 9  2 2 2
# 10 0 2 1

CodePudding user response：

You can reshape to long, count the values by variables, then reshape back to wide filling missings with zero:

library(dplyr)
library(tidyr)

df %>% 
  pivot_longer(cols = everything()) %>%
  count(name, value) %>%
  pivot_wider(values_from = n, values_fill = 0) %>%
  arrange(value)

# A tibble: 11 × 4
   value     A     B     C
   <int> <int> <int> <int>
 1     0     0     0     4
 2     1     1     2     0
 3     2     0     3     3
 4     3     5     3     0
 5     4     3     2     0
 6     5     4     2     4
 7     6     1     1     0
 8     7     3     2     3
 9     8     1     1     3
10     9     2     2     2
11    10     0     2     1

CodePudding user response：

You can use vctrs::vec_count over the columns and then merge the data.frames altogether:

library(dplyr)
library(purrr)
df %>% 
  mutate(across(A:C, factor, levels = 0:10, ordered = TRUE)) %>% 
  map(vctrs::vec_count) %>% 
  imap(~ {name <- paste0("count", .y) %>%
  rename_with(.x, ~ name, count)}) %>% 
  reduce(full_join, by = "key") %>% 
  replace(is.na(.), 0) %>% 
  arrange(key)

output

   key countA countB countC
1    0      0      0      4
2    1      1      2      0
3    2      0      3      3
4    3      5      3      0
5    4      3      2      0
6    5      4      2      4
7    6      1      1      0
8    7      3      2      3
9    8      1      1      3
10   9      2      2      2
11  10      0      2      1