Horizontal table summarizing events in r-CodePudding

I have a dataset, where I import data from SQL, that looks like this:

ID  Name  Date
1   John  2021-09-05
1   John  2021-08-14
1   John  2021-08-21
2   Kate  2021-03-04
3   Mary  2021-07-10
3   Mary  2021-06-13
4   Joel  2021-05-10
4   Joel  2021-05-31
4   Joel  2021-04-13
4   Joel  2021-07-11
4   Joel  2021-03-31
5   Mike  2021-04-02
5   Mike  2021-02-07

I want to group this by floor_date(df, ‘month’) And to then summarize by how many occurrences per month to give an output in a horizontal table looking like following:

Name | Feb 21 | Mar 21 | Apr 21 | May 21 | Jun 21 | Jul 21 | ....
John     0       0        0         0       0         0      ....
Joel     0       1        1         2       0         1      ....
Kate     0       1        0         0       0         0      ....
Mary     0       0        0         0       1         1      ....
Mike     1       0        1         0       0         0      ....

CodePudding user response：

I think you're just looking for table

library(tidyverse)
library(lubridate)

df %>%
  group_by(Date = floor_date(Date, "month")) %>%
  select(-ID) %>%
  table()
#>       Date
#> Name   2021-02-01 2021-03-01 2021-04-01 2021-05-01 2021-06-01 2021-07-01
#>   Joel          0          1          1          2          0          1
#>   John          0          0          0          0          0          0
#>   Kate          0          1          0          0          0          0
#>   Mary          0          0          0          0          1          1
#>   Mike          1          0          1          0          0          0
#>       Date
#> Name   2021-08-01 2021-09-01
#>   Joel          0          0
#>   John          2          1
#>   Kate          0          0
#>   Mary          0          0
#>   Mike          0          0

If you prefer the output to be a tibble / data frame, you can do:

df %>%
  group_by(Date = floor_date(Date, "month")) %>%
  select(-ID) %>%
  table() %>%
  as_tibble() %>%
  pivot_wider(names_from = Date, values_from = n)
#> # A tibble: 5 x 9
#>   Name  `2021-02-01` `2021-03-01` `2021-04-01` `2021-05-01` `2021-06-01` `2021-07-01` `2021-08-01` `2021-09-01`
#>   <chr>        <int>        <int>        <int>        <int>        <int>        <int>        <int>        <int>
#> 1 Joel             0            1            1            2            0            1            0            0
#> 2 John             0            0            0            0            0            0            2            1
#> 3 Kate             0            1            0            0            0            0            0            0
#> 4 Mary             0            0            0            0            1            1            0            0
#> 5 Mike             1            0            1            0            0            0            0            0

^{Created on 2022-10-04 with reprex v2.0.2}

Data from question in reproducible format

df <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L), Name = c("John", "John", "John", "Kate", "Mary", 
"Mary", "Joel", "Joel", "Joel", "Joel", "Joel", "Mike", "Mike"
), Date = structure(c(18875, 18853, 18860, 18690, 18818, 18791, 
18757, 18778, 18730, 18819, 18717, 18719, 18665), class = "Date")),
row.names = c(NA,  -13L), class = "data.frame")

CodePudding user response：

One possible solution:

library(tidyr)
library(dplyr)

df |> 
  mutate(Date = strftime(as.Date(Date), "%b-%y")) |> 
  pivot_wider(id_cols=Name, names_from=Date, values_from=Date, values_fn=length, values_fill=0)

# A tibble: 5 x 9
  Name  `Sep-21` `Aug-21` `Mar-21` `Jul-21` `Jun-21` `May-21` `Apr-21` `Feb-21`
  <chr>    <int>    <int>    <int>    <int>    <int>    <int>    <int>    <int>
1 John         1        2        0        0        0        0        0        0
2 Kate         0        0        1        0        0        0        0        0
3 Mary         0        0        0        1        1        0        0        0
4 Joel         0        0        1        1        0        2        1        0
5 Mike         0        0        0        0        0        0        1        1

Or using the built-in functions xtabs and transform

df |> 
  transform(Date = strftime(as.Date(Date), "%b-%y")) |> 
  xtabs(formula=~Name   Date)

      Date
Name   Apr-21 Aug-21 Feb-21 Jul-21 Jun-21 Mar-21 May-21 Sep-21
  Joel      1      0      0      1      0      1      2      0
  John      0      2      0      0      0      0      0      1
  Kate      0      0      0      0      0      1      0      0
  Mary      0      0      0      1      1      0      0      0
  Mike      1      0      1      0      0      0      0      0

Or using data.table

library(data.table)

dcast(setDT(df), Name ~ strftime(as.Date(Date), "%b-%y"), fun=length)

     Name Apr-21 Aug-21 Feb-21 Jul-21 Jun-21 Mar-21 May-21 Sep-21
1:   Joel      1      0      0      1      0      1      2      0
2:   John      0      2      0      0      0      0      0      1
3:   Kate      0      0      0      0      0      1      0      0
4:   Mary      0      0      0      1      1      0      0      0
5:   Mike      1      0      1      0      0      0      0      0