Home > other >  Pivot Wider causing issues when as.yearmon is used
Pivot Wider causing issues when as.yearmon is used

Time:12-20

I have the following code:

library(zoo)
library(xts)
df1<-structure(list(Date = structure(c(13523, 13532, 13539, 13551, 
                              13565, 13567, 13579, 13588, 13600, 13607, 13616, 13628, 13637, 
                              13656, 13658, 13670, 13686, 13691, 13698, 13705, 13721, 13735, 
                              13768, 13770, 13783, 13789, 13797, 13811, 13819, 13824, 13838, 
                              13846, 13852, 13860), class = "Date"), Category = c("Type 1", 
                                                                                  "Type 2", "Type 1", "Type 1", "Type 1", "Type 2", "Type 1", "Type 3", 
                                                                                  "Type 1", "Type 1", "Type 2", "Type 1", "Type 1", "Type 1", "Type 2", 
                                                                                  "Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2", 
                                                                                  "Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2", 
                                                                                  "Type 1", "Type 1", "Type 1", "Type 3", "Type 2"), Value = c(2250, 
                                                                                                                                               1200, 625, 2250, 1000, 2750, 2250, 2750, 950, 2000, 1100, 950, 
                                                                                                                                               2250, 1000, 2500, 2250, 2500, 1000, 2250, 1200, 700, 2500, 2000, 
                                                                                                                                               2500, 900, 2250, 1200, 925, 2500, 2250, 750, 2000, 2500, 950)), class = c("grouped_df", 
                                                                                                                                                                                                                         "tbl_df", "tbl", "data.frame"), row.names = c(NA, -34L), groups = structure(list(
                                                                                                                                                                                                                           Date = structure(c(13523, 13532, 13539, 13551, 13565, 13567, 
                                                                                                                                                                                                                                              13579, 13588, 13600, 13607, 13616, 13628, 13637, 13656, 13658, 
                                                                                                                                                                                                                                              13670, 13686, 13691, 13698, 13705, 13721, 13735, 13768, 13770, 
                                                                                                                                                                                                                                              13783, 13789, 13797, 13811, 13819, 13824, 13838, 13846, 13852, 
                                                                                                                                                                                                                                              13860), class = "Date"), .rows = structure(list(1L, 2L, 3L, 
                                                                                                                                                                                                                                                                                              4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 
                                                                                                                                                                                                                                                                                              16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 
                                                                                                                                                                                                                                                                                              27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L), ptype = integer(0), class = c("vctrs_list_of", 
                                                                                                                                                                                                                                                                                                                                                                     "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
                                                                                                                                                                                                                                                                                                                                                                     ), row.names = c(NA, -34L), .drop = TRUE))
   

I've created a rolling_sum by month for this particular dataset using:

df_month <- df1 %>%
  group_by(Category, Month = format(Date, "%Y-%m-%d")) %>%
  summarize(Rolling_Sum = sum(Value))


df_month$Month <- as.yearmon(df_month$Month)
                                                                                                                                                                                                                                             

In preparation for a conversion to an xts format I'd like to pivot-wider and replace all null/NAs values with 0. However the pivot-wider seems to break the dataset, making the null replacement and xts conversion impossible:

df_turned <- df_month %>% group_by(Category) %>% pivot_wider(names_from = Category, values_from = Rolling_Sum, id_cols = Month)

If that had worked, I would have done:

df_turned <- df_turned %>% replace(.=="NULL", 0)

Then:

df_turned <- xts(df_turned, order.by = df_turned$Month)

Any advice most appreciated.

CodePudding user response:

If we don't want duplicates, then use values_fn

library(tidyr)
library(dplyr)
df_turned <- df_month %>% 
  ungroup %>%
  pivot_wider(names_from = Category, values_from = Rolling_Sum, 
    values_fn = sum, values_fill = 0)

-output

df_turned
# A tibble: 12 × 4
   Month     `Type 1` `Type 2` `Type 3`
   <yearmon>    <dbl>    <dbl>    <dbl>
 1 Jan 2007      2875     1200        0
 2 Feb 2007      3250     2750        0
 3 Mar 2007      3200        0     2750
 4 Apr 2007      2950     1100        0
 5 May 2007      3250     2500        0
 6 Jun 2007      3250        0     2500
 7 Jul 2007      4150        0        0
 8 Sep 2007      2900        0     2500
 9 Oct 2007      4375        0        0
10 Nov 2007      5000     2500        0
11 Aug 2007         0     2500        0
12 Dec 2007         0      950     2500

Now, we can convert to xts

 xts(df_turned[-1], order.by = df_turned$Month)
         Type 1 Type 2 Type 3
Jan 2007   2875   1200      0
Feb 2007   3250   2750      0
Mar 2007   3200      0   2750
Apr 2007   2950   1100      0
May 2007   3250   2500      0
Jun 2007   3250      0   2500
Jul 2007   4150      0      0
Aug 2007      0   2500      0
Sep 2007   2900      0   2500
Oct 2007   4375      0      0
Nov 2007   5000   2500      0
Dec 2007      0    950   2500

CodePudding user response:

As indicated in my comment, your problem is that you create duplicates because as.yearmon is called after the grouping by "Month". You are de facto grouping by "Date". We could do:

library(dplyr)
library(tidyr)

df1 |>
    group_by(Category,
             Month = as.yearmon(Date)) |>
    pivot_wider(names_from = Category,
                values_from = Value,
                values_fn = sum,
                values_fill = 0
    ) |>
    select(-Date) # Or mutate "Date" above instead of creating "Month".

Then call xts.

Month = as.yearmon(Date) shouldn't cause a problem if Date is a date-type. However, if it is causing trouble as you indicate in your comment, as.yearmon(format(Date, "%Y-%m-%d")).

Output:

# A tibble: 12 × 4
   Month     `Type 1` `Type 2` `Type 3`
   <yearmon>    <dbl>    <dbl>    <dbl>
 1 Jan 2007      2875     1200        0
 2 Feb 2007      3250     2750        0
 3 Mar 2007      3200        0     2750
 4 Apr 2007      2950     1100        0
 5 May 2007      3250     2500        0
 6 Jun 2007      3250        0     2500
 7 Jul 2007      4150        0        0
 8 Sep 2007      2900        0     2500
 9 Oct 2007      4375        0        0
10 Nov 2007      5000     2500        0
11 Aug 2007         0     2500        0
12 Dec 2007         0      950     2500

Update After @akrun updated answer with a similar solution, my solution seems more verbose. The reason is that my approach works directly on the df1 object and solves the problem there.

CodePudding user response:

Use read.zoo like this:

library(zoo)

df_month |>
  read.zoo(index = "Month", split = "Category", aggregate = sum) |>
  na.fill(0)

giving this zoo object -- as.xts can be used to convert that to xts if needed.

         Type 1 Type 2 Type 3
Jan 2007   2875   1200      0
Feb 2007   3250   2750      0
Mar 2007   3200      0   2750
Apr 2007   2950   1100      0
May 2007   3250   2500      0
Jun 2007   3250      0   2500
Jul 2007   4150      0      0
Aug 2007      0   2500      0
Sep 2007   2900      0   2500
Oct 2007   4375      0      0
Nov 2007   5000   2500      0
Dec 2007      0    950   2500

or directly from df1 modified from the comment below

df1 |>
  read.zoo(df1, FUN = as.yearmon, split = "Category", aggregate = sum) |>
  na.fill(0)

Note

df_month from question in immediately reproducible form

df_month <- 
structure(list(Category = c("Type 1", "Type 1", "Type 1", "Type 1", 
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", 
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", 
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2", "Type 2", 
"Type 2", "Type 2", "Type 2", "Type 2", "Type 2", "Type 3", "Type 3", 
"Type 3", "Type 3"), Month = structure(c(2007, 2007, 2007.08333333333, 
2007.08333333333, 2007.16666666667, 2007.16666666667, 2007.25, 
2007.25, 2007.33333333333, 2007.33333333333, 2007.41666666667, 
2007.41666666667, 2007.5, 2007.5, 2007.5, 2007.66666666667, 2007.66666666667, 
2007.75, 2007.75, 2007.75, 2007.83333333333, 2007.83333333333, 
2007.83333333333, 2007, 2007.08333333333, 2007.25, 2007.33333333333, 
2007.58333333333, 2007.83333333333, 2007.91666666667, 2007.16666666667, 
2007.41666666667, 2007.66666666667, 2007.91666666667), class = "yearmon"), 
    Rolling_Sum = c(2250, 625, 2250, 1000, 2250, 950, 2000, 950, 
    2250, 1000, 2250, 1000, 2250, 1200, 700, 2000, 900, 2250, 
    1200, 925, 2250, 750, 2000, 1200, 2750, 1100, 2500, 2500, 
    2500, 950, 2750, 2500, 2500, 2500)), row.names = c(NA, -34L
), groups = structure(list(Category = c("Type 1", "Type 2", "Type 3"
), .rows = structure(list(1:23, 24:30, 31:34), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))
  • Related