Home > Software engineering >  R: Pivoting multi-year spells into multiple year rows
R: Pivoting multi-year spells into multiple year rows

Time:10-20

I have a dataset where each row represents a continuous spells with start and end months and years. For spells which are over more than one year, I want to pivot them so that there is one row per year.

Input:

library(data.table)

dat <- data.table(id = c(1,1,2), b_sp_y = c(2008, 2009, 2011), b_sp_m = c(3, 8, 6), 
                  e_sp_y = c(2008, 2010, 2013), e_sp_m = c(5, 1, 9))

   id b_sp_y b_sp_m e_sp_y e_sp_m
1:  1   2008      3   2008      5
2:  1   2009      8   2010      1
3:  2   2011      6   2013      9

Here is my truly horrifyingly ugly code:

dat[, y_dif := e_sp_y - b_sp_y]
res <- dat[y_dif == 0][, c("e_sp_y", "y_dif") := NULL]
setnames(res, "b_sp_y", "year")

tmp <- dat[y_dif > 0]
for(i in 1:nrow(tmp)){
  foo <- tmp[i, ]
  foo2 <- data.table(year = foo$b_sp_y:(foo$b_sp_y   foo$y_dif))[,id := foo$id]
  foo2[, b_sp_m := c(foo$b_sp_m, rep(1, foo$y_dif))]
  foo2[, e_sp_m := c(rep(12, foo$y_dif), foo$e_sp_m)]
  res <- rbind(res, foo2)
}

Output:

   id year b_sp_m e_sp_m
1:  1 2008      3      5
2:  1 2009      8     12
3:  1 2010      1      1
4:  2 2011      6     12
5:  2 2012      1     12
6:  2 2013      1      9

This ugly and slow to a crawl, but I couldn't really come up with anything better. Thanks for your help!

CodePudding user response:

I'd suggest: make a date sequence for each id/row, group by id and year, summarize first and last month.

library(dplyr); library(lubridate)
dat %>%
  mutate(start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
         end   = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))) %>%
  group_by(id, row  = row_number()) %>%
  summarize(months  = seq.Date(start, end, by = "month")) %>%
  group_by(id, year = year(months)) %>%
  summarize(from = month(min(months)),
            to   = month(max(months)), .groups = "drop")

Result:

# A tibble: 6 × 4
     id  year  from    to
  <dbl> <dbl> <dbl> <dbl>
1     1  2008     3     5
2     1  2009     8    12
3     1  2010     1     1
4     2  2011     6    12
5     2  2012     1    12
6     2  2013     1     9

CodePudding user response:

We create a sequence column 'rn', loop over the year columns, get the sequence in a list, unnest the column, and do a group by the 'rn' and replace the 'b', 'e' columns where there are duplicates to 1 and 12 respectively

library(dplyr)
library(purrr)
library(tidyr)
dat %>%
   mutate(rn=row_number(), 
   year = map2(b_sp_y, e_sp_y, `:`),
    b_sp_y= NULL,
     e_sp_y = NULL) %>% 
  unnest(year) %>% 
  group_by(rn) %>%
  mutate(b_sp_m = replace(b_sp_m, duplicated(b_sp_m), 1),
   e_sp_m = replace(e_sp_m, duplicated(e_sp_m, fromLast = TRUE) & 
      n() > 1, 12)) %>% 
  ungroup %>%
  select(-rn) %>%
  relocate(year, .after = 1)

-output

# A tibble: 6 × 4
     id  year b_sp_m e_sp_m
  <dbl> <int>  <dbl>  <dbl>
1     1  2008      3      5
2     1  2009      8     12
3     1  2010      1      1
4     2  2011      6     12
5     2  2012      1     12
6     2  2013      1      9

OP's output of 'res'

> res
      id  year b_sp_m e_sp_m
   <num> <num>  <num>  <num>
1:     1  2008      3      5
2:     1  2009      8     12
3:     1  2010      1      1
4:     2  2011      6     12
5:     2  2012      1     12
6:     2  2013      1      9

CodePudding user response:

Proceeding by row fill in the three columns using summarize as shown.

library(data.table)
library(dplyr)

dat %>%
  rowwise() %>%
  summarize(id = id,
            year = b_sp_y:e_sp_y,
            b_sp_m = replace(1   0 * year, 1, b_sp_m),
            e_sp_m = replace(12   0 * year, length(year), e_sp_m))

giving:

# A tibble: 6 × 4
     id  year b_sp_m e_sp_m
  <dbl> <int>  <dbl>  <dbl>
1     1  2008      3      5
2     1  2009      8     12
3     1  2010      1      1
4     2  2011      6     12
5     2  2012      1     12
6     2  2013      1      9

or using only data.table:

library(data.table)

dat[, .(id = id, 
  year = b_sp_y:e_sp_y,
  b_sp_m = replace(1   0 * b_sp_y:e_sp_y, 1, b_sp_m),
  e_sp_m = replace(12   0 * b_sp_y:e_sp_y, e_sp_y - b_sp_y   1, e_sp_m)), 
  by = 1:nrow(dat)][, -1]
  • Related