Home > Back-end >  Rearranging data by year
Rearranging data by year

Time:03-13

I have a table with 30,000 observations and a snippet looks like this:

x <- rep(c("TX"), times=10)
y <- rep(c("CA"), times=10)
z <- rep(c("WI"), times=10)
State <- c(x, y, z)

Proj_ID <- c("TX01", "TX02", "TX03", "TX04", "TX05", "TX06", "TX07", "TX08", "TX09", "TX10", "CA01", 
"CA02", "CA03", "CA04", "CA05", "CA06", "CA07", "CA08", "CA09", "CA10", "WI01", "WI02", "WI03", "WI04", 
"WI05", "WI06", "WI07", "WI08", "WI09", "WI10")

Year <- c("2005", "2013", "2011", "2005", "2008", "2011", "2007", "2010", "2012", "2015", "2005", "2013", 
"2007", "2010", "2012", "2015", "2011", "2005", "2008", "2011", "2007", "2010", "2012", "2015", "2011", 
"2005", "2008", "2005", "2008", "2011")

Unit <- c(67, 35, 9, 35, 20, 56, 12, 40, 28, 34, 13, 19, 67, 45, 98, 25, 40, 32, 28, 55, 51, 
34, 40, 90, 24, 57, 80, 95, 60, 118)

CT <- c("TXA", "TXC", "TXA", "TXB", "TXB", "TXA", "TXD", "TXE", "TXB", "TXD", "CAA", "CAA", 
"CAA", "CAB", "CAC", "CAC", "CAC", "CAE", "CAD", "CAA", "WID", "WID", "WIA", "WIB", "WIB", 
"WID", "WIE", "WIA", "WIE", "WID")

df <-data.frame(State, CT, Year, Proj_ID, Unit)

print(df)


   State  CT Year Proj_ID Unit
1     TX TXA 2005    TX01   67
2     TX TXC 2013    TX02   35
3     TX TXA 2011    TX03    9
4     TX TXB 2005    TX04   35
5     TX TXB 2008    TX05   20
6     TX TXA 2011    TX06   56
7     TX TXD 2007    TX07   12
8     TX TXE 2010    TX08   40
9     TX TXB 2012    TX09   28
10    TX TXD 2015    TX10   34
11    CA CAA 2005    CA01   13
12    CA CAA 2013    CA02   19
13    CA CAA 2007    CA03   67
14    CA CAB 2010    CA04   45
15    CA CAC 2012    CA05   98
16    CA CAC 2015    CA06   25
17    CA CAC 2011    CA07   40
18    CA CAE 2005    CA08   32
19    CA CAD 2008    CA09   28
20    CA CAA 2011    CA10   55
21    WI WID 2007    WI01   51
22    WI WID 2010    WI02   34
23    WI WIA 2012    WI03   40
24    WI WIB 2015    WI04   90
25    WI WIB 2011    WI05   24
26    WI WID 2005    WI06   57
27    WI WIE 2008    WI07   80
28    WI WIA 2005    WI08   95
29    WI WIE 2008    WI09   60
30    WI WID 2011    WI10  118

I would like to get the data organized in the following manner, where I can get the data for each CT for different Year (in some cases additive numbers) where CTs can have one or more Proj_ID for each year. Please notice, for CT WIE, I have two Proj_ID that sum up to 140 for 2008.

 State   CT       2005   2006   2007  2008    2009   2010   2011   2012   2013   2014   2015
 1 CA    CAA       13      0     67      0      0      0     55      0     19      0      0
 2 CA    CAB        0      0      0      0      0     45      0      0      0      0      0
 3 CA    CAC        0      0      0      0      0      0     40     98      0      0     25
 4 CA    CAD        0      0      0     28      0      0      0      0      0      0      0
 5 CA    CAE       32      0      0      0      0      0      0      0      0      0      0
 6 TX    TXA       67      0      0      0      0      0     65      0      0      0      0
 7 TX    TXB       35      0      0     20      0      0      0     28     35      0      0
 8 TX    TXC        0      0      0      0      0      0      0      0      0      0      0
 9 TX    TXD        0      0     12      0      0      0      0      0      0      0     34
10 TX    TXE        0      0      0      0      0     40      0      0      0      0      0
11 WI    WIA       95      0      0      0      0      0      0     40      0      0      0
12 WI    WIB        0      0      0      0      0      0     24      0      0      0     90
13 WI    WID       87      0     51      0      0     34    118      0      0      0      0
14 WI    WIE        0      0      0    140      0      0      0      0      0      0      0

How do I do it in RStudio?

Much appreciate your time and help!

CodePudding user response:

Here is one option with dcast and specify drop = c(TRUE, FALSE) to not drop the unused cases by column

library(data.table)
dcast(setDT(df), State   CT ~ factor(Year, levels = min(Year):max(Year)), 
      value.var = "Unit", sum, drop = c(TRUE, FALSE))

-output

Key: <State, CT>
     State     CT  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014  2015
    <char> <char> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
 1:     CA    CAA    13     0    67     0     0     0    55     0    19     0     0
 2:     CA    CAB     0     0     0     0     0    45     0     0     0     0     0
 3:     CA    CAC     0     0     0     0     0     0    40    98     0     0    25
 4:     CA    CAD     0     0     0    28     0     0     0     0     0     0     0
 5:     CA    CAE    32     0     0     0     0     0     0     0     0     0     0
 6:     TX    TXA    67     0     0     0     0     0    65     0     0     0     0
 7:     TX    TXB    35     0     0    20     0     0     0    28     0     0     0
 8:     TX    TXC     0     0     0     0     0     0     0     0    35     0     0
 9:     TX    TXD     0     0    12     0     0     0     0     0     0     0    34
10:     TX    TXE     0     0     0     0     0    40     0     0     0     0     0
11:     WI    WIA    95     0     0     0     0     0     0    40     0     0     0
12:     WI    WIB     0     0     0     0     0     0    24     0     0     0    90
13:     WI    WID    57     0    51     0     0    34   118     0     0     0     0
14:     WI    WIE     0     0     0   140     0     0     0     0     0     0     0

CodePudding user response:

A possible solution, based on tidyr::pivot_wider:

library(tidyverse)

df %>% 
  pivot_wider(id_cols = c(State, CT), names_from = Year, 
     values_from = Unit, values_fn = sum, values_fill = 0, names_sort = T) 

#> # A tibble: 14 × 10
#>    State CT    `2005` `2007` `2008` `2010` `2011` `2012` `2013` `2015`
#>    <chr> <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 TX    TXA       67      0      0      0     65      0      0      0
#>  2 TX    TXC        0      0      0      0      0      0     35      0
#>  3 TX    TXB       35      0     20      0      0     28      0      0
#>  4 TX    TXD        0     12      0      0      0      0      0     34
#>  5 TX    TXE        0      0      0     40      0      0      0      0
#>  6 CA    CAA       13     67      0      0     55      0     19      0
#>  7 CA    CAB        0      0      0     45      0      0      0      0
#>  8 CA    CAC        0      0      0      0     40     98      0     25
#>  9 CA    CAE       32      0      0      0      0      0      0      0
#> 10 CA    CAD        0      0     28      0      0      0      0      0
#> 11 WI    WID       57     51      0     34    118      0      0      0
#> 12 WI    WIA       95      0      0      0      0     40      0      0
#> 13 WI    WIB        0      0      0      0     24      0      0     90
#> 14 WI    WIE        0      0    140      0      0      0      0      0
  • Related