Transpose a wide dataset into long with multiple steps-CodePudding

I have the following dataset which has firm identifiers in column id_isin and a series of covariates (features) in the column covariate. The dataset comes in wide format as each covariate is observed at multiple time periods. These are organized in columns (y2010 to y2020). I report the input data called forSO below:

library(data.table)

forSO = fread("~/Desktop/forSO.csv")
forSO
#>         id_isin                     covariate      y2010      y2011      y2012
#> 1: ZAE000255915  NET INC BEFORE_EXTRA/PFDDIVS  8118000.0  9674000.0 8.3930e 06
#> 2: ZAE000255915              OPERATING INCOME 11756000.0 14134000.0 1.2266e 07
#> 3: ZAE000255915              RETURN ON ASSETS        2.5        2.3 1.7800e 00
#> 4: ZAE000198289 NET INC BEFORE EXTRA/PFD DIVS         NA         NA         NA
#> 5: ZAE000198289              OPERATING INCOME         NA         NA         NA
#> 6: ZAE000198289              RETURN ON ASSETS         NA         NA         NA
#>         y2013      y2014    y2015       y2016       y2017       y2018
#> 1: 1.1981e 07 1.3216e 07 14331000 14708000.00 13823000.00 13917000.00
#> 2: 1.7975e 07 1.9921e 07 21227000 22210000.00 21329000.00 21772000.00
#> 3: 1.8400e 00 1.9300e 00        2        2.06        2.01        1.91
#> 4:         NA         NA    40811   559094.00   786806.00   814462.00
#> 5:         NA         NA    48190   233141.00   299230.00   307252.00
#> 6:         NA         NA       NA       10.84       12.86       11.76
#>          y2019       y2020
#> 1: 14256000.00  5880000.00
#> 2: 21820000.00 10765000.00
#> 3:        1.87        1.09
#> 4:   920734.00   485423.00
#> 5:   368575.00   326465.00
#> 6:       11.24        5.57

^{Created on 2021-12-21 by the reprex package (v2.0.1)}

I want to transpose the dataset into a panel data structure which looks like the following:

library(data.table)

output = fread("~/Desktop/minimal.csv")
output
#>          id_isin year NET INC BEFORE_EXTRA/PFDDIVS OPERATING INCOME
#>  1: ZAE000255915 2010                      8118000         11756000
#>  2: ZAE000255915 2011                      9674000         14134000
#>  3: ZAE000255915 2012                      8393000         12266000
#>  4: ZAE000255915 2013                     11981000         17975000
#>  5: ZAE000255915 2014                     13216000         19921000
#>  6: ZAE000255915 2015                     14331000         21227000
#>  7: ZAE000255915 2016                     14708000         22210000
#>  8: ZAE000255915 2017                     13823000         21329000
#>  9: ZAE000255915 2018                     13917000         21772000
#> 10: ZAE000255915 2019                     14256000         21820000
#> 11: ZAE000255915 2020                      5880000         10765000
#> 12: ZAE000198289 2010                           NA               NA
#> 13: ZAE000198289 2011                           NA               NA
#> 14: ZAE000198289 2012                           NA               NA
#> 15: ZAE000198289 2013                           NA               NA
#> 16: ZAE000198289 2014                           NA               NA
#> 17: ZAE000198289 2015                        40811            48190
#> 18: ZAE000198289 2016                       559094           233141
#> 19: ZAE000198289 2017                       786806           299230
#> 20: ZAE000198289 2018                       814462           307252
#> 21: ZAE000198289 2019                       920734           368575
#> 22: ZAE000198289 2020                       485423           326465
#>          id_isin year NET INC BEFORE_EXTRA/PFDDIVS OPERATING INCOME
#>     RETURN ON ASSETS
#>  1:             2.50
#>  2:             2.30
#>  3:             1.78
#>  4:             1.84
#>  5:             1.93
#>  6:             2.00
#>  7:             2.06
#>  8:             2.01
#>  9:             1.91
#> 10:             1.87
#> 11:             1.09
#> 12:               NA
#> 13:               NA
#> 14:               NA
#> 15:               NA
#> 16:               NA
#> 17:               NA
#> 18:            10.84
#> 19:            12.86
#> 20:            11.76
#> 21:            11.24
#> 22:             5.57
#>     RETURN ON ASSETS

^{Created on 2021-12-21 by the reprex package (v2.0.1)}

Please find both datasets to import in R below.

Any suggestion is appreciated!

Input Dataset

structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
               covariate = c("NET INC BEFORE_EXTRA/PFDDIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS"), 
               y2010 = c(8118000, 11756000, 2.5, NA, NA, NA), 
               y2011 = c(9674000, 14134000, 2.3, NA, NA, NA), 
               y2012 = c(8393000, 12266000, 1.78, NA, NA, NA), 
               y2013 = c(11981000, 17975000, 1.84, NA, NA, NA), 
               y2014 = c(13216000, 19921000, 1.93, NA, NA, NA), 
               y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA), 
               y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84), 
               y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86), 
               y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76), 
               y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24), 
               y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)), 
          row.names = c(NA, -6L), class = c("data.table", "data.frame" ))

Desired Outcome

structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
               year = c(2010L, 
                        2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L, 2019L, 
                        2020L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 
                        2018L, 2019L, 2020L), 
               `NET INC BEFORE_EXTRA/PFDDIVS` = c(8118000L, 
                                                  9674000L, 8393000L, 11981000L, 13216000L, 14331000L, 14708000L, 
                                                  13823000L, 13917000L, 14256000L, 5880000L, NA, NA, NA, NA, NA, 
                                                  40811L, 559094L, 786806L, 814462L, 920734L, 485423L), 
               `OPERATING INCOME` = c(11756000L, 
                                      14134000L, 12266000L, 17975000L, 19921000L, 21227000L, 22210000L, 
                                      21329000L, 21772000L, 21820000L, 10765000L, NA, NA, NA, NA, NA, 
                                      48190L, 233141L, 299230L, 307252L, 368575L, 326465L), 
               `RETURN ON ASSETS` = c(2.5, 
                                      2.3, 1.78, 1.84, 1.93, 2, 2.06, 2.01, 1.91, 1.87, 1.09, NA, NA, 
                                      NA, NA, NA, NA, 10.84, 12.86, 11.76, 11.24, 5.57)), 
          row.names = c(NA, -22L), class = c("data.table", "data.frame"))

CodePudding user response：

This should get you started:

x <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
               covariate = c("NET INC BEFORE_EXTRA/PFDDIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS"), 
               y2010 = c(8118000, 11756000, 2.5, NA, NA, NA), 
               y2011 = c(9674000, 14134000, 2.3, NA, NA, NA), 
               y2012 = c(8393000, 12266000, 1.78, NA, NA, NA), 
               y2013 = c(11981000, 17975000, 1.84, NA, NA, NA), 
               y2014 = c(13216000, 19921000, 1.93, NA, NA, NA), 
               y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA), 
               y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84), 
               y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86), 
               y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76), 
               y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24), 
               y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)), 
          row.names = c(NA, -6L), class = c("data.table", "data.frame" ))
  
    library(tidyr)  
    x %>% 
      pivot_longer(-c(id_isin, covariate) ) %>%
      pivot_wider(names_from = "covariate") %>%
      mutate(year = as.numeric(stringr::str_remove(name, "y")))%>%
      select(id_isin, year, `NET INC BEFORE_EXTRA/PFDDIVS`, `OPERATING INCOME`)

Which gives us the following:

    # A tibble: 22 × 4
   id_isin       year `NET INC BEFORE_EXTRA/PFDDIVS` `OPERATING INCOME`
   <chr>        <dbl>                          <dbl>              <dbl>
 1 ZAE000255915  2010                        8118000           11756000
 2 ZAE000255915  2011                        9674000           14134000
 3 ZAE000255915  2012                        8393000           12266000
 4 ZAE000255915  2013                       11981000           17975000

CodePudding user response：

This should do the trick:

library(tidyverse)
df <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
               covariate = c("NET INC BEFORE_EXTRA/PFDDIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS", 
                             "OPERATING INCOME", "RETURN ON ASSETS"), 
               y2010 = c(8118000, 11756000, 2.5, NA, NA, NA), 
               y2011 = c(9674000, 14134000, 2.3, NA, NA, NA), 
               y2012 = c(8393000, 12266000, 1.78, NA, NA, NA), 
               y2013 = c(11981000, 17975000, 1.84, NA, NA, NA), 
               y2014 = c(13216000, 19921000, 1.93, NA, NA, NA), 
               y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA), 
               y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84), 
               y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86), 
               y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76), 
               y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24), 
               y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)), 
          row.names = c(NA, -6L), class = c("data.table", "data.frame" ))


df %>% 
  pivot_longer(cols = c(paste("y",2010:2020,sep = "")), names_to = "year", values_to = "HHA") %>% 
  pivot_wider(names_from = "covariate", values_from = "HHA") %>% 
  mutate(`NET INC BEFORE_EXTRA/PFDDIVS` = coalesce(`NET INC BEFORE_EXTRA/PFDDIVS`,`NET INC BEFORE EXTRA/PFD DIVS`),
         year = str_remove(year, "y")) %>% 
  select(-`NET INC BEFORE EXTRA/PFD DIVS`)

Output:

# A tibble: 22 x 5
   id_isin      year  `NET INC BEFORE_EXTRA/PFDDIVS` `OPERATING INCOME` `RETURN ON ASSETS`
   <chr>        <chr>                          <dbl>              <dbl>              <dbl>
 1 ZAE000255915 2010                         8118000           11756000               2.5 
 2 ZAE000255915 2011                         9674000           14134000               2.3 
 3 ZAE000255915 2012                         8393000           12266000               1.78
 4 ZAE000255915 2013                        11981000           17975000               1.84
 5 ZAE000255915 2014                        13216000           19921000               1.93
 6 ZAE000255915 2015                        14331000           21227000               2   
 7 ZAE000255915 2016                        14708000           22210000               2.06
 8 ZAE000255915 2017                        13823000           21329000               2.01
 9 ZAE000255915 2018                        13917000           21772000               1.91
10 ZAE000255915 2019                        14256000           21820000               1.87
# ... with 12 more rows

CodePudding user response：

If you prefer data.table functions/syntax, perhaps:

library(data.table)
x <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                                "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
                    covariate = c("NET INC BEFORE_EXTRA/PFDDIVS", 
                                  "OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS", 
                                  "OPERATING INCOME", "RETURN ON ASSETS"), 
                    y2010 = c(8118000, 11756000, 2.5, NA, NA, NA), 
                    y2011 = c(9674000, 14134000, 2.3, NA, NA, NA), 
                    y2012 = c(8393000, 12266000, 1.78, NA, NA, NA), 
                    y2013 = c(11981000, 17975000, 1.84, NA, NA, NA), 
                    y2014 = c(13216000, 19921000, 1.93, NA, NA, NA), 
                    y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA), 
                    y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84), 
                    y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86), 
                    y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76), 
                    y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24), 
                    y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)), 
               row.names = c(NA, -6L), class = c("data.table", "data.frame" ))

x.m2 <- melt(x, id.vars = c("id_isin", "covariate"))
#> Warning in melt.data.table(x, id.vars = c("id_isin", "covariate")):
#> 'measure.vars' [y2010, y2011, y2012, y2013, ...] are not all of the same type.
#> By order of hierarchy, the molten data value column will be of type 'double'.
#> All measure variables not of type 'double' will be coerced too. Check DETAILS
#> in ?melt.data.table for more on coercion.
x.m3 <- dcast(x.m2, formula = id_isin   variable ~ covariate, value.var = "value")
x.m3$year <- as.integer(gsub(x = x.m3$variable, pattern = "y", replacement = ""))
x.m4 <- x.m3[,variable := NULL]

x.m5 <- x.m4[,`NET INC BEFORE_EXTRA/PFDDIVS` := .(fcoalesce(`NET INC BEFORE_EXTRA/PFDDIVS`,`NET INC BEFORE EXTRA/PFD DIVS`))]
x.m6 <- x.m5[,`NET INC BEFORE EXTRA/PFD DIVS` := NULL]
x.m6

outcome <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289", 
                           "ZAE000198289", "ZAE000198289", "ZAE000198289"), 
               year = c(2010L, 
                        2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L, 2019L, 
                        2020L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 
                        2018L, 2019L, 2020L), 
               `NET INC BEFORE_EXTRA/PFDDIVS` = c(8118000L, 
                                                  9674000L, 8393000L, 11981000L, 13216000L, 14331000L, 14708000L, 
                                                  13823000L, 13917000L, 14256000L, 5880000L, NA, NA, NA, NA, NA, 
                                                  40811L, 559094L, 786806L, 814462L, 920734L, 485423L), 
               `OPERATING INCOME` = c(11756000L, 
                                      14134000L, 12266000L, 17975000L, 19921000L, 21227000L, 22210000L, 
                                      21329000L, 21772000L, 21820000L, 10765000L, NA, NA, NA, NA, NA, 
                                      48190L, 233141L, 299230L, 307252L, 368575L, 326465L), 
               `RETURN ON ASSETS` = c(2.5, 
                                      2.3, 1.78, 1.84, 1.93, 2, 2.06, 2.01, 1.91, 1.87, 1.09, NA, NA, 
                                      NA, NA, NA, NA, 10.84, 12.86, 11.76, 11.24, 5.57)), 
          row.names = c(NA, -22L), class = c("data.table", "data.frame"))
dplyr::all_equal(x.m6, outcome)
#> [1] "- Different types for column `NET INC BEFORE_EXTRA/PFDDIVS`: double vs integer\n- Different types for column `OPERATING INCOME`: double vs integer\n"

^{Created on 2021-12-21 by the reprex package (v2.0.1)}

CodePudding user response：

concise data.table approach (x is your input structure, above):

dcast(melt(
  x[,covariate:=fifelse(
    grepl("^NET",covariate),
    "NET INC BEFORE_EXTRA/PFDDIVS",
    covariate)],
  ,id=c(1,2),variable.name = "year")[
    ,year:=as.integer(gsub("y","",year))],
  id_isin year~covariate,value.var = "value"
)