Home > Software engineering >  Nested list to dataframe using tidyverse: faster than tidyr unnest_wider
Nested list to dataframe using tidyverse: faster than tidyr unnest_wider

Time:08-26

I have a nested list from reading a JSON that stores logging info from a video game. The time element of the list is a simple vector, while inputManagerStates and syncedProperties are lists that may contain 0 or more elements.

This is a follow-up on THIS question, where with some help, I managed to get the data into rectangular format. Unfortunately, I have a lot of such JSON files and unnest_wider seems to run quite slowly.

The list:

test_list <- 
  list(list(time = 9.92405605316162, inputManagerStates = list(), 
syncedProperties = list()), list(time = 9.9399995803833, 
inputManagerStates = list(list(inputId = "InputY", buttonState = FALSE, 
    axisValue = 0), list(inputId = "InputX", buttonState = FALSE, 
    axisValue = 0.0501395985484123), list(inputId = "xPos", 
    buttonState = FALSE, axisValue = 5), list(inputId = "yPos", 
    buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.70664256811142), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.707570731639862)), 
syncedProperties = list(list(name = "timeStamp", value = "97,2"))), 
list(time = 9.95659446716309, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0.0993990004062653), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = 5), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0799999982118607), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.705721318721771), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.708489596843719)), 
    syncedProperties = list(list(name = "timeStamp", value = "97,21667"))), 
list(time = 20.0626411437988, inputManagerStates = list(list(
    inputId = "InputY", buttonState = FALSE, axisValue = 0.601816594600677), 
    list(inputId = "InputX", buttonState = FALSE, axisValue = 0), 
    list(inputId = "xPos", buttonState = FALSE, axisValue = -1.31777036190033), 
    list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0800001174211502), 
    list(inputId = "zPos", buttonState = FALSE, axisValue = 6.08214092254639), 
    list(inputId = "xRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "yRot", buttonState = FALSE, axisValue = -0.391442984342575), 
    list(inputId = "zRot", buttonState = FALSE, axisValue = 0), 
    list(inputId = "wRot", buttonState = FALSE, axisValue = 0.920202374458313)), 
    syncedProperties = list(list(name = "timeStamp", value = "107,3167"), 
        list(name = "previousGameState", value = "1"), list(
            name = "newGameState", value = "2"))))

Code I am using to rectangularize the list:

library(tidyverse)  

output_df <- 
  test_list %>% 
  tibble::enframe(name = "frame", value = "value") %>% 
  tidyr::unnest_wider(value) %>%
  tidyr::unnest(inputManagerStates, keep_empty = TRUE) %>%
  tidyr::unnest(syncedProperties, keep_empty = TRUE) %>%
  tidyr::unnest_wider(syncedProperties) %>%
  tidyr::unnest_wider(inputManagerStates)

output_df
#> # A tibble: 46 x 7
#>    frame  time inputId buttonState axisValue name      value
#>    <int> <dbl> <chr>   <lgl>           <dbl> <chr>     <chr>
#>  1     1  9.92 <NA>    NA            NA      <NA>      <NA> 
#>  2     2  9.94 InputY  FALSE          0      timeStamp 97,2 
#>  3     2  9.94 InputX  FALSE          0.0501 timeStamp 97,2 
#>  4     2  9.94 xPos    FALSE          5      timeStamp 97,2 
#>  5     2  9.94 yPos    FALSE          0.0800 timeStamp 97,2 
#>  6     2  9.94 zPos    FALSE          0      timeStamp 97,2 
#>  7     2  9.94 xRot    FALSE          0      timeStamp 97,2 
#>  8     2  9.94 yRot    FALSE         -0.707  timeStamp 97,2 
#>  9     2  9.94 zRot    FALSE          0      timeStamp 97,2 
#> 10     2  9.94 wRot    FALSE          0.708  timeStamp 97,2 
#> # ... with 36 more rows

Created on 2022-08-24 with reprex v2.0.2

For my data unnest is fairly fast, but unnest_wider is quite slow. The first unnest_wider(value) can be easily written in base R - cbind(., do.call("rbind", .$value)) - and is much faster:

microbenchmark::microbenchmark(
  unnest_wider =  
    test_list %>% 
    tibble::enframe(name = "frame", value = "value") %>% 
    tidyr::unnest_wider(value), 
  baser_r =  
    test_list %>% 
    tibble::enframe(name = "frame", value = "value") %>% 
    cbind(., do.call("rbind", .$value)) %>%
    select(-value)
)
#> Unit: milliseconds
#>          expr    min      lq     mean  median      uq     max neval cld
#>  unnest_wider 3.1446 3.34645 4.031113 3.63625 4.22770 10.5289   100   b
#>       baser_r 1.4005 1.48225 1.770210 1.63475 1.86465  5.0407   100  a

Created on 2022-08-24 with reprex v2.0.2

I am looking for a way to replace %>% tidyr::unnest_wider(syncedProperties) %>% tidyr::unnest_wider(inputManagerStates) with faster code but the cbind solution doesn't work because of different number of rows.

EDIT: Think this may be possible with unnest::unnest() but couldn't achieve the desired structure with it (while tidytable::unnest_wider. currently supports only vectors).

CodePudding user response:

You could try rrapplywith how = "bind" in the rrapply-package (a revisit of base rapply). This option can be efficient when unnesting replicated lists, such as inputManagerStates:

library(rrapply)

getStates <- function(lst) {
  rrapply(
    lst, 
    condition = \(x, .xparents) "inputManagerStates" %in% .xparents, 
    how = "bind", 
    options = list(coldepth = 4, namecols = TRUE)
  )
}

getStates(test_list) |>
  head()
#>   L1                 L2 L3 inputId buttonState axisValue
#> 1  2 inputManagerStates  1  InputY       FALSE 0.0000000
#> 2  2 inputManagerStates  2  InputX       FALSE 0.0501396
#> 3  2 inputManagerStates  3    xPos       FALSE 5.0000000
#> 4  2 inputManagerStates  4    yPos       FALSE 0.0800000
#> 5  2 inputManagerStates  5    zPos       FALSE 0.0000000
#> 6  2 inputManagerStates  6    xRot       FALSE 0.0000000

microbenchmark::microbenchmark(
  getStates(test_list)
)
#> Unit: microseconds
#>                  expr     min       lq     mean  median      uq      max neval
#>  getStates(test_list) 518.573 556.7075 839.9312 587.677 675.444 18465.56   100

To get to the rectangular format in the question, a possible approach could be to first unnest the time, inputManagerStates and syncedProperties lists individually and then merge them back together into a single data.frame. This already gives a ±10 times speedup compared to the approach in the question:

library(tidyr)
library(tibble)
library(rrapply)

unnest_rrapply <- function() {
  
  ## bind individual data.frames
  states <- rrapply(
    test_list, 
    condition = \(x, .xparents) "inputManagerStates" %in% .xparents, 
    how = "bind", 
    options = list(coldepth = 4, namecols = TRUE)
  )
  
  properties <- rrapply(
    test_list, 
    condition = \(x, .xparents) "syncedProperties" %in% .xparents, 
    how = "bind", 
    options = list(coldepth = 4, namecols = TRUE)
  )
  
  times <- rrapply(
    test_list,
    condition = \(x, .xname) .xname == "time",
    how = "bind",
    options = list(namecols = TRUE)
  )
  
  ## merge into single data.frame
  out <- merge(times, properties[, -c(2, 3)], all = TRUE, by = "L1") |>
    merge(states[, -c(2, 3)], all = TRUE, by = "L1")
  
  return(out)
  
}

unnest_rrapply() |>
  tibble::as_tibble()
#> # A tibble: 46 × 7
#>    L1     time name      value inputId buttonState axisValue
#>    <chr> <dbl> <chr>     <chr> <chr>   <lgl>           <dbl>
#>  1 1      9.92 <NA>      <NA>  <NA>    NA            NA     
#>  2 2      9.94 timeStamp 97,2  InputY  FALSE          0     
#>  3 2      9.94 timeStamp 97,2  InputX  FALSE          0.0501
#>  4 2      9.94 timeStamp 97,2  xPos    FALSE          5     
#>  5 2      9.94 timeStamp 97,2  yPos    FALSE          0.0800
#>  6 2      9.94 timeStamp 97,2  zPos    FALSE          0     
#>  7 2      9.94 timeStamp 97,2  xRot    FALSE          0     
#>  8 2      9.94 timeStamp 97,2  yRot    FALSE         -0.707 
#>  9 2      9.94 timeStamp 97,2  zRot    FALSE          0     
#> 10 2      9.94 timeStamp 97,2  wRot    FALSE          0.708 
#> # … with 36 more rows
#> # ℹ Use `print(n = ...)` to see more rows

unnest_tidyr <- function() {
  
  test_list |>
    enframe(name = "frame", value = "value") |>
    unnest_wider(value) |>
    unnest(inputManagerStates, keep_empty = TRUE) |>
    unnest(syncedProperties, keep_empty = TRUE) |>
    unnest_wider(syncedProperties) |>
    unnest_wider(inputManagerStates)
  
}

microbenchmark::microbenchmark(
  unnest_tidyr(),
  unnest_rrapply()
)
#> Unit: milliseconds
#>              expr       min       lq     mean   median        uq      max neval
#>    unnest_tidyr() 17.993309 20.33476 23.13348 22.14034 24.767693 47.35227   100
#>  unnest_rrapply()  2.354847  2.60518  2.90372  2.74223  2.888071 13.38620   100

NB: losing the merges at the end of unnest_rrapply() and returning e.g. a list of data.frames (which is perhaps sufficient for OP's purposes) allows to further reduce the computation time by a factor of ±2.

  • Related