I have a nested list from reading a JSON that stores logging info from a video game. The time element of the list is a simple vector, while inputManagerStates and syncedProperties are lists that may contain 0 or more elements.
This is a follow-up on THIS question, where with some help, I managed to get the data into rectangular format. Unfortunately, I have a lot of such JSON files and unnest_wider
seems to run quite slowly.
The list:
test_list <-
list(list(time = 9.92405605316162, inputManagerStates = list(),
syncedProperties = list()), list(time = 9.9399995803833,
inputManagerStates = list(list(inputId = "InputY", buttonState = FALSE,
axisValue = 0), list(inputId = "InputX", buttonState = FALSE,
axisValue = 0.0501395985484123), list(inputId = "xPos",
buttonState = FALSE, axisValue = 5), list(inputId = "yPos",
buttonState = FALSE, axisValue = 0.0799999982118607),
list(inputId = "zPos", buttonState = FALSE, axisValue = 0),
list(inputId = "xRot", buttonState = FALSE, axisValue = 0),
list(inputId = "yRot", buttonState = FALSE, axisValue = -0.70664256811142),
list(inputId = "zRot", buttonState = FALSE, axisValue = 0),
list(inputId = "wRot", buttonState = FALSE, axisValue = 0.707570731639862)),
syncedProperties = list(list(name = "timeStamp", value = "97,2"))),
list(time = 9.95659446716309, inputManagerStates = list(list(
inputId = "InputY", buttonState = FALSE, axisValue = 0),
list(inputId = "InputX", buttonState = FALSE, axisValue = 0.0993990004062653),
list(inputId = "xPos", buttonState = FALSE, axisValue = 5),
list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0799999982118607),
list(inputId = "zPos", buttonState = FALSE, axisValue = 0),
list(inputId = "xRot", buttonState = FALSE, axisValue = 0),
list(inputId = "yRot", buttonState = FALSE, axisValue = -0.705721318721771),
list(inputId = "zRot", buttonState = FALSE, axisValue = 0),
list(inputId = "wRot", buttonState = FALSE, axisValue = 0.708489596843719)),
syncedProperties = list(list(name = "timeStamp", value = "97,21667"))),
list(time = 20.0626411437988, inputManagerStates = list(list(
inputId = "InputY", buttonState = FALSE, axisValue = 0.601816594600677),
list(inputId = "InputX", buttonState = FALSE, axisValue = 0),
list(inputId = "xPos", buttonState = FALSE, axisValue = -1.31777036190033),
list(inputId = "yPos", buttonState = FALSE, axisValue = 0.0800001174211502),
list(inputId = "zPos", buttonState = FALSE, axisValue = 6.08214092254639),
list(inputId = "xRot", buttonState = FALSE, axisValue = 0),
list(inputId = "yRot", buttonState = FALSE, axisValue = -0.391442984342575),
list(inputId = "zRot", buttonState = FALSE, axisValue = 0),
list(inputId = "wRot", buttonState = FALSE, axisValue = 0.920202374458313)),
syncedProperties = list(list(name = "timeStamp", value = "107,3167"),
list(name = "previousGameState", value = "1"), list(
name = "newGameState", value = "2"))))
Code I am using to rectangularize the list:
library(tidyverse)
output_df <-
test_list %>%
tibble::enframe(name = "frame", value = "value") %>%
tidyr::unnest_wider(value) %>%
tidyr::unnest(inputManagerStates, keep_empty = TRUE) %>%
tidyr::unnest(syncedProperties, keep_empty = TRUE) %>%
tidyr::unnest_wider(syncedProperties) %>%
tidyr::unnest_wider(inputManagerStates)
output_df
#> # A tibble: 46 x 7
#> frame time inputId buttonState axisValue name value
#> <int> <dbl> <chr> <lgl> <dbl> <chr> <chr>
#> 1 1 9.92 <NA> NA NA <NA> <NA>
#> 2 2 9.94 InputY FALSE 0 timeStamp 97,2
#> 3 2 9.94 InputX FALSE 0.0501 timeStamp 97,2
#> 4 2 9.94 xPos FALSE 5 timeStamp 97,2
#> 5 2 9.94 yPos FALSE 0.0800 timeStamp 97,2
#> 6 2 9.94 zPos FALSE 0 timeStamp 97,2
#> 7 2 9.94 xRot FALSE 0 timeStamp 97,2
#> 8 2 9.94 yRot FALSE -0.707 timeStamp 97,2
#> 9 2 9.94 zRot FALSE 0 timeStamp 97,2
#> 10 2 9.94 wRot FALSE 0.708 timeStamp 97,2
#> # ... with 36 more rows
Created on 2022-08-24 with reprex v2.0.2
For my data unnest
is fairly fast, but unnest_wider
is quite slow. The first unnest_wider(value)
can be easily written in base R - cbind(., do.call("rbind", .$value))
- and is much faster:
microbenchmark::microbenchmark(
unnest_wider =
test_list %>%
tibble::enframe(name = "frame", value = "value") %>%
tidyr::unnest_wider(value),
baser_r =
test_list %>%
tibble::enframe(name = "frame", value = "value") %>%
cbind(., do.call("rbind", .$value)) %>%
select(-value)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> unnest_wider 3.1446 3.34645 4.031113 3.63625 4.22770 10.5289 100 b
#> baser_r 1.4005 1.48225 1.770210 1.63475 1.86465 5.0407 100 a
Created on 2022-08-24 with reprex v2.0.2
I am looking for a way to replace %>% tidyr::unnest_wider(syncedProperties) %>% tidyr::unnest_wider(inputManagerStates)
with faster code but the cbind
solution doesn't work because of different number of rows.
EDIT: Think this may be possible with unnest::unnest()
but couldn't achieve the desired structure with it (while tidytable::unnest_wider.
currently supports only vectors).
CodePudding user response:
You could try rrapply
with how = "bind"
in the rrapply
-package (a revisit of base rapply
). This option can be efficient when unnesting replicated lists, such as inputManagerStates
:
library(rrapply)
getStates <- function(lst) {
rrapply(
lst,
condition = \(x, .xparents) "inputManagerStates" %in% .xparents,
how = "bind",
options = list(coldepth = 4, namecols = TRUE)
)
}
getStates(test_list) |>
head()
#> L1 L2 L3 inputId buttonState axisValue
#> 1 2 inputManagerStates 1 InputY FALSE 0.0000000
#> 2 2 inputManagerStates 2 InputX FALSE 0.0501396
#> 3 2 inputManagerStates 3 xPos FALSE 5.0000000
#> 4 2 inputManagerStates 4 yPos FALSE 0.0800000
#> 5 2 inputManagerStates 5 zPos FALSE 0.0000000
#> 6 2 inputManagerStates 6 xRot FALSE 0.0000000
microbenchmark::microbenchmark(
getStates(test_list)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> getStates(test_list) 518.573 556.7075 839.9312 587.677 675.444 18465.56 100
To get to the rectangular format in the question, a possible approach could be to first unnest the time
, inputManagerStates
and syncedProperties
lists individually and then merge them back together into a single data.frame. This already gives a ±10 times speedup compared to the approach in the question:
library(tidyr)
library(tibble)
library(rrapply)
unnest_rrapply <- function() {
## bind individual data.frames
states <- rrapply(
test_list,
condition = \(x, .xparents) "inputManagerStates" %in% .xparents,
how = "bind",
options = list(coldepth = 4, namecols = TRUE)
)
properties <- rrapply(
test_list,
condition = \(x, .xparents) "syncedProperties" %in% .xparents,
how = "bind",
options = list(coldepth = 4, namecols = TRUE)
)
times <- rrapply(
test_list,
condition = \(x, .xname) .xname == "time",
how = "bind",
options = list(namecols = TRUE)
)
## merge into single data.frame
out <- merge(times, properties[, -c(2, 3)], all = TRUE, by = "L1") |>
merge(states[, -c(2, 3)], all = TRUE, by = "L1")
return(out)
}
unnest_rrapply() |>
tibble::as_tibble()
#> # A tibble: 46 × 7
#> L1 time name value inputId buttonState axisValue
#> <chr> <dbl> <chr> <chr> <chr> <lgl> <dbl>
#> 1 1 9.92 <NA> <NA> <NA> NA NA
#> 2 2 9.94 timeStamp 97,2 InputY FALSE 0
#> 3 2 9.94 timeStamp 97,2 InputX FALSE 0.0501
#> 4 2 9.94 timeStamp 97,2 xPos FALSE 5
#> 5 2 9.94 timeStamp 97,2 yPos FALSE 0.0800
#> 6 2 9.94 timeStamp 97,2 zPos FALSE 0
#> 7 2 9.94 timeStamp 97,2 xRot FALSE 0
#> 8 2 9.94 timeStamp 97,2 yRot FALSE -0.707
#> 9 2 9.94 timeStamp 97,2 zRot FALSE 0
#> 10 2 9.94 timeStamp 97,2 wRot FALSE 0.708
#> # … with 36 more rows
#> # ℹ Use `print(n = ...)` to see more rows
unnest_tidyr <- function() {
test_list |>
enframe(name = "frame", value = "value") |>
unnest_wider(value) |>
unnest(inputManagerStates, keep_empty = TRUE) |>
unnest(syncedProperties, keep_empty = TRUE) |>
unnest_wider(syncedProperties) |>
unnest_wider(inputManagerStates)
}
microbenchmark::microbenchmark(
unnest_tidyr(),
unnest_rrapply()
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> unnest_tidyr() 17.993309 20.33476 23.13348 22.14034 24.767693 47.35227 100
#> unnest_rrapply() 2.354847 2.60518 2.90372 2.74223 2.888071 13.38620 100
NB: losing the merges at the end of unnest_rrapply()
and returning e.g. a list of data.frames (which is perhaps sufficient for OP's purposes) allows to further reduce the computation time by a factor of ±2.