Here's the issue in a nutshell. sessionInfo
is listed below the problem description to assist with troubleshooting (data.table_1.14.2)
When using rbindlist()
or even do.call(rbind, l)
, I'm unable to create an output data.frame/data.table
. Sometimes I get a strange matrix instead, which seems to contain only the values of the new column I've calculated, while the rest of the data.table
is lost. If I build the data frames up by hand, I can bind them successfully, but not programmatically. I've used similar logic at work for about 9 months, and for some reason it is not working here. Has anyone else come across this?
I'm able to scope into the lapply()
and produce individual data frames as desired (according to the split logic, in this case by location), but running the entire rbind
in any form is failing.
Thanks in advance. There seems to be a ticket for data.table
's dev
branch here, and I've logged my observation.
# The most troubling
> rbindlist(list(1:3, 4:6))
Error in rbindlist(list(1:3, 4:6)) :
Item 1 of input is not a data.frame, data.table or list
# The main issue I'm having
> temp_data <- structure(list(location_id = c(75L, 75L, 75L, 75L, 80L, 80L,
80L, 80L), date = structure(c(19144L, 19145L, 19146L, 19147L,
19144L, 19145L, 19146L, 19147L), class = c("IDate", "Date")),
cases_cum = c(4289988, 4293027, 4295818, 4298654, 29570762,
29595892, 29621064, 29641606), population = c(8916185.49099959,
8916185.49099959, 8916185.49099959, 8916185.49099959, 66204314.9643178,
66204314.9643178, 66204314.9643178, 66204314.9643178), location_name = c("Austria",
"Austria", "Austria", "Austria", "France", "France", "France",
"France")), class = c("data.table", "data.frame"), row.names = c(NA,
-8L), sorted = "location_id")
> str(temp_data)
Classes 'data.table' and 'data.frame': 8 obs. of 5 variables:
$ location_id : int 75 75 75 75 80 80 80 80
$ date : IDate, format: "2022-06-01" "2022-06-02" "2022-06-03" "2022-06-04" ...
$ cases_cum : num 4289988 4293027 4295818 4298654 29570762 ...
$ population : num 8916185 8916185 8916185 8916185 66204315 ...
$ location_name: chr "Austria" "Austria" "Austria" "Austria" ...
- attr(*, "sorted")= chr "location_id"
> test1 <-
rbindlist(
lapply(split(temp_data, temp_data$location_id), function(x) {
x <- x[order(x$date),]
x$cases_daily <- c(NA,diff(x$cases_cum))
}))
Error in rbindlist(lapply(split(temp_data, temp_data$location_id), function(x) { :
Item 1 of input is not a data.frame, data.table or list
> test1
Error: object 'test1' not found
> test2 <- do.call(rbind,
lapply(split(temp_data, temp_data$location_id), function(x) {
x <- x[order(x$date)]
x$cases_daily <- c(NA,diff(x$cases_cum))
})
)
> test2
[,1] [,2] [,3] [,4]
75 NA 3039 2791 2836
80 NA 25130 25172 20542
# Scoping in works fine
> x <- temp_data[location_id == 80]
> x <- x[order(x$date)]
> x$cases_daily <- c(NA,diff(x$cases_cum))
> x
location_id date cases_cum population location_name cases_daily
1: 80 2022-06-01 29570762 66204315 France NA
2: 80 2022-06-02 29595892 66204315 France 25130
3: 80 2022-06-03 29621064 66204315 France 25172
4: 80 2022-06-04 29641606 66204315 France 20542
> y <- temp_data[location_id == 75]
> y <- y[order(y$date)]
> y$cases_daily <- c(NA,diff(y$cases_cum))
> y
location_id date cases_cum population location_name cases_daily
1: 75 2022-06-01 4289988 8916185 Austria NA
2: 75 2022-06-02 4293027 8916185 Austria 3039
3: 75 2022-06-03 4295818 8916185 Austria 2791
4: 75 2022-06-04 4298654 8916185 Austria 2836
> rbindlist(list(x,y))
location_id date cases_cum population location_name cases_daily
1: 80 2022-06-01 29570762 66204315 France NA
2: 80 2022-06-02 29595892 66204315 France 25130
3: 80 2022-06-03 29621064 66204315 France 25172
4: 80 2022-06-04 29641606 66204315 France 20542
5: 75 2022-06-01 4289988 8916185 Austria NA
6: 75 2022-06-02 4293027 8916185 Austria 3039
7: 75 2022-06-03 4295818 8916185 Austria 2791
8: 75 2022-06-04 4298654 8916185 Austria 2836
> test3 <- lapply(split(temp_data, temp_data$location_id), function(x) {
x <- x[order(x$date)]
x$cases_daily <- c(NA,diff(x$cases_cum))
})
> test3
$`75`
[1] NA 3039 2791 2836
$`80`
[1] NA 25130 25172 20542
> sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 10 (buster)
Matrix products: default
BLAS/LAPACK: /opt/intel/compilers_and_libraries_2020.4.304/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.8 purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.6
[8] ggplot2_3.3.5 tidyverse_1.3.1 data.table_1.14.2
CodePudding user response:
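You assigned to x$cases_daily
but forgot to return the whole x
. An R function returns the value of its last expression, and the value of an assignment is its right-hand side, so the function currently returns only the new cases_daily vector instead of the modified table.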
# Split the data by location, derive the day-over-day change in cumulative
# cases within each piece, and stack the pieces back into one data.table.
test1 <- data.table::rbindlist(
  lapply(
    split(temp_data, temp_data$location_id),
    function(loc) {
      loc <- loc[order(loc$date), ]
      # The first day has no predecessor, hence the leading NA.
      loc$cases_daily <- c(NA, diff(loc$cases_cum))
      # Return the whole table: an assignment evaluates to its right-hand
      # side, so without this line only the new vector would be returned.
      loc
    }
  )
)
head(test1)
# location_id date cases_cum population location_name cases_daily
# 1: 75 2022-06-01 4289988 8916185 Austria NA
# 2: 75 2022-06-02 4293027 8916185 Austria 3039
# 3: 75 2022-06-03 4295818 8916185 Austria 2791
# 4: 75 2022-06-04 4298654 8916185 Austria 2836
# 5: 80 2022-06-01 29570762 66204315 France NA
# 6: 80 2022-06-02 29595892 66204315 France 25130
Note that you may also use by()
instead of lapply(split(...))
.
# Same computation using by(): it applies the function to each location's
# subset of rows, and rbindlist() stacks the returned tables.
data.table::rbindlist(
  by(
    temp_data,
    INDICES = temp_data$location_id,
    FUN = function(loc) {
      loc <- loc[order(loc$date), ]
      # The first day has no predecessor, hence the leading NA.
      loc$cases_daily <- c(NA, diff(loc$cases_cum))
      # Return the full table, not just the value of the assignment above.
      loc
    }
  )
)