I have a fairly large data.table that comes from an SQL table.
All columns containing missing values in SQL are replaced by NULLs in the data.table so that these columns are actually lists containing values and also missing values.
I would like an efficient way to replace the NULLs with NAs and then convert the column (list) to a real data.table column
This is an example to reproduce my case :
library(data.table)
n = 10^6
l1 = as.list(rnorm(n, 10, 25))
l2 = as.list(rnorm(n, 0, 200))
l3 = as.list(rnorm(n))
df = data.table(a = runif(n),
b = l1,
c = l2,
d = rnorm(n, 88, 0.5),
e = l3
)
# create an index vector to set to NULL
id1 = sample(1:n, 0.26*n)
id2 = sample(1:n, 0.60*n)
id3 = sample(1:n, 0.09*n)
# set to NULL
df$b[id1] = list(NULL)
df$c[id2] = list(NULL)
df$e[id3] = list(NULL)
This is what I have done but it's a bit too long :
type = data.frame(type = sapply(df, class))
col = names(df)[which(type$type == "list")]
type = data.frame(type = sapply(df, class))
col = names(df)[which(type$type == "list")]
# ----------- FIRST WAY -----------------------------------------------------------------------
system.time(
df[, (col) := lapply(.SD, function(i) unlist(lapply(i, function(x) ifelse(is.null(x), NA, x)))), .SDcols = col]
)
# ----------- SECOND WAY (a little bit faster) ---------------------------
system.time(
for (i in col) {
df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
}
)
Why the 1st solution is slower than second ? Anybody has a better way ?
CodePudding user response:
We may use set
here
library(data.table)
df1 <- copy(df)
system.time({
for(nm in col) {
i1 <- which(lengths(df1[[nm]]) == 0)
set(df1, i = i1, j = nm, value = list(NA))
df1[[nm]] <- unlist(df1[[nm]])
}
})
# user system elapsed
# 0.158 0.004 0.161
Compared with OP's second method
system.time(
for (i in col) {
df[, eval(i) := unlist(lapply(get(i), function(x) ifelse(is.null(x), NA, x)))]
}
)
# user system elapsed
# 5.618 0.157 5.756
-checking output
> all.equal(df, df1)
[1] TRUE
CodePudding user response:
One solution based on lengths
function:
cols = which(sapply(df, is.list))
df[, (cols) := lapply(.SD, \(x) {x[lengths(x)==0L] = NA; as.numeric(x)}), .SDcols=cols]