I have a data.table as follows:
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
I want to remove all the NaN and Inf values from this data.table.
date open high low close volume
1: 2021-11-24 NaN NaN NaN NaN 0
2: 2021-11-24 NaN NaN NaN NaN 0
3: 2021-11-26 0.43 0.43 0.43 0.43 2
4: 2021-11-24 0.17 0.17 0.17 0.17 10
5: 2021-11-26 0.19 0.19 0.19 0.19 75
6: 2021-11-24 0.15 0.15 0.15 0.15 1
7: 2021-11-24 NaN NaN NaN NaN 0
8: 2021-11-26 NaN NaN NaN NaN 0
9: 2021-11-26 NaN NaN NaN NaN 0
10: 2021-11-26 NaN NaN NaN NaN 0
11: 2021-11-26 NaN NaN NaN NaN 0
All the columns open, high, low, close are of character type due to the NaN values.
Is there a quick way to remove NaNs directly in data.table?
CodePudding user response:
Could you just use as.numeric to convert?
result = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5]))
output:
date volume open high low close
1: 2021-11-26 2 0.43 0.43 0.43 0.43
2: 2021-11-24 10 0.17 0.17 0.17 0.17
3: 2021-11-26 75 0.19 0.19 0.19 0.19
4: 2021-11-24 1 0.15 0.15 0.15 0.15
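If you would rather convert the columns in place than cbind the pieces back together, here is a minimal sketch of the same idea (assuming the question's column names; note that na.omit() drops NA/NaN but not Inf, which this sample data does not contain):
library(data.table)

# Convert the price columns to numeric by reference, then drop rows with NA/NaN
cols <- c("open", "high", "low", "close")
data[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
result <- na.omit(data, cols = cols)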
CodePudding user response:
One strategy would be to get the indices of the rows containing NaN, then filter those rows out.
library(dplyr)
data$Row <- row.names(data)
rm_rw <- data[apply(data, 1,
                    function(X) any(X == "NaN" | X == "Inf")), ] %>% pull(Row)
data[!row.names(data) %in% rm_rw, ] %>% select(-Row)
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
Update 1
Changed any(X == "NaN") to any(X == "NaN" | X == "Inf") so that Inf values are also filtered out.
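The same row-wise test can also be written without the helper Row column; an equivalent sketch on the original data, negating the logical vector directly:
# Keep rows in which no column equals "NaN" or "Inf"
keep <- !apply(data, 1, function(X) any(X == "NaN" | X == "Inf"))
data[keep]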
CodePudding user response:
The NaN values were quoted, so the columns were unnecessarily converted to character type.
> str(data)
Classes ‘data.table’ and 'data.frame': 11 obs. of 6 variables:
$ date : chr "2021-11-24" "2021-11-24" "2021-11-26" "2021-11-24" ...
$ open : chr "NaN" "NaN" "0.43" "0.17" ...
$ high : chr "NaN" "NaN" "0.43" "0.17" ...
$ low : chr "NaN" "NaN" "0.43" "0.17" ...
$ close : chr "NaN" "NaN" "0.43" "0.17" ...
$ volume: int 0 0 2 10 75 1 0 0 0 0 ...
We may need to convert the column types automatically and then use data.table methods: loop over the columns other than 'date' (specified via .SDcols), build a logical expression for each column, i.e. the values are not NaN (!is.nan) and (&) are finite (is.finite), Reduce the per-column logical vectors to a single vector with &, and use it to subset the rows.
library(data.table)
data <- type.convert(data, as.is = TRUE)
out <- data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]]
out
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
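Since is.finite() already returns FALSE for NaN, NA and Inf, the !is.nan(x) part could arguably be dropped; a simplified sketch (not from the answer) that should select the same rows:
# is.finite() alone is FALSE for NaN, NA and Inf
out2 <- data[data[, Reduce(`&`, lapply(.SD, is.finite)), .SDcols = -1]]
# identical(out, out2) should be TRUE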
CodePudding user response:
One way would be to find the indices of the rows containing NaN:
unique(which(data == NaN, arr.ind=T)[,1])
[1] 1 2 7 8 9 10 11
And then negate that index to remove these rows:
data[!unique(which(data == NaN, arr.ind=T)[,1])]
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
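Note that data == NaN only works here because the columns are character, so the numeric NaN is coerced to the string "NaN" before comparison; on numeric columns x == NaN is always NA. Written explicitly with strings, and extended to also catch "Inf" (a sketch; the sample data contains no Inf):
idx <- unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
data[!idx]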
Some benchmarks
Unit: microseconds
expr min lq mean median uq max neval cld
me 2466.653 2617.213 3262.990 2759.8645 3455.914 9236.749 100 ab
langtang 3590.978 3859.203 4651.293 4223.0400 5028.727 10117.352 100 b
akrun 742.534 842.931 1201.107 981.4145 1143.847 6101.451 100 a
paul 2970.088 3225.846 3707.531 3559.8195 3888.943 6502.238 100 b
Macosso 34435.574 36336.227 41126.294 38380.3570 41869.482 150544.220 100 c
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
data = do.call("rbind", replicate(1000, data, simplify = FALSE))
library(data.table)
library(dplyr)
library(dtplyr)
res = microbenchmark::microbenchmark(
me = data[!unique(which(data == NaN, arr.ind=T)[,1])],
langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),
akrun = data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]],
paul = data %>%
lazy_dt %>%
filter(across(2:5, ~ .x != "NaN")) %>%
as.data.table,
Macosso = {data$Row <- row.names(data);
rm_rw <- data[apply(data, 1,
function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row);
data[!row.names(data) %in% rm_rw ,] %>% select(-Row)
}
)
CodePudding user response:
A solution based on dtplyr:
library(dtplyr)
library(dplyr)
library(data.table)
data <- structure(
  list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
                "2021-11-24", "2021-11-26", "2021-11-24",
                "2021-11-24", "2021-11-26", "2021-11-26",
                "2021-11-26", "2021-11-26"),
       open = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       low = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
               "NaN", "NaN", "NaN", "NaN", "NaN"),
       close = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                 "NaN", "NaN", "NaN", "NaN", "NaN"),
       volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L, 0L)),
  row.names = c(NA, -11L),
  class = c("data.table", "data.frame"))
data %>%
lazy_dt %>%
filter(across(2:5, ~ !.x %in% c("NaN","Inf"))) %>%
as.data.table
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
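As a side note, recent dplyr versions prefer if_all() over across() inside filter(); a plain-dplyr sketch of the same filter, applied directly to the data.table:
data %>%
  filter(if_all(2:5, ~ !.x %in% c("NaN", "Inf")))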
In case the NaN and Inf values are not of character type, as @akrun mentions in his answer, the following is also a solution:
library(data.table)
# Not needed if NaN and Inf are not strings
data <- type.convert(data, as.is = TRUE)
data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
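The rowSums() trick relies on any NaN or Inf making the whole row sum non-finite, so it assumes every column selected by .SDcols is numeric after type.convert(); a quick sanity check of that assumption (not part of the original answer):
# All non-date columns should be numeric for rowSums() to apply
stopifnot(all(vapply(data[, .SD, .SDcols = -1], is.numeric, logical(1))))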