I have a data.table as follows:
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
I want to remove all the NaN and Inf values from this data.table.
date open high low close volume
1: 2021-11-24 NaN NaN NaN NaN 0
2: 2021-11-24 NaN NaN NaN NaN 0
3: 2021-11-26 0.43 0.43 0.43 0.43 2
4: 2021-11-24 0.17 0.17 0.17 0.17 10
5: 2021-11-26 0.19 0.19 0.19 0.19 75
6: 2021-11-24 0.15 0.15 0.15 0.15 1
7: 2021-11-24 NaN NaN NaN NaN 0
8: 2021-11-26 NaN NaN NaN NaN 0
9: 2021-11-26 NaN NaN NaN NaN 0
10: 2021-11-26 NaN NaN NaN NaN 0
11: 2021-11-26 NaN NaN NaN NaN 0
All the columns open, high, low, close are of character type due to the NaN values.
Is there a quick way to remove NaNs directly in data.table?
CodePudding user response:
Could you just use as.numeric to convert?
result = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5]))
output:
date volume open high low close
1: 2021-11-26 2 0.43 0.43 0.43 0.43
2: 2021-11-24 10 0.17 0.17 0.17 0.17
3: 2021-11-26 75 0.19 0.19 0.19 0.19
4: 2021-11-24 1 0.15 0.15 0.15 0.15
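If you would rather convert the columns in place than cbind the pieces back together, here is a minimal sketch of the same idea (assuming the question's column names; note that na.omit() drops NA/NaN but not Inf, which this sample data does not contain):
library(data.table)

# Convert the price columns to numeric by reference, then drop rows with NA/NaN
cols <- c("open", "high", "low", "close")
data[, (cols) := lapply(.SD, as.numeric), .SDcols = cols]
result <- na.omit(data, cols = cols)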
CodePudding user response:
One strategy would be to get the indices of the rows containing NaN, then filter those rows out.
library(dplyr)
data$Row <- row.names(data)
rm_rw <- data[apply(data, 1,
                    function(X) any(X == "NaN" | X == "Inf")), ] %>% pull(Row)
data[!row.names(data) %in% rm_rw, ] %>% select(-Row)
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
Update 1
Changed any(X == "NaN") to any(X == "NaN" | X == "Inf") so that Inf values are also filtered out.
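The same row-wise test can also be written without the helper Row column; an equivalent sketch on the original data, negating the logical vector directly:
# Keep rows in which no column equals "NaN" or "Inf"
keep <- !apply(data, 1, function(X) any(X == "NaN" | X == "Inf"))
data[keep]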
CodePudding user response:
The NaN values were quoted, so the columns were unnecessarily converted to character type.
> str(data)
Classes ‘data.table’ and 'data.frame': 11 obs. of 6 variables:
$ date : chr "2021-11-24" "2021-11-24" "2021-11-26" "2021-11-24" ...
$ open : chr "NaN" "NaN" "0.43" "0.17" ...
$ high : chr "NaN" "NaN" "0.43" "0.17" ...
$ low : chr "NaN" "NaN" "0.43" "0.17" ...
$ close : chr "NaN" "NaN" "0.43" "0.17" ...
$ volume: int 0 0 2 10 75 1 0 0 0 0 ...
We may need to convert the column types automatically and then use data.table methods: loop over the columns other than 'date' (specified via .SDcols), build a logical expression for each column, i.e. the values are not NaN (!is.nan) and (&) are finite (is.finite), Reduce the per-column logical vectors to a single vector with &, and use it to subset the rows.
library(data.table)
data <- type.convert(data, as.is = TRUE)
out <- data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]]
out
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
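Since is.finite() already returns FALSE for NaN, NA and Inf, the !is.nan(x) part could arguably be dropped; a simplified sketch (not from the answer) that should select the same rows:
# is.finite() alone is FALSE for NaN, NA and Inf
out2 <- data[data[, Reduce(`&`, lapply(.SD, is.finite)), .SDcols = -1]]
# identical(out, out2) should be TRUE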
CodePudding user response:
One way would be to find the indices of the rows containing NaN:
unique(which(data == NaN, arr.ind=T)[,1])
[1] 1 2 7 8 9 10 11
And then negate that index to remove these rows:
data[!unique(which(data == NaN, arr.ind=T)[,1])]
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
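Note that data == NaN only works here because the columns are character, so the numeric NaN is coerced to the string "NaN" before comparison; on numeric columns x == NaN is always NA. Written explicitly with strings, and extended to also catch "Inf" (a sketch; the sample data contains no Inf):
idx <- unique(which(data == "NaN" | data == "Inf", arr.ind = TRUE)[, 1])
data[!idx]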
Some benchmarks
Unit: microseconds
expr min lq mean median uq max neval cld
me 2466.653 2617.213 3262.990 2759.8645 3455.914 9236.749 100 ab
langtang 3590.978 3859.203 4651.293 4223.0400 5028.727 10117.352 100 b
akrun 742.534 842.931 1201.107 981.4145 1143.847 6101.451 100 a
paul 2970.088 3225.846 3707.531 3559.8195 3888.943 6502.238 100 b
Macosso 34435.574 36336.227 41126.294 38380.3570 41869.482 150544.220 100 c
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
data = do.call("rbind", replicate(1000, data, simplify = FALSE))
library(data.table)
library(dplyr)
library(dtplyr)
res = microbenchmark::microbenchmark(
me = data[!unique(which(data == NaN, arr.ind=T)[,1])],
langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),
akrun = data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]],
paul = data %>%
lazy_dt %>%
filter(across(2:5, ~ .x != "NaN")) %>%
as.data.table,
Macosso = {data$Row <- row.names(data);
rm_rw <- data[apply(data, 1,
function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row);
data[!row.names(data) %in% rm_rw ,] %>% select(-Row)
}
)
CodePudding user response:
A solution based on dtplyr:
library(dtplyr)
library(dplyr)
library(data.table)
data <- structure(
  list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
                "2021-11-24", "2021-11-26", "2021-11-24",
                "2021-11-24", "2021-11-26", "2021-11-26",
                "2021-11-26", "2021-11-26"),
       open = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       low = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
               "NaN", "NaN", "NaN", "NaN", "NaN"),
       close = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                 "NaN", "NaN", "NaN", "NaN", "NaN"),
       volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L, 0L)),
  row.names = c(NA, -11L),
  class = c("data.table", "data.frame"))
data %>%
lazy_dt %>%
filter(across(2:5, ~ !.x %in% c("NaN","Inf"))) %>%
as.data.table
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
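As a side note, recent dplyr versions prefer if_all() over across() inside filter(); a plain-dplyr sketch of the same filter, applied directly to the data.table:
data %>%
  filter(if_all(2:5, ~ !.x %in% c("NaN", "Inf")))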
In case the NaN and Inf values are not of character type, as @akrun mentions in his answer, the following is also a solution:
library(data.table)
# Not needed if NaN and Inf are not strings
data <- type.convert(data, as.is = TRUE)
data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
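The rowSums() trick relies on any NaN or Inf making the whole row sum non-finite, so it assumes every column selected by .SDcols is numeric after type.convert(); a quick sanity check of that assumption (not part of the original answer):
# All non-date columns should be numeric for rowSums() to apply
stopifnot(all(vapply(data[, .SD, .SDcols = -1], is.numeric, logical(1))))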