I am working on a bigger dataset than the one I attached below, and I need to re-encode the double
type columns. I tried to use prettyNum
in a function called encoder,
but it runs really slowly on my data. Here is what I tried:
library(data.table)
# Reproducible example data.
# Columns a-b are integer draws, c-h are double draws on a 0.01 grid, and
# i-j are letters. a-h hold 100 draws each while i-j hold 1000, so
# data.frame() recycles the shorter columns and the result has 1000 rows.
set.seed(1453)
sample_data <- data.frame(
  a = sample(1:1000, 100, replace = TRUE),
  b = sample(1:1000, 100, replace = TRUE),
  c = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  d = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  e = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  f = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  g = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  h = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  i = sample(LETTERS, 1000, replace = TRUE),
  j = sample(letters, 1000, replace = TRUE)
)
# Convert the data.frame to a data.table by reference so that columns can be
# overwritten in place with `:=`.
setDT(sample_data)
# NOTE(review): this silences every warning for the rest of the session;
# presumably it is here to hide the warning about big.mark being the same
# character as the decimal mark -- confirm before relying on it.
options(warn = -1)
# Named integer positions of the double columns. vapply() guarantees a
# logical vector whatever the number of columns, unlike sapply().
double_cols <- which(vapply(sample_data, is.double, logical(1L)))
# Encode a numeric vector as strings: scale by 1e4, then insert "." as the
# thousands separator.
#
# x: numeric vector.
# Returns a character vector with one formatted string per element of x.
#
# prettyNum() formats each element with format(), which switches to
# scientific notation for round values (e.g. 30 * 1e4 -> "3e+05") and then no
# separator can be inserted. `scientific = FALSE` is forwarded to format() so
# such values are written out in full ("300.000").
encoder <- function(x) {
  prettyNum(x * 1e4, big.mark = ".", scientific = FALSE)
}
# Overwrite each double column, in place, with its encoded character form.
sample_data[, (double_cols) := lapply(.SD, encoder),
            .SDcols = double_cols]
It already works, but I believe there is a much faster solution.
Thanks in advance.
CodePudding user response:
You could use format
instead of prettyNum:
library(data.table)
# NOTE(review): turns off all warnings for the session; presumably to hide the
# big.mark / decimal mark warning raised by the encoders -- confirm.
options(warn = -1)

# Work on two independent deep copies so that each encoder starts from the
# same, unmodified data.table.
setDT(sample_data)
sample_data1 <- copy(sample_data)
sample_data2 <- copy(sample_data)
# Baseline encoder: scale by 1e4 and insert "." as the thousands separator
# using prettyNum(), which formats the vector one element at a time.
#
# x: numeric vector.
# Returns a character vector with one formatted string per element of x.
#
# `scientific = FALSE` is forwarded to format() so that round values such as
# 30 (-> 3e5) are written as "300.000" rather than "3e+05".
encoder1 <- function(x) {
  prettyNum(x * 1e4, big.mark = ".", scientific = FALSE)
}
# Faster encoder: scale by 1e4 and let format() handle the whole vector at
# once, inserting "." as the thousands separator.
#
# x: numeric vector.
# Returns a character vector with one formatted string per element of x;
# trim = TRUE suppresses the padding to a common width.
#
# format() picks fixed or scientific notation for the vector as a whole, so a
# vector made only of round values (e.g. c(30, 200)) would come back as
# "3e+05", "2e+06". scientific = FALSE forces fixed notation.
encoder2 <- function(x) {
  format(x * 1e4, big.mark = ".", trim = TRUE, scientific = FALSE)
}
# Time the prettyNum()-based encoder on the first copy (columns are replaced
# in place).
system.time(
  sample_data1[, (double_cols) := lapply(.SD, encoder1), .SDcols = double_cols]
)
user system total
1.27 0.01 1.26
# Time the format()-based encoder on the second copy (columns are replaced
# in place).
system.time(
  sample_data2[, (double_cols) := lapply(.SD, encoder2), .SDcols = double_cols]
)
user system total
0.08 0.00 0.08
CodePudding user response:
Maybe try sprintf.
The gain appears significant.
1. With your function
- Code
# Reproducible example data.
# Columns a-b are integer draws, c-h are double draws on a 0.01 grid, and
# i-j are letters. a-h hold 100 draws each while i-j hold 1000, so
# data.frame() recycles the shorter columns and the result has 1000 rows.
set.seed(1453)
sample_data <- data.frame(
  a = sample(1:1000, 100, replace = TRUE),
  b = sample(1:1000, 100, replace = TRUE),
  c = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  d = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  e = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  f = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  g = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  h = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  i = sample(LETTERS, 1000, replace = TRUE),
  j = sample(letters, 1000, replace = TRUE)
)
# Named integer positions of the double columns. vapply() guarantees a
# logical vector whatever the number of columns, unlike sapply().
double_cols <- which(vapply(sample_data, is.double, logical(1L)))
# Baseline encoder from the question: scale by 1e4 and insert "." as the
# thousands separator with prettyNum().
#
# x: numeric vector.
# Returns a character vector with one formatted string per element of x.
#
# `scientific = FALSE` is forwarded to format() so that round values such as
# 30 (-> 3e5) are written as "300.000" rather than "3e+05".
encoder <- function(x) {
  prettyNum(x * 1e4, big.mark = ".", scientific = FALSE)
}
# Time the baseline: convert to a data.table by reference, then replace every
# double column with its encoded strings.
system.time(
  setDT(sample_data)[, (double_cols) := lapply(.SD, encoder),
                     .SDcols = is.double][]
)
- Output
utilisateur système écoulé
2.75 0.00 2.86
2. With the sprintf function
- Code
# Reproducible example data.
# Columns a-b are integer draws, c-h are double draws on a 0.01 grid, and
# i-j are letters. a-h hold 100 draws each while i-j hold 1000, so
# data.frame() recycles the shorter columns and the result has 1000 rows.
set.seed(1453)
sample_data <- data.frame(
  a = sample(1:1000, 100, replace = TRUE),
  b = sample(1:1000, 100, replace = TRUE),
  c = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  d = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  e = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  f = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  g = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  h = sample(seq(1, 1000, 0.01), 100, replace = TRUE),
  i = sample(LETTERS, 1000, replace = TRUE),
  j = sample(letters, 1000, replace = TRUE)
)
# Named integer positions of the double columns. vapply() guarantees a
# logical vector whatever the number of columns, unlike sapply().
double_cols <- which(vapply(sample_data, is.double, logical(1L)))
# sprintf()-based encoder: scale by 1e4, render each value with up to nine
# significant digits in one vectorised sprintf() call, then let prettyNum()
# insert "." as the thousands separator into the resulting strings.
#
# x: numeric vector.
# Returns a character vector with one formatted string per element of x.
encoder2 <- function(x) {
  digits_chr <- sprintf("%.9g", 1e4 * x)
  prettyNum(digits_chr, big.mark = ".")
}
# Time the sprintf()-based encoder: convert to a data.table by reference, then
# replace every double column with its encoded strings.
system.time(
  setDT(sample_data)[, (double_cols) := lapply(.SD, encoder2),
                     .SDcols = is.double][]
)
- Output
utilisateur système écoulé
0.16 0.00 0.16