I wrote a function which gives me the expected result when I run it on a numeric or a list :
library(data.table)
my.fun <- function(X, k=0, chaine="") {
Y = X - (X %/% 1e8) * (10**8)
while (floor(Y / (37**k))) {
k <- k 1
}
vloop <- seq(from = k-1, to=0, by=-1)
for (i in vloop) {
fixe <- floor(Y / (37**i))
if (fixe>9) {
if (fixe==36) { mon.car <- "" } else { mon.car <- intToUtf8(fixe 55) }
} else { mon.car <- fixe }
ext <- fixe*(37**i)
Y <- Y-ext
chaine <- stringr::str_c(chaine, mon.car)
}
chaine
}
my.fun(543916151)
foo <- list(543916151, 400001449)
lapply(foo, my.fun)
But when I want to use it on the columns of a data.table
, I don't always get the expected result :
DT1 <- data.table(V1 = c(505926406, 515349272, 543916151),
V2 = c(505926406, 400000336, 400001449))
DT2 <- data.table(V1 = c(543916151),
V2 = c(400001449))
DT3 <- data.table(V1 = c(543916151, 543916151),
V2 = c(400001449, 400000336))
cols <- c("V1", "V2")
newcols <- c("C1", "C2")
DT1[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
DT2[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
DT3[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
- DT1 : third row of C1 is incorrect
- DT2 : with just one row in the data.table, it's correct.
- DT3 : with two rows of the same values, "strange" behavior.
The lapply
returns warnings :
the condition has length > 1 and only the first element will be ...
I think I understand that the issue is due to the if that is not vectorized/vectorizable ? And that would be the reason why the function does not run correctly on columns ?
My knowledge is not deep enough to deal with this issue. So thank you for your help.
---EDIT---
Here is the warnings I get :
1: In while (floor(Y/(37^k))) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
2: In while (floor(Y/(37^k))) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
3: In while (floor(Y/(37^k))) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
4: In while (floor(Y/(37^k))) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
5: In while (floor(Y/(37^k))) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
6: In if (fixe > 9) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
7: In if (fixe > 9) { ... :
la condition a une longueur > 1 et seul le premier élément est utilisé
...
...
CodePudding user response:
DT1[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]
DT2[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]
DT3[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]
Above still goes wrong on cases where both V1 and V2 are equal in multiple rows. You could create id and remove it again after grouping
DT4[, id := .I][, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = id][, id := NULL]
CodePudding user response:
Vectorize my.fun
, either by re-writing it or by using Vectorize
:
library(data.table)
my.fun <- function(X, k=0, chaine="") {
Y <- X - (X %/% 1e8) * (10**8)
while (floor(Y / (37**k))) {
k <- k 1L
}
vloop <- seq(from = k-1, to=0, by=-1)
for (i in vloop) {
fixe <- floor(Y / (37**i))
if (fixe>9) {
if (fixe==36) { mon.car <- "" } else { mon.car <- intToUtf8(fixe 55) }
} else { mon.car <- fixe }
ext <- fixe*(37**i)
Y <- Y-ext
chaine <- stringr::str_c(chaine, mon.car)
}
chaine
}
# vectorized version of my.fun
my.fun.vecb <- function(X, k = 0, chaine = character(length(X))) {
Y <- X %% 1e8
k <- pmax(ceiling(log(Y, 37)), k) - 1
k37 <- 37^k
mon.car <- function(fixe, n) {
chr <- character(n)
blnGT9 <- fixe > 9
blnLetter <- blnGT9 & fixe != 36
chr[blnLetter] <- intToUtf8(fixe[blnLetter] 55, multiple = TRUE)
chr[!blnGT9] <- as.character(fixe[!blnGT9])
return(chr)
}
blnk <- rep(TRUE, length(X))
while (length(Y)) {
k37 <- 37^k
chaine[blnk] <- paste0(chaine[blnk], mon.car(Y %/% k37, length(Y)))
blnk[blnk] <- k > 0
k <- k[blnk] - 1
Y <- Y[blnk] %% k37[blnk]
}
return(chaine)
}
DT1a <- data.table(V1 = c(505926406, 515349272, 543916151),
V2 = c(505926406, 400000336, 400001449))
DT2a <- data.table(V1 = c(543916151),
V2 = c(400001449))
DT3a <- data.table(V1 = c(543916151, 543916151),
V2 = c(400001449, 400000336))
cols <- c("V1", "V2")
newcols <- c("C1", "C2")
# function to call my.fun element-wise over a data.table
f_loop <- function(dt) {
dt[, (newcols) := character(0)]
for (i in 1:nrow(dt)) {
for (j in seq_along(cols)){
dt[i, (newcols[j]) := my.fun(unlist(.SD, use.names = FALSE)), .SDcols = cols[j]]
}
}
}
DT1b <- copy(DT1a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]
DT2b <- copy(DT2a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]
DT3b <- copy(DT3a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]
my.fun.vecc <- Vectorize(my.fun) # alternative vectorized version of my.fun
DT1c <- copy(DT1a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]
DT2c <- copy(DT2a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]
DT3c <- copy(DT3a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]
f_loop(DT1a)
f_loop(DT2a)
f_loop(DT3a)
# check that the versions of the data.tables are all the same
identical(list(DT1a, DT2a, DT3a), list(DT1b, DT2b, DT3b))
#> [1] TRUE
identical(list(DT1a, DT2a, DT3a), list(DT1c, DT2c, DT3c))
#> [1] TRUE
Created on 2021-11-08 by the reprex package (v2.0.1)