Home > other >  Bad behavior of a function when lapplying in a data table
Bad behavior of a function when lapplying in a data table

Time:11-09

I wrote a function which gives me the expected result when I run it on a numeric or a list :

library(data.table)

my.fun <- function(X, k=0, chaine="") {
  Y = X - (X %/% 1e8) * (10**8)
  while (floor(Y / (37**k))) {
    k <- k   1
  }
  vloop <- seq(from = k-1, to=0, by=-1)
  for (i in vloop) {
    fixe <- floor(Y / (37**i))
    if (fixe>9) {
      if (fixe==36) { mon.car <- "" } else { mon.car <- intToUtf8(fixe 55) }
    } else { mon.car <- fixe }
    ext <- fixe*(37**i)
    Y <- Y-ext
    chaine <- stringr::str_c(chaine, mon.car)
  }
  chaine
}

my.fun(543916151)

foo <- list(543916151, 400001449)
lapply(foo, my.fun)

But when I want to use it on the columns of a data.table, I don't always get the expected result :

DT1 <- data.table(V1 = c(505926406, 515349272, 543916151),
                 V2 = c(505926406, 400000336, 400001449))

DT2 <- data.table(V1 = c(543916151),
                  V2 = c(400001449))

DT3 <- data.table(V1 = c(543916151, 543916151),
                  V2 = c(400001449, 400000336))

cols <- c("V1", "V2")
newcols <- c("C1", "C2")

DT1[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
DT2[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
DT3[, (newcols) := lapply(.SD, my.fun), .SDcols = cols]
  • DT1 : third row of C1 is incorrect
  • DT2 : with just one row in the data.table, it's correct.
  • DT3 : with two rows of the same values, "strange" behavior.

The lapply returns warnings : the condition has length > 1 and only the first element will be ...

I think I understand that the issue is due to the if that is not vectorized/vectorizable ? And that would be the reason why the function does not run correctly on columns ?

My knowledge is not deep enough to deal with this issue. So thank you for your help.

---EDIT---

Here is the warnings I get :

1: In while (floor(Y/(37^k))) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
2: In while (floor(Y/(37^k))) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
3: In while (floor(Y/(37^k))) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
4: In while (floor(Y/(37^k))) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
5: In while (floor(Y/(37^k))) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
6: In if (fixe > 9) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
7: In if (fixe > 9) { ... :
  la condition a une longueur > 1 et seul le premier élément est utilisé
...
...

CodePudding user response:

DT1[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]
DT2[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]
DT3[, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = c("V1", "V2")]

Above still goes wrong on cases where both V1 and V2 are equal in multiple rows. You could create id and remove it again after grouping

DT4[, id := .I][, (newcols) := lapply(.SD, my.fun), .SDcols = cols, by = id][, id := NULL]

CodePudding user response:

Vectorize my.fun, either by re-writing it or by using Vectorize:

library(data.table)

my.fun <- function(X, k=0, chaine="") {
  Y <- X - (X %/% 1e8) * (10**8)
  while (floor(Y / (37**k))) {
    k <- k   1L
  }
  vloop <- seq(from = k-1, to=0, by=-1)
  for (i in vloop) {
    fixe <- floor(Y / (37**i))
    if (fixe>9) {
      if (fixe==36) { mon.car <- "" } else { mon.car <- intToUtf8(fixe 55) }
    } else { mon.car <- fixe }
    ext <- fixe*(37**i)
    Y <- Y-ext
    chaine <- stringr::str_c(chaine, mon.car)
  }
  chaine
}

# vectorized version of my.fun
my.fun.vecb <- function(X, k = 0, chaine = character(length(X))) {
  Y <- X %% 1e8
  k <- pmax(ceiling(log(Y, 37)), k) - 1
  k37 <- 37^k
  
  mon.car <- function(fixe, n) {
    chr <- character(n)
    blnGT9 <- fixe > 9
    blnLetter <- blnGT9 & fixe != 36
    chr[blnLetter] <- intToUtf8(fixe[blnLetter]   55, multiple = TRUE)
    chr[!blnGT9] <- as.character(fixe[!blnGT9])
    return(chr)
  }
  
  blnk <- rep(TRUE, length(X))
  
  while (length(Y)) {
    k37 <- 37^k
    chaine[blnk] <- paste0(chaine[blnk], mon.car(Y %/% k37, length(Y)))
    blnk[blnk] <- k > 0
    k <- k[blnk] - 1
    Y <- Y[blnk] %% k37[blnk]
  }
  
  return(chaine)
}

DT1a <- data.table(V1 = c(505926406, 515349272, 543916151),
                  V2 = c(505926406, 400000336, 400001449))

DT2a <- data.table(V1 = c(543916151),
                  V2 = c(400001449))

DT3a <- data.table(V1 = c(543916151, 543916151),
                  V2 = c(400001449, 400000336))

cols <- c("V1", "V2")
newcols <- c("C1", "C2")

# function to call my.fun element-wise over a data.table
f_loop <- function(dt) {
  dt[, (newcols) := character(0)]
  for (i in 1:nrow(dt)) {
    for (j in seq_along(cols)){
      dt[i, (newcols[j]) := my.fun(unlist(.SD, use.names = FALSE)), .SDcols = cols[j]]
    }
  }
}

DT1b <- copy(DT1a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]
DT2b <- copy(DT2a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]
DT3b <- copy(DT3a)[, (newcols) := lapply(.SD, my.fun.vecb), .SDcols = cols]

my.fun.vecc <- Vectorize(my.fun) # alternative vectorized version of my.fun
DT1c <- copy(DT1a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]
DT2c <- copy(DT2a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]
DT3c <- copy(DT3a)[, (newcols) := lapply(.SD, my.fun.vecc), .SDcols = cols]

f_loop(DT1a)
f_loop(DT2a)
f_loop(DT3a)

# check that the versions of the data.tables are all the same
identical(list(DT1a, DT2a, DT3a), list(DT1b, DT2b, DT3b))
#> [1] TRUE
identical(list(DT1a, DT2a, DT3a), list(DT1c, DT2c, DT3c))
#> [1] TRUE

Created on 2021-11-08 by the reprex package (v2.0.1)
  • Related