How to add a column next to another consecutively without using loops in R?-CodePudding

I have this dataframe:

set.seed(42)  ## for sake of reproducibility
df <- data.frame("time"=c(1:40), "Var1"=sample(1:100, size=40), 
                 "Var2"=sample(1:100, size=40), "Var3"=sample(1:100, size=40))
head(df)
#   time Var1 Var2 Var3
# 1    1   49    3   38
# 2    2   65   21    1
# 3    3   25    2   13
# 4    4   74   58   78
# 5    5   18   10    5
# 6    6  100   40   73

As I want to get the quantiles per column, I have this code:

(quantiles <- as.data.frame(apply(df[2:4] , 2 , quantile, probs=seq(0, 1, 1/10), na.rm=TRUE)))
#       Var1 Var2  Var3
# 0%     2.0  2.0   1.0
# 10%    5.9  8.9  11.4
# 20%   19.6 17.6  15.8
# 30%   25.7 31.1  28.1
# 40%   35.2 41.2  35.8
# 50%   42.5 51.0  42.5
# 60%   53.2 57.4  56.4
# 70%   67.3 70.2  66.0
# 80%   80.8 80.4  78.6
# 90%   89.4 90.5  90.1
# 100% 100.0 99.0 100.0

My objective is to add into my original dataframe (df) a column with each of the quantiles per variable. In order to achieve that, I have this code:

df$QuantVar1 <- .bincode(x=df$Var1, breaks=quantiles$Var1, include.lowest=T, right=T)
df$QuantVar2 <- .bincode(x=df$Var2, breaks=quantiles$Var2, include.lowest=T, right=T)
df$QuantVar3 <- .bincode(x=df$Var3, breaks=quantiles$Var3, include.lowest=T, right=T)

head(df)
#   time Var1 Var2 Var3 QuantVar1 QuantVar2 QuantVar3
# 1    1   49    3   38         6         1         5
# 2    2   65   21    1         7         3         1
# 3    3   25    2   13         3         1         2
# 4    4   74   58   78         8         7         8
# 5    5   18   10    5         2         2         1
# 6    6  100   40   73        10         4         8

(Note that I use .bincode because I didn't have unique breaks and I found this solution).

As I want each new column next to the original variable, I relocate each of them manually:

library(dplyr); library(tidyft)

df <- df %>%
  relocate(QuantVar1, .after  = Var1)
df <- df %>%
  relocate(QuantVar2, .after  = Var2)
df <- df %>%
  relocate(QuantVar3, .after  = Var3)

head(df)
#   time Var1 QuantVar1 Var2 QuantVar2 Var3 QuantVar3
# 1    1   49         6    3         1   38         5
# 2    2   65         7   21         3    1         1
# 3    3   25         3    2         1   13         2
# 4    4   74         8   58         7   78         8
# 5    5   18         2   10         2    5         1
# 6    6  100        10   40         4   73         8

The code works perfectly. But... what if I have 100 variables or more? I cannot do the process 100 times or even more.

I want to avoid using loops and I have been trying to use the lapply family. I have already seen how to add new columns with lapply in this post but I don't know if there is a way to add the new column next to the column that it is using as I have in the above example.

Does anybody have an idea about how to do it?

CodePudding user response：

You are on the right track. You can do this with lapply:

cols_to_include  <- grep("^Var", names(df), value = TRUE) # "Var1" "Var2" "Var3"

new_names  <- paste0("Quant", cols_to_include) # "QuantVar1" "QuantVar2" "QuantVar3"

df[new_names]  <- lapply(
    cols_to_include, 
    \(col) {
        .bincode(
            x = df[[col]],
            breaks = quantiles[[col]],
            include.lowest = TRUE,
            right = TRUE
        )
    }
) 

head(df)
#   time Var1 Var2 Var3 QuantVar1 QuantVar2 QuantVar3
# 1    1   53   83   49         5         9         5
# 2    2   56   64   61         6         7         6
# 3    3   13   77   20         2         9         2
# 4    4  100   73    6        10         8         1
# 5    5   87   75   65         9         8         6
# 6    6   52    9   92         5         2         9

Note: output not identical to yours as we did not use a fixed seed but it should work.

Sorting the columns

In this case you can just order the columns by sorting on the digits in each column name:

new_order  <- order(gsub("\\D ", "", names(df)))

# Change order
df  <- df[new_order]

head(df)
#   time Var1 QuantVar1 Var2 QuantVar2 Var3 QuantVar3
# 1    1   53         5   83         9   49         5
# 2    2   56         6   64         7   61         6
# 3    3   13         2   77         9   20         2
# 4    4  100        10   73         8    6         1
# 5    5   87         9   75         8   65         6
# 6    6   52         5    9         2   92         9

CodePudding user response：

Use matrixStats::colQuantiles and cut them, and cbind the result.

qu <- t(matrixStats::colQuantiles(as.matrix(df[2:length(df)]), probs=seq(0, 1, 1/10))) |> as.data.frame()
res <- cbind(df, qu=mapply(cut, df[-1], breaks=qu, labels=list(1:10), include.lowest=TRUE))
head(res)
#   time Var1 Var2 Var3 qu.Var1 qu.Var2 qu.Var3
# 1    1   49    3   38       6       1       5
# 2    2   65   21    1       7       3       1
# 3    3   25    2   13       3       1       2
# 4    4   74   58   78       8       7       8
# 5    5   18   10    5       2       2       1
# 6    6  100   40   73      10       4       8

Data:

df <- structure(list(time = 1:40, Var1 = c(49L, 65L, 25L, 74L, 18L, 
100L, 47L, 24L, 71L, 89L, 37L, 20L, 26L, 3L, 41L, 27L, 36L, 5L, 
34L, 87L, 58L, 42L, 93L, 30L, 43L, 15L, 22L, 80L, 8L, 84L, 68L, 
96L, 4L, 50L, 95L, 88L, 67L, 6L, 63L, 2L), Var2 = c(3L, 21L, 
2L, 58L, 10L, 40L, 5L, 33L, 49L, 73L, 29L, 76L, 84L, 9L, 35L, 
16L, 69L, 98L, 82L, 24L, 18L, 88L, 55L, 95L, 99L, 57L, 42L, 80L, 
13L, 53L, 54L, 32L, 60L, 90L, 43L, 97L, 48L, 8L, 67L, 78L), Var3 = c(38L, 
1L, 13L, 78L, 5L, 73L, 55L, 16L, 90L, 43L, 42L, 92L, 57L, 29L, 
25L, 63L, 32L, 81L, 14L, 6L, 47L, 91L, 62L, 37L, 31L, 34L, 83L, 
100L, 74L, 15L, 75L, 89L, 60L, 12L, 26L, 41L, 99L, 2L, 56L, 24L
)), class = "data.frame", row.names = c(NA, -40L))

CodePudding user response：

A data.table approach - using a for-loop, nevertheless it should be faster than the corresponding lapply solution.

library(data.table)

DT <- data.frame(
    "time" = c(1:40),
    "Var1" = sample(1:100, size = 40),
    "Var2" = sample(1:100, size = 40),
    "Var3" = sample(1:100, size = 40)
  )

setDT(DT)

quantilesDT <- as.data.table(apply(df[2:4] , 2, quantile, probs = seq(0, 1, 1 / 10), na.rm = TRUE))

for (colname in names(quantilesDT)){
  set(
    DT,
    j = paste0(colname, "Quant"),
    value = .bincode(
      x = DT[[colname]],
      breaks = quantilesDT[[colname]] ,
      include.lowest = TRUE,
      right = TRUE
    )
  ) 
}

setcolorder(DT, sort(names(DT)))

print(DT)