I have this dataframe:
set.seed(42) ## for sake of reproducibility
df <- data.frame("time"=c(1:40), "Var1"=sample(1:100, size=40),
"Var2"=sample(1:100, size=40), "Var3"=sample(1:100, size=40))
head(df)
# time Var1 Var2 Var3
# 1 1 49 3 38
# 2 2 65 21 1
# 3 3 25 2 13
# 4 4 74 58 78
# 5 5 18 10 5
# 6 6 100 40 73
As I want to get the quantiles per column, I have this code:
(quantiles <- as.data.frame(apply(df[2:4] , 2 , quantile, probs=seq(0, 1, 1/10), na.rm=TRUE)))
# Var1 Var2 Var3
# 0% 2.0 2.0 1.0
# 10% 5.9 8.9 11.4
# 20% 19.6 17.6 15.8
# 30% 25.7 31.1 28.1
# 40% 35.2 41.2 35.8
# 50% 42.5 51.0 42.5
# 60% 53.2 57.4 56.4
# 70% 67.3 70.2 66.0
# 80% 80.8 80.4 78.6
# 90% 89.4 90.5 90.1
# 100% 100.0 99.0 100.0
My objective is to add into my original dataframe (df
) a column with each of the quantiles per variable. In order to achieve that, I have this code:
df$QuantVar1 <- .bincode(x=df$Var1, breaks=quantiles$Var1, include.lowest=T, right=T)
df$QuantVar2 <- .bincode(x=df$Var2, breaks=quantiles$Var2, include.lowest=T, right=T)
df$QuantVar3 <- .bincode(x=df$Var3, breaks=quantiles$Var3, include.lowest=T, right=T)
head(df)
# time Var1 Var2 Var3 QuantVar1 QuantVar2 QuantVar3
# 1 1 49 3 38 6 1 5
# 2 2 65 21 1 7 3 1
# 3 3 25 2 13 3 1 2
# 4 4 74 58 78 8 7 8
# 5 5 18 10 5 2 2 1
# 6 6 100 40 73 10 4 8
(Note that I use .bincode
because I didn't have unique breaks and I found this solution).
As I want each new column next to the original variable, I relocate each of them manually:
library(dplyr); library(tidyft)
df <- df %>%
relocate(QuantVar1, .after = Var1)
df <- df %>%
relocate(QuantVar2, .after = Var2)
df <- df %>%
relocate(QuantVar3, .after = Var3)
head(df)
# time Var1 QuantVar1 Var2 QuantVar2 Var3 QuantVar3
# 1 1 49 6 3 1 38 5
# 2 2 65 7 21 3 1 1
# 3 3 25 3 2 1 13 2
# 4 4 74 8 58 7 78 8
# 5 5 18 2 10 2 5 1
# 6 6 100 10 40 4 73 8
The code works perfectly. But... what if I have 100 variables or more? I cannot do the process 100 times or even more.
I want to avoid using loops and I have been trying to use the lapply
family.
I have already seen how to add new columns with lapply
in this post but I don't know if there is a way to add the new column next to the column that it is using as I have in the above example.
Does anybody have an idea about how to do it?
CodePudding user response:
You are on the right track. You can do this with lapply
:
cols_to_include <- grep("^Var", names(df), value = TRUE) # "Var1" "Var2" "Var3"
new_names <- paste0("Quant", cols_to_include) # "QuantVar1" "QuantVar2" "QuantVar3"
df[new_names] <- lapply(
cols_to_include,
\(col) {
.bincode(
x = df[[col]],
breaks = quantiles[[col]],
include.lowest = TRUE,
right = TRUE
)
}
)
head(df)
# time Var1 Var2 Var3 QuantVar1 QuantVar2 QuantVar3
# 1 1 53 83 49 5 9 5
# 2 2 56 64 61 6 7 6
# 3 3 13 77 20 2 9 2
# 4 4 100 73 6 10 8 1
# 5 5 87 75 65 9 8 6
# 6 6 52 9 92 5 2 9
Note: output not identical to yours as we did not use a fixed seed but it should work.
Sorting the columns
In this case you can just order the columns by sorting on the digits in each column name:
new_order <- order(gsub("\\D ", "", names(df)))
# Change order
df <- df[new_order]
head(df)
# time Var1 QuantVar1 Var2 QuantVar2 Var3 QuantVar3
# 1 1 53 5 83 9 49 5
# 2 2 56 6 64 7 61 6
# 3 3 13 2 77 9 20 2
# 4 4 100 10 73 8 6 1
# 5 5 87 9 75 8 65 6
# 6 6 52 5 9 2 92 9
CodePudding user response:
Use matrixStats::colQuantiles
and cut
them, and cbind
the result.
qu <- t(matrixStats::colQuantiles(as.matrix(df[2:length(df)]), probs=seq(0, 1, 1/10))) |> as.data.frame()
res <- cbind(df, qu=mapply(cut, df[-1], breaks=qu, labels=list(1:10), include.lowest=TRUE))
head(res)
# time Var1 Var2 Var3 qu.Var1 qu.Var2 qu.Var3
# 1 1 49 3 38 6 1 5
# 2 2 65 21 1 7 3 1
# 3 3 25 2 13 3 1 2
# 4 4 74 58 78 8 7 8
# 5 5 18 10 5 2 2 1
# 6 6 100 40 73 10 4 8
Data:
df <- structure(list(time = 1:40, Var1 = c(49L, 65L, 25L, 74L, 18L,
100L, 47L, 24L, 71L, 89L, 37L, 20L, 26L, 3L, 41L, 27L, 36L, 5L,
34L, 87L, 58L, 42L, 93L, 30L, 43L, 15L, 22L, 80L, 8L, 84L, 68L,
96L, 4L, 50L, 95L, 88L, 67L, 6L, 63L, 2L), Var2 = c(3L, 21L,
2L, 58L, 10L, 40L, 5L, 33L, 49L, 73L, 29L, 76L, 84L, 9L, 35L,
16L, 69L, 98L, 82L, 24L, 18L, 88L, 55L, 95L, 99L, 57L, 42L, 80L,
13L, 53L, 54L, 32L, 60L, 90L, 43L, 97L, 48L, 8L, 67L, 78L), Var3 = c(38L,
1L, 13L, 78L, 5L, 73L, 55L, 16L, 90L, 43L, 42L, 92L, 57L, 29L,
25L, 63L, 32L, 81L, 14L, 6L, 47L, 91L, 62L, 37L, 31L, 34L, 83L,
100L, 74L, 15L, 75L, 89L, 60L, 12L, 26L, 41L, 99L, 2L, 56L, 24L
)), class = "data.frame", row.names = c(NA, -40L))
CodePudding user response:
A data.table approach - using a for-loop, nevertheless it should be faster than the corresponding lapply solution.
library(data.table)
DT <- data.frame(
"time" = c(1:40),
"Var1" = sample(1:100, size = 40),
"Var2" = sample(1:100, size = 40),
"Var3" = sample(1:100, size = 40)
)
setDT(DT)
quantilesDT <- as.data.table(apply(df[2:4] , 2, quantile, probs = seq(0, 1, 1 / 10), na.rm = TRUE))
for (colname in names(quantilesDT)){
set(
DT,
j = paste0(colname, "Quant"),
value = .bincode(
x = DT[[colname]],
breaks = quantilesDT[[colname]] ,
include.lowest = TRUE,
right = TRUE
)
)
}
setcolorder(DT, sort(names(DT)))
print(DT)