This works for a toy example, but I think there must be a better way to do this (faster, less memory) for larger dataframes. Any suggestions appreciated!
library(tidyverse)
library(tictoc)
cyl <- tibble(integer_value = unique(mtcars$cyl),
as_a_string = paste(unique(mtcars$cyl), " cylinders"))%>%
mutate(variable = "cyl")
gear <- tibble(integer_value = unique(mtcars$gear),
as_a_string = paste(unique(mtcars$cyl), " gears"))%>%
mutate(variable = "gear")
carb <- tibble(integer_value = unique(mtcars$carb),
as_a_string = paste(unique(mtcars$carb)," carburetors"))%>%
mutate(variable = "carb")
vs <- tibble(integer_value = unique(mtcars$vs),
as_a_string = c("V shaped", "straight"))%>%
mutate(variable = "vs")
am <- tibble(integer_value = unique(mtcars$vs),
as_a_string = c("Automatic", "Manual"))%>%
mutate(variable = "am")
factor_info <- rbind(cyl,gear,carb,vs,am)%>%
select(variable,everything())
df <- mtcars
tic()
for(var in unique(factor_info$variable)){
col <- mtcars%>%
select(all_of(var))%>%
mutate(variable = all_of(var))%>%
rename(integer_value = all_of(var))
fac <- factor_info%>%
filter(variable == all_of(var))
df[[all_of(var)]] <- inner_join(col, fac)%>%
select(as_a_string)%>%
pull()
}
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
df <- df%>%
as_tibble() %>%
mutate(across(where(is.character), factor))
toc()
#> 0.172 sec elapsed
Created on 2022-02-25 by the reprex package (v2.0.1)
CodePudding user response:
I don't have enough rep to add a comment. Here is another variant to put under TarJae's answer. Using purrr::modify_if
:
library(tidyverse)
mtcars %>%
modify_if(is.numeric, as.factor) %>%
str()
#> 'data.frame': 32 obs. of 11 variables:
#> $ mpg : Factor w/ 25 levels "10.4","13.3",..: 16 16 19 17 13 12 3 20 19 14 ...
#> $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#> $ disp: Factor w/ 27 levels "71.1","75.7",..: 13 13 6 16 23 15 23 12 10 14 ...
#> $ hp : Factor w/ 22 levels "52","62","65",..: 11 11 6 11 15 9 20 2 7 13 ...
#> $ drat: Factor w/ 22 levels "2.76","2.93",..: 16 16 15 5 6 1 7 11 17 17 ...
#> $ wt : Factor w/ 29 levels "1.513","1.615",..: 9 12 7 16 18 19 21 15 13 18 ...
#> $ qsec: Factor w/ 30 levels "14.5","14.6",..: 6 10 22 24 10 29 5 27 30 19 ...
#> $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#> $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#> $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#> $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...
Created on 2022-02-26 by the reprex package (v2.0.1)
CodePudding user response:
as.factor(x)
is faster and more efficient than factor(x)
when x
is of type integer
and length(x)
is large. The categorical variables in mtcars
are integer-valued but stored as double
:
nms <- c("cyl", "vs", "am", "gear", "carb")
vapply(mtcars[nms], typeof, "")
## cyl vs am gear carb
## "double" "double" "double" "double" "double"
In this situation, you can coerce to factor efficiently with
dd <- mtcars
dd[nms] <- lapply(dd[nms], function(x) as.factor(as.integer(x)))
then rename the levels, e.g., with
lvs <- list(cyl = paste(levels(dd$cyl), "cylinders"),
vs = c("V-shaped", "straight"),
am = c("automatic", "manual"),
gear = paste(levels(dd$gear), "gears"),
carb = paste(levels(dd$carb), "carburetors"))
dd[nms] <- Map(`levels<-`, dd[nms], lvs[nms])
head(dd[nms])
## cyl vs am gear carb
## Mazda RX4 6 cylinders V-shaped manual 4 gears 4 carburetors
## Mazda RX4 Wag 6 cylinders V-shaped manual 4 gears 4 carburetors
## Datsun 710 4 cylinders straight manual 4 gears 1 carburetors
## Hornet 4 Drive 6 cylinders straight automatic 3 gears 1 carburetors
## Hornet Sportabout 8 cylinders V-shaped automatic 3 gears 2 carburetors
## Valiant 6 cylinders straight automatic 3 gears 1 carburetors
head(mtcars[nms])
## cyl vs am gear carb
## Mazda RX4 6 0 1 4 4
## Mazda RX4 Wag 6 0 1 4 4
## Datsun 710 4 1 1 4 1
## Hornet 4 Drive 6 1 0 3 1
## Hornet Sportabout 8 0 0 3 2
## Valiant 6 1 0 3 1
FWIW, this entire operation takes less than a millisecond on my machine.
## Unit: microseconds
## expr min lq mean median uq max neval
## expr 232.429 251.33 289.4895 259.7555 279.5995 2311.457 100
CodePudding user response:
New answer deleted the first one. I think you need fct_relabel
from forcats
package: elapsed 0.04 sec:
library(forcats)
library(dplyr)
library(tictoc)
tic()
mtcars %>%
mutate(cyl = fct_relabel(as.factor(cyl), ~paste(unique(cyl), as.character("cylinder"))),
vs = fct_relabel(as.factor(vs), ~paste(unique(vs), as.character(c("V shaped", "straight")))),
am = fct_relabel(as.factor(am), ~paste(unique(am), as.character(c("Automatic", "Manual")))),
gear = fct_relabel(as.factor(gear), ~paste(unique(gear), as.character("gears"))),
carb = fct_relabel(as.factor(carb), ~paste(unique(carb), as.character("carburetors"))))
toc()
output:
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <fct> <fct> <fct>
1 21 4 cylinder 160 110 3.9 2.62 16.5 0 V shaped 0 Manual 3 gears 3 carburetors
2 21 4 cylinder 160 110 3.9 2.88 17.0 0 V shaped 0 Manual 3 gears 3 carburetors
3 22.8 6 cylinder 108 93 3.85 2.32 18.6 1 straight 0 Manual 3 gears 4 carburetors
4 21.4 4 cylinder 258 110 3.08 3.22 19.4 1 straight 1 Automatic 4 gears 4 carburetors
5 18.7 8 cylinder 360 175 3.15 3.44 17.0 0 V shaped 1 Automatic 4 gears 1 carburetors
6 18.1 4 cylinder 225 105 2.76 3.46 20.2 1 straight 1 Automatic 4 gears 4 carburetors
7 14.3 8 cylinder 360 245 3.21 3.57 15.8 0 V shaped 1 Automatic 4 gears 3 carburetors
8 24.4 6 cylinder 147. 62 3.69 3.19 20 1 straight 1 Automatic 3 gears 1 carburetors
9 22.8 6 cylinder 141. 95 3.92 3.15 22.9 1 straight 1 Automatic 3 gears 1 carburetors
10 19.2 4 cylinder 168. 123 3.92 3.44 18.3 1 straight 1 Automatic 3 gears 3 carburetors
# ... with 22 more rows
> toc()
0.04 sec elapsed