I need convert all values that are "-" in my data frame to 0 however the "-" may have leading or trailing spaces so I'm using:
DF_To_Process[ trimws(DF_To_Process) == "-"] = 0
it seems to do the job but it's very slow plus I have repeat for several different instances such as "." and "N/A". Is there a faster way?
CodePudding user response:
Does your code work on the whole dataframe? Or only on one column at a time? If you're only looking at a single column, this approach may be a bit faster:
library(microbenchmark)
library(ggplot2)
# generate some 'test' data (700k rows, 1 col)
df <- data.frame(column = rep(c("fine", " - ", " -", "- ", ".", "N/A", "fine"), 1e5))
# Test current method:
df2 <- df
df2[trimws(df2) == "-",] = 0
head(df2, 9)
#> column
#> 1 fine
#> 2 -
#> 3 -
#> 4 -
#> 5 .
#> 6 N/A
#> 7 fine
#> 8 fine
#> 9 -
# This doesn't appear to have worked...
# try on one column at a time
trimws_func <- function(df) {
df2 <- df
df2$column[trimws(df2$column) == "-"] = 0
df2$column[trimws(df2$column) == "."] = 0
df2$column[trimws(df2$column) == "N/A"] = 0
return(df2)
}
ifelse_func <- function(df) {
df2 <- df
df2$column <- ifelse(grepl("\\s*-\\s*|\\s*\\.\\s*|\\s*N/A\\s*",
df$column, perl = TRUE),
0, df2$column)
return(df2)
}
head(trimws_func(df))
#> column
#> 1 fine
#> 2 0
#> 3 0
#> 4 0
#> 5 0
#> 6 0
head(ifelse_func(df))
#> column
#> 1 fine
#> 2 0
#> 3 0
#> 4 0
#> 5 0
#> 6 0
res <- microbenchmark(trimws_func(df), ifelse_func(df), times = 8)
res
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> trimws_func(df) 853.3805 878.1754 889.3538 887.9099 891.2959 946.6870 8
#> ifelse_func(df) 296.7631 308.3129 328.5878 318.1445 350.8522 377.3202 8
#> cld
#> b
#> a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Created on 2022-10-06 by the reprex package (v2.0.1)