R - Convert character columns with $ and % signs into numeric-CodePudding

I have a dataframe df with multiple columns of which I want to clean up some pricing columns. The dataframe looks as follows:

Col1(char)  Col2(char)     Col3(char)     Col4(char)
CST         $ 128,412.00   $ 0.034         149.628%
FSD         $ 138,232.40   $ 0.023         124.244%
SDD         $ 112,234.45   $ 0.023        -123.324%

However, I would like the output to look as follows:

Col1(char)  Col2(num)   Col3(num)  Col4(num)
CST         128412.00   0.034      1.49628
FSD         138232.40   0.023      1.24244
SDD         112234.45   0.023      -1.23324

How do I transform Col2 - Col4 into numeric columns as elegantly as possible? Thank you!

CodePudding user response：

dat <- structure(list(Col1 = c("CST", "FSD", "SDD"), Col2 = c("$ 128,412.00", 
"$ 138,232.40", "$ 112,234.45"), Col3 = c("$ 0.034", "$ 0.023", 
"$ 0.023"), Col4 = c(" 149.628%", " 124.244%", "-123.324%")),
 class = "data.frame", row.names = c(NA, -3L))
#  Col1         Col2    Col3      Col4
#1  CST $ 128,412.00 $ 0.034  149.628%
#2  FSD $ 138,232.40 $ 0.023  124.244%
#3  SDD $ 112,234.45 $ 0.023 -123.324%

To convert all columns but column 1 to numeric, you can do

tonum <- function (x) {
  ## delete "$", "," and "%" and convert string to numeric
  num <- as.numeric(gsub("[$,%]", "", x))
  ## watch out for "%", that is, 90% should be 90 / 100 = 0.9
  if (grepl("%", x[1])) num <- num / 100
  ## return
  num
}

dat[-1] <- lapply(dat[-1], tonum)
dat
#  Col1     Col2  Col3     Col4
#1  CST 128412.0 0.034  1.49628
#2  FSD 138232.4 0.023  1.24244
#3  SDD 112234.4 0.023 -1.23324

Remark:

I just learned readr::parse_number() from PaulS's answer. It is an interesting function. Basically it removes everything that can not be a valid part of a number. As a practice, I implement the same logic using REGEX. So here is a general-purpose tonum().

tonum <- function (x, regex = TRUE) {
  ## drop everything that is not " /-", "0-9" or "."
  ## then convert string to numeric
  if (regex) {
    num <- as.numeric(stringr::str_remove_all(x, "[^ \\-0-9\\.]*"))
  } else {
    num <- readr::parse_number(x)
  }
  ## watch out for "%", that is, 90% should be 90 / 100 = 0.9
  ind <- grepl("%", x)
  num[ind] <- num[ind] / 100
  ## return
  num
}

Here is a quick test:

x <- unlist(dat[-1], use.names = FALSE)
x <- c(x, "euro 300.95", "RMB 888.66", "£1999.98")
# [1] "$ 128,412.00" "$ 138,232.40" "$ 112,234.45" "$ 0.034"      "$ 0.023"     
# [6] "$ 0.023"      " 149.628%"    " 124.244%"    "-123.324%"    "euro 300.95" 
#[11] "RMB 888.66"   "£1999.98"  

tonum(x, regex = TRUE)
# [1] 128412.00000 138232.40000 112234.45000      0.03400      0.02300
# [6]      0.02300      1.49628      1.24244     -1.23324    300.95000
#[11]    888.66000   1999.98000

tonum(x, regex = FALSE)
# [1] 128412.00000 138232.40000 112234.45000      0.03400      0.02300
# [6]      0.02300      1.49628      1.24244     -1.23324    300.95000
#[11]    888.66000   1999.98000

CodePudding user response：

Another possible solution, based on readr::parse_number (using @ ZheyuanLi's data, to whom I thank):

library(tidyverse)

dat %>%
  mutate(across(-1, ~ parse_number(.x)),
         Col4 = Col4 / 100)

#>   Col1     Col2  Col3     Col4
#> 1  CST 128412.0 0.034  1.49628
#> 2  FSD 138232.4 0.023  1.24244
#> 3  SDD 112234.4 0.023 -1.23324

CodePudding user response：

Another approach using tidyverse

library(dplyr)
library(stringr)

# generating Col5, Col6 same as Col4, just for demo
dat <- data.frame(
  stringsAsFactors = FALSE,
  Col1 = c("CST", "FSD", "SDD"),
  Col2 = c("$ 128,412.00", "$ 138,232.40", "$ 112,234.45"),
  Col3 = c("$ 0.034", "$ 0.023", "$ 0.023"),
  Col4 = c(" 149.628%", " 124.244%", "-123.324%"),
  Col5 = c(" 149.628%", " 124.244%", "-123.324%"),
  Col6 = c(" 149.628%", " 124.244%", "-123.324%")
)


dat %>% 
  mutate(
    across(Col2:Col6,  ~ as.numeric(str_remove_all(.x, pattern = "[$,  %]"))),
    across(Col4:Col6, ~ .x/100)
  )
#>   Col1     Col2  Col3     Col4     Col5     Col6
#> 1  CST 128412.0 0.034  1.49628  1.49628  1.49628
#> 2  FSD 138232.4 0.023  1.24244  1.24244  1.24244
#> 3  SDD 112234.4 0.023 -1.23324 -1.23324 -1.23324

^{Created on 2022-07-12 by the reprex package (v2.0.1)}