Home > Enterprise >  ways to remove N/As from the columns in R
ways to remove N/As from the columns in R

Time:07-01

I am trying to convert the character column values to numeric, so I can divide one column by another later on. I get N/A values. I figured it might be because of commas. I tried to use the following code:

# Reproducible example: numbers stored as text with "," as thousands separator.
col1 <- c("L1","L2","L3","L4","L5" )
col2 <- c("910", "458", "34,613" , "201" , "1,886")
col3 <- c("87,282","41,304", "5,146,982", "348,520", "27,274")
df <- data.frame(col1, col2, col3, stringsAsFactors = FALSE)
df$col2 <-as.factor(df$col2)
df$col3 <-as.factor(df$col3)
# Convert factor -> character -> numeric. as.numeric() cannot parse strings
# that still contain commas, so every such value becomes NA at this point.
df[,'col2'] <- as.numeric(as.character(df[,'col2']))
#> Warning: NAs introduced by coercion
df[,'col3'] <- as.numeric(as.character(df[,'col3']))
#> Warning: NAs introduced by coercion
# Try to get rid of commas. This runs too late: df$col3 was already overwritten
# with NA above, and the gsub() result is not assigned back to the data frame.
gsub(",", "", df$col3)
#> [1] NA NA NA NA NA
df$new <- df$col3/df$col2
Created on 2022-06-30 by the reprex package (v2.0.1)

I also tried:

# col2 is coerced before its commas are removed, so comma-containing values
# turn into NA here.
df[,'col2'] <- as.numeric(as.character(df[,'col2']))
#> Warning: NAs introduced by coercion
# NOTE(review): the all-NA output suggests df$col3 had already been coerced to
# NA by an earlier as.numeric() call - confirm. The result is also not assigned.
as.numeric(gsub(",", "", df$col3))
#> [1] NA NA NA NA NA

<sup>Created on 2022-06-30 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)</sup>

I also tried this way, which does not produce N/As, but still has commas:

 # Declare a virtual class plus a character -> "num.with.commas" coercion that
 # strips commas before calling as.numeric().
 setClass("num.with.commas")
setAs("character", "num.with.commas", 
      function(from) as.numeric(gsub(",", "", from) ) )
# This only creates a variable named colClasses; nothing in this snippet passes
# it to a reader, so the coercion above is never applied to df.
# NOTE(review): presumably meant for read.csv(..., colClasses = colClasses) - confirm.
colClasses=c('num.with.commas','factor','character','numeric','num.with.commas')
#it does not remove commas, but it has no N/As

Created on 2022-06-30 by the reprex package (v2.0.1)

And the last effort which produced only errors:

 # NOTE(review): `dft` is not defined in any snippet shown here (the data frame
 # is called `df` elsewhere) - confirm whether this is a typo.
 # NOTE(review): mutate_all() targets every column; the trailing col2, col3 are
 # not a column selection here, and funs() is deprecated - verify against the
 # dplyr documentation.
 dft %>%
  mutate_all(funs(as.character(.)), col2, col3) %>%
  mutate_all(funs(gsub(",", "", .)), col2, col3) %>%
  mutate_all(funs(as.numeric(.)), col2, col3)
#> Error in dft %>% mutate_all(funs(as.character(.)), col2, col3) %>% mutate_all(funs(gsub(

CodePudding user response:

You could use parse_number specifying the relevant columns in across (which could also be negated, e.g. -col1):

(%>% may be used as an alternative to |>.)

library(tidyverse)

col1 <- c("L1","L2","L3","L4","L5" )
col2 <- c("910", "458", "34,613" , "201" , "1,886")
col3 <- c("87,282","41,304", "5,146,982", "348,520", "27,274")
df <- data.frame(col1, col2, col3, stringsAsFactors = FALSE)


# Apply parse_number() to col2 and col3 only; as the output below shows, the
# comma-separated strings come back as plain numbers.
df |> 
  mutate(across(c(col2, col3), parse_number))
#>   col1  col2    col3
#> 1   L1   910   87282
#> 2   L2   458   41304
#> 3   L3 34613 5146982
#> 4   L4   201  348520
#> 5   L5  1886   27274

Created on 2022-06-30 by the reprex package (v2.0.1)

CodePudding user response:

A possible solution:

library(tidyverse)

col1 <- c("L1","L2","L3","L4","L5" )
col2 <- c("910", "458", "34,613" , "201" , "1,886")
col3 <- c("87,282","41,304", "5,146,982", "348,520", "27,274")
df <- data.frame(col1, col2, col3)

# Strip every comma from all columns except the first, then let type.convert()
# choose the column types. str_remove_all() is required here: str_remove()
# deletes only the first match, which would leave "5146,982" unconverted.
df %>%
  mutate(across(-1, ~ str_remove_all(.x, ","))) %>%
  type.convert(as.is = TRUE)
#>   col1  col2    col3
#> 1   L1   910   87282
#> 2   L2   458   41304
#> 3   L3 34613 5146982
#> 4   L4   201  348520
#> 5   L5  1886   27274

Or in base R:

# Remove every literal comma from each column, then let type.convert() pick the
# types; as.is = TRUE keeps non-numeric columns as character, not factor.
data.frame(
  type.convert(lapply(df, \(x) gsub(",", "", x, fixed = TRUE)), as.is = TRUE)
)

CodePudding user response:

Another dplyr way: here we remove all punctuation characters and wrap the result in as.numeric:

library(dplyr)

# Convert every column except col1: drop all punctuation, then as.numeric().
# Caveat: [[:punct:]] also matches "." and "-", so decimal or negative values
# would be altered; this is only safe for unsigned whole numbers.
df %>% 
  mutate(across(-col1, ~as.numeric(gsub("[[:punct:]]", "", .))))
  col1  col2    col3
1   L1   910   87282
2   L2   458   41304
3   L3 34613 5146982
4   L4   201  348520
5   L5  1886   27274

CodePudding user response:

You were close with the gsub. Try this small function,

# Convert number-like text (character or factor) to numeric after dropping
# every comma, dollar sign and apostrophe.
clean_num <- function(x) {
  # gsub() coerces its input to character, so factors are handled as well.
  stripped <- gsub('[,\\$\']', '', x)
  as.numeric(stripped)
}

It removes every character you list inside the brackets (special characters need to be escaped).

clean_num(df$col2)
# [1]   910   458 34613   201  1886

To use it on defined columns of a data frame, do

# Names of the columns to convert.
cols <- c('col2', 'col3')
# lapply() returns one cleaned vector per selected column; assigning back with
# df[cols] replaces just those columns and leaves the others untouched.
df[cols] <- lapply(df[cols], clean_num)
df
#   col1  col2    col3
# 1   L1   910   87282
# 2   L2   458   41304
# 3   L3 34613 5146982
# 4   L4   201  348520
# 5   L5  1886   27274

Data:

# Example data: col1 is character; col2 and col3 are factors whose levels are
# comma-separated number strings (col2 also has a "$910" level).
df <- structure(list(col1 = c("L1", "L2", "L3", "L4", "L5"), col2 = structure(c(1L, 
5L, 4L, 3L, 2L), levels = c("$910", "1,886", "201", "34,613", 
"458"), class = "factor"), col3 = structure(c(5L, 3L, 4L, 2L, 
1L), levels = c("27,274", "348,520", "41,304", "5,146,982", "87,282"
), class = "factor")), row.names = c(NA, -5L), class = "data.frame")
  • Related