Home > database >  How can I convert a character variable with greater than 3 digits to numeric in r?
How can I convert a character variable with greater than 3 digits to numeric in r?

Time:09-10

I would like to convert variable v1 from character to numeric. The values in v1 are numbers.

I tried:

v1 <- as.numeric(v1)

This changes the variable to numeric but changes all values with >3 digits to NA.

> dput(dat)
structure(list(X = c("Baldwin.County", "Banks.County", "Barrow.County", 
"Bibb.County", "Butts.County", "Clarke.County", "Columbia.County", 
"Dawson.County", "DeKalb.County", "Elbert.County", "Forsyth.County", 
"Franklin.County", "Glascock.County", "Greene.County", 
"Gwinnett.County", 
"Habersham.County", "Hall.County", "Hancock.County", "Hart.County", 
"Henry.County", "Jackson.County", "Jasper.County", "Jefferson.County", 
"Jones.County", "Lamar.County", "Lincoln.County", "Lumpkin.County", 
"McDuffie.County", "Madison.County", "Monroe.County", "Morgan.County", 
"Newton.County", "Oconee.County", "Oglethorpe.County", "Putnam.County", 
"Rabun.County", "Richmond.County", "Rockdale.County", "Spalding.County", 
"Stephens.County", "Taliaferro.County", "Towns.County", "Union.County", 
"Walton.County", "Warren.County", "Washington.County", "White.County", 
"Wilkes.County", "Wilkinson.County"), total = c("11,936", "333", 
"6,285", "50,801", "4,767", "21,606", "17,549", "117", "270,370", 
"3,719", "5,508", "1,138", "185", "3,913", "159,130", "910", 
"9,417", "4,687", "3,579", "65,739", "3,400", "2,037", "5,398", 
"4,981", "3,210", "1,698", "366", "5,394", "1,757", "4,124", 
"3,117", "30,641", "1,312", "1,799", "3,864", "252", "72,566", 
"32,136", "14,073", "1,840", "742", "108", "155", "10,220", "2,221", 
"7,293", "605", "2,801", "2,358"), malet = c("5,996", "166", 
"2,957", "22,113", "2,889", "9,160", "8,268", "105", "118,932", 
"1,688", "2,536", "511", " 54", "1,661", "71,095", "255", "4,410", 
"2,605", "1,728", "28,442", "1,810", "960", "2,378", "2,358", 
"1,426", "709", "178", "2,358", "916", "1,928", "1,325", "13,197", 
"684", "820", "1,830", "209", "32,360", "13,739", "6,127", "852", 
"358", " 41", "123", "4,545", "1,031", "3,528", "157", "1,255", 
"1,089"), m1 = c("1,164", " 63", "476", "4,144", "1,050", "2,017", 
"520", " 29", "13,043", "382", "130", "63", " 41", "365", "4,129", 
" 35", "820", "1,134", "293", "2,430", "351", "215", "470", "180", 
"156", "188", " 28", "630", "249", "606", "301", "1,681", "123", 
"216", "206", " 49", "5,876", "1,012", "1,358", "53", " 97", 
"  0", " 29", "954", "377", "896", " 48", "283", "94")), class = 
"data.frame", row.names = c(NA, -49L))

CodePudding user response:

You can do this for all variables that contain numbers (but are not of type numeric) thus:

library(dplyr)
library(readr)
dat %>%
  mutate(across(where( ~ any(str_detect(.,","))), ~ parse_number(.)))
                   X  total  malet    m1
1     Baldwin.County  11936   5996  1164
2       Banks.County    333    166    63
3      Barrow.County   6285   2957   476
4        Bibb.County  50801  22113  4144
5       Butts.County   4767   2889  1050
6      Clarke.County  21606   9160  2017
7    Columbia.County  17549   8268   520
8      Dawson.County    117    105    29
9      DeKalb.County 270370 118932 13043
10     Elbert.County   3719   1688   382
11    Forsyth.County   5508   2536   130
12   Franklin.County   1138    511    63
13   Glascock.County    185     54    41
14     Greene.County   3913   1661   365
15   Gwinnett.County 159130  71095  4129
16  Habersham.County    910    255    35
17       Hall.County   9417   4410   820
18    Hancock.County   4687   2605  1134
19       Hart.County   3579   1728   293
20      Henry.County  65739  28442  2430
21    Jackson.County   3400   1810   351
22     Jasper.County   2037    960   215
23  Jefferson.County   5398   2378   470
24      Jones.County   4981   2358   180
25      Lamar.County   3210   1426   156
26    Lincoln.County   1698    709   188
27    Lumpkin.County    366    178    28
28   McDuffie.County   5394   2358   630
29    Madison.County   1757    916   249
30     Monroe.County   4124   1928   606
31     Morgan.County   3117   1325   301
32     Newton.County  30641  13197  1681
33     Oconee.County   1312    684   123
34 Oglethorpe.County   1799    820   216
35     Putnam.County   3864   1830   206
36      Rabun.County    252    209    49
37   Richmond.County  72566  32360  5876
38   Rockdale.County  32136  13739  1012
39   Spalding.County  14073   6127  1358
40   Stephens.County   1840    852    53
41 Taliaferro.County    742    358    97
42      Towns.County    108     41     0
43      Union.County    155    123    29
44     Walton.County  10220   4545   954
45     Warren.County   2221   1031   377
46 Washington.County   7293   3528   896
47      White.County    605    157    48
48     Wilkes.County   2801   1255   283
49  Wilkinson.County   2358   1089    94

Alternatively, you can first remove the comma and then explicitly convert to numeric, thus:

dat %>%
  mutate(across(where( ~ any(str_detect(.,","))), ~ as.numeric(sub(",", "", .))))

If you add %>% str() to the pipe you will see that all three variables that contain numbers have been converted to numeric.

If you want to convert to numeric just one particular column:

dat %>%
  mutate(m1 = parse_number(m1))

or:

dat %>%
  mutate(m1 = as.numeric(sub(",", "", m1)))
  • Related