Home > Back-end >  Turn multiple char columns with percentage sign and comma decimal mark into numerics
Turn multiple char columns with percentage sign and comma decimal mark into numerics

Time:09-26

I have some variables with are percentage values of their neighbour columns. I would like to turn them into numerics. I found some function to detect them either by column name or the percentage sign and managed to mutate single column with parse_number. How can I extend or simplify this to all columns with percentage values?

data <- data.frame(
             stringsAsFactors = FALSE,
                  check.names = FALSE,
                    row.names = c("2", "3", "4", "5", "6", "7"),
                         Year = c("1992",
                                  "1993","1994","1995","1996","1997"),
                       gesamt = c("472", "997", "1443", "1810", "2321", "2835"),
                     weiblich = c("236", "546", "724", "949", "1242", "1584"),
         `weiblich inProzent` = c("50,0%",
                                  "54,8%","50,2%","52,4%","53,5%","55,9%"),
                     Deutsche = c("325", "598", "841", "1030", "1348", "1662"),
         `Deutsche inProzent` = c("68,9%",
                                  "60,0%","58,3%","56,9%","58,1%","58,6%"),
      `Ausländer/innengesamt` = c("169", "399", "602", "780", "973", "1173"),
  `Ausländer/innenin Prozent` = c("35,8%",
                                  "40,0%","41,7%","43,1%","41,9%","41,4%"),
                   davonPolen = c("167", "384", "566", "731", "883", "1053"),
            `Polenin Prozent` = c("35,4%",
                                  "38,5%","39,2%","40,4%","38,0%","37,1%")
)

data %>% 
  select(contains("Prozent")) %>% 
  mutate(fem_pct = parse_number(`weiblich inProzent`))

CodePudding user response:

df %>%  
  mutate(across(everything(), ~ ifelse(str_detect(.x, "%"),
                                       parse_number(.x) / 10, 
                                       .x)))

# A tibble: 6 x 10
  Year  gesamt weiblich `weiblich inProzent` Deutsche Deutsch~1 Auslä~2 Auslä~3 davon~4 Polen~5
  <chr> <chr>  <chr>                   <dbl> <chr>        <dbl> <chr>     <dbl> <chr>     <dbl>
1 1992  472    236                      50   325           68.9 169        35.8 167        35.4
2 1993  997    546                      54.8 598           60   399        40   384        38.5
3 1994  1443   724                      50.2 841           58.3 602        41.7 566        39.2
4 1995  1810   949                      52.4 1030          56.9 780        43.1 731        40.4
5 1996  2321   1242                     53.5 1348          58.1 973        41.9 883        38  
6 1997  2835   1584                     55.9 1662          58.6 1173       41.4 1053       37.1
# ... with abbreviated variable names 1: `Deutsche inProzent`, 2: `Ausländer/innengesamt`,
#   3: `Ausländer/innenin Prozent`, 4: davonPolen, 5: `Polenin Prozent`

Or if you want only parse_number

df %>%  
  mutate(across(everything(), parse_number))

# A tibble: 6 x 10
   Year gesamt weiblich `weiblich inProzent` Deutsche Deutsch~1 Auslä~2 Auslä~3 davon~4 Polen~5
  <dbl>  <dbl>    <dbl>                <dbl>    <dbl>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1  1992    472      236                  500      325       689     169     358     167     354
2  1993    997      546                  548      598       600     399     400     384     385
3  1994   1443      724                  502      841       583     602     417     566     392
4  1995   1810      949                  524     1030       569     780     431     731     404
5  1996   2321     1242                  535     1348       581     973     419     883     380
6  1997   2835     1584                  559     1662       586    1173     414    1053     371
# ... with abbreviated variable names 1: `Deutsche inProzent`, 2: `Ausländer/innengesamt`,
#   3: `Ausländer/innenin Prozent`, 4: davonPolen, 5: `Polenin Prozent`

CodePudding user response:

Using stringi::stri_replace_all_regex, which allows for multiple patterns/replacements and is fast. Also used type.convert to convert columns into appropriate numeric format.

data[] <- sapply(data, stringi::stri_replace_all_regex, pattern=c('%', ','),
                 replacement=c('', '.'), vectorize=FALSE) |> type.convert(as.is=TRUE)
head(data, 3)
#   Year gesamt weiblich weiblich inProzent Deutsche
# 2 1992    472      236               50.0      325
# 3 1993    997      546               54.8      598
# 4 1994   1443      724               50.2      841
#   Deutsche inProzent Ausländer/innengesamt
# 2               68.9                   169
# 3               60.0                   399
# 4               58.3                   602
#   Ausländer/innenin Prozent davonPolen Polenin Prozent
# 2                      35.8        167            35.4
# 3                      40.0        384            38.5
# 4                      41.7        566            39.2

CodePudding user response:

Using base R

data[] <- lapply(data, chartr, old = ',%', new = '. ') |> 
          type.convert(as.is = TRUE)

-output

> str(data)
'data.frame':   6 obs. of  10 variables:
 $ Year                     : int  1992 1993 1994 1995 1996 1997
 $ gesamt                   : int  472 997 1443 1810 2321 2835
 $ weiblich                 : int  236 546 724 949 1242 1584
 $ weiblich inProzent       : num  50 54.8 50.2 52.4 53.5 55.9
 $ Deutsche                 : int  325 598 841 1030 1348 1662
 $ Deutsche inProzent       : num  68.9 60 58.3 56.9 58.1 58.6
 $ Ausländer/innengesamt    : int  169 399 602 780 973 1173
 $ Ausländer/innenin Prozent: num  35.8 40 41.7 43.1 41.9 41.4
 $ davonPolen               : int  167 384 566 731 883 1053
 $ Polenin Prozent          : num  35.4 38.5 39.2 40.4 38 37.1
> head(data)
  Year gesamt weiblich weiblich inProzent Deutsche Deutsche inProzent Ausländer/innengesamt Ausländer/innenin Prozent davonPolen Polenin Prozent
2 1992    472      236               50.0      325               68.9                   169                      35.8        167            35.4
3 1993    997      546               54.8      598               60.0                   399                      40.0        384            38.5
4 1994   1443      724               50.2      841               58.3                   602                      41.7        566            39.2
5 1995   1810      949               52.4     1030               56.9                   780                      43.1        731            40.4
6 1996   2321     1242               53.5     1348               58.1                   973                      41.9        883            38.0
7 1997   2835     1584               55.9     1662               58.6                  1173                      41.4       1053            37.1
  • Related