I have some variables with are percentage values of their neighbour columns. I would like to turn them into numerics. I found some function to detect them either by column name or the percentage sign and managed to mutate
single column with parse_number
. How can I extend or simplify this to all columns with percentage values?
data <- data.frame(
stringsAsFactors = FALSE,
check.names = FALSE,
row.names = c("2", "3", "4", "5", "6", "7"),
Year = c("1992",
"1993","1994","1995","1996","1997"),
gesamt = c("472", "997", "1443", "1810", "2321", "2835"),
weiblich = c("236", "546", "724", "949", "1242", "1584"),
`weiblich inProzent` = c("50,0%",
"54,8%","50,2%","52,4%","53,5%","55,9%"),
Deutsche = c("325", "598", "841", "1030", "1348", "1662"),
`Deutsche inProzent` = c("68,9%",
"60,0%","58,3%","56,9%","58,1%","58,6%"),
`Ausländer/innengesamt` = c("169", "399", "602", "780", "973", "1173"),
`Ausländer/innenin Prozent` = c("35,8%",
"40,0%","41,7%","43,1%","41,9%","41,4%"),
davonPolen = c("167", "384", "566", "731", "883", "1053"),
`Polenin Prozent` = c("35,4%",
"38,5%","39,2%","40,4%","38,0%","37,1%")
)
data %>%
select(contains("Prozent")) %>%
mutate(fem_pct = parse_number(`weiblich inProzent`))
CodePudding user response:
df %>%
mutate(across(everything(), ~ ifelse(str_detect(.x, "%"),
parse_number(.x) / 10,
.x)))
# A tibble: 6 x 10
Year gesamt weiblich `weiblich inProzent` Deutsche Deutsch~1 Auslä~2 Auslä~3 davon~4 Polen~5
<chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 1992 472 236 50 325 68.9 169 35.8 167 35.4
2 1993 997 546 54.8 598 60 399 40 384 38.5
3 1994 1443 724 50.2 841 58.3 602 41.7 566 39.2
4 1995 1810 949 52.4 1030 56.9 780 43.1 731 40.4
5 1996 2321 1242 53.5 1348 58.1 973 41.9 883 38
6 1997 2835 1584 55.9 1662 58.6 1173 41.4 1053 37.1
# ... with abbreviated variable names 1: `Deutsche inProzent`, 2: `Ausländer/innengesamt`,
# 3: `Ausländer/innenin Prozent`, 4: davonPolen, 5: `Polenin Prozent`
Or if you want only parse_number
df %>%
mutate(across(everything(), parse_number))
# A tibble: 6 x 10
Year gesamt weiblich `weiblich inProzent` Deutsche Deutsch~1 Auslä~2 Auslä~3 davon~4 Polen~5
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1992 472 236 500 325 689 169 358 167 354
2 1993 997 546 548 598 600 399 400 384 385
3 1994 1443 724 502 841 583 602 417 566 392
4 1995 1810 949 524 1030 569 780 431 731 404
5 1996 2321 1242 535 1348 581 973 419 883 380
6 1997 2835 1584 559 1662 586 1173 414 1053 371
# ... with abbreviated variable names 1: `Deutsche inProzent`, 2: `Ausländer/innengesamt`,
# 3: `Ausländer/innenin Prozent`, 4: davonPolen, 5: `Polenin Prozent`
CodePudding user response:
Using stringi::stri_replace_all_regex
, which allows for multiple patterns/replacements and is fast. Also used type.convert
to convert columns into appropriate numeric format.
data[] <- sapply(data, stringi::stri_replace_all_regex, pattern=c('%', ','),
replacement=c('', '.'), vectorize=FALSE) |> type.convert(as.is=TRUE)
head(data, 3)
# Year gesamt weiblich weiblich inProzent Deutsche
# 2 1992 472 236 50.0 325
# 3 1993 997 546 54.8 598
# 4 1994 1443 724 50.2 841
# Deutsche inProzent Ausländer/innengesamt
# 2 68.9 169
# 3 60.0 399
# 4 58.3 602
# Ausländer/innenin Prozent davonPolen Polenin Prozent
# 2 35.8 167 35.4
# 3 40.0 384 38.5
# 4 41.7 566 39.2
CodePudding user response:
Using base R
data[] <- lapply(data, chartr, old = ',%', new = '. ') |>
type.convert(as.is = TRUE)
-output
> str(data)
'data.frame': 6 obs. of 10 variables:
$ Year : int 1992 1993 1994 1995 1996 1997
$ gesamt : int 472 997 1443 1810 2321 2835
$ weiblich : int 236 546 724 949 1242 1584
$ weiblich inProzent : num 50 54.8 50.2 52.4 53.5 55.9
$ Deutsche : int 325 598 841 1030 1348 1662
$ Deutsche inProzent : num 68.9 60 58.3 56.9 58.1 58.6
$ Ausländer/innengesamt : int 169 399 602 780 973 1173
$ Ausländer/innenin Prozent: num 35.8 40 41.7 43.1 41.9 41.4
$ davonPolen : int 167 384 566 731 883 1053
$ Polenin Prozent : num 35.4 38.5 39.2 40.4 38 37.1
> head(data)
Year gesamt weiblich weiblich inProzent Deutsche Deutsche inProzent Ausländer/innengesamt Ausländer/innenin Prozent davonPolen Polenin Prozent
2 1992 472 236 50.0 325 68.9 169 35.8 167 35.4
3 1993 997 546 54.8 598 60.0 399 40.0 384 38.5
4 1994 1443 724 50.2 841 58.3 602 41.7 566 39.2
5 1995 1810 949 52.4 1030 56.9 780 43.1 731 40.4
6 1996 2321 1242 53.5 1348 58.1 973 41.9 883 38.0
7 1997 2835 1584 55.9 1662 58.6 1173 41.4 1053 37.1