I have this data frame
head(df)
## patnum hospstay lowph pltct race bwt gest inout twn lol magsulf
## 1 1 34 NA 100 white 1250 35 born at duke 0 NA NA
## 2 2 9 7.250000 244 white 1370 32 born at duke 0 NA NA
## 3 3 -2 7.059998 114 black 620 23 born at duke 0 NA NA
## 4 4 40 7.250000 182 black 1480 32 born at duke 0 NA NA
## 5 5 2 6.969997 54 black 925 28 born at duke 0 NA NA
## 6 6 62 7.189999 NA white 940 28 born at duke 0 NA NA
## meth toc delivery apg1 vent pneumo pda cld sex dead
## 1 0 0 abdominal 8 0 0 0 0 female 0
## 2 1 0 abdominal 7 0 0 0 0 female 0
## 3 0 1 vaginal 1 1 0 0 NA female 1
## 4 1 0 vaginal 8 0 0 0 0 male 0
## 5 0 0 abdominal 5 1 1 0 0 female 1
## 6 1 0 abdominal 8 1 0 0 0 female 0
The race variable has 4 entries, "white", "black", "native american", "oriental". I am wanting to replace this string with capitalized versions "White", "Black", "Native American", "Oriental". I would like to do this using the substr function. I'm not sure what code to use to accomplish this. I was provided an example below, where the
day_full = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
substr(day_full_1, nchar(day_full_1)-2, nchar(day_full_1)) = "DAY"
The result is: "SunDAY", "MonDAY", "TuesDAY", "WednesDAY", "ThursDAY", "FriDAY", "SaturDAY", "SunDAY"
This is similar to what I want to do, but I only want to have the first letter of each of the races to be capitalized. How would I translate this to make each first letter of the 4 races capital?
This is the solution I've tried now.
substr(SB_xlsx$race, 1, 1) <- toupper(substr(SB_xlsx$race, 1, 1))
substr(SB_xlsx$race, 1, 1)
## structure(list(patnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), hospstay = c(34,
## 9, -2, 40, 2, 62, 32, NA, NA, 28, 38, NA, 62, 69, 1, 93, 44,
## 50, 66, 65, 44, 70, 85, NA), lowph = c(NA, 7.25, 7.059998, 7.25,
## 6.969997, 7.189999, 7.32, NA, NA, 7.16, 7.039997, NA, 7.179996,
## 7.419998, 7.119999, 7.239998, 7.129997, 7.269997, 7.179996, 7.07,
## 7.289997, 7.129997, 7.189999, NA), pltct = c(100, 244, 114, 182,
## 54, NA, 282, NA, NA, 153, 229, NA, 182, 361, 378, 255, 186, NA,
## 260, 183, 134, 229, 68, NA), race = c("white", "white", "black",
## "black", "black", "white", "black", NA, NA, "black", "white",
## NA, "black", "white", "white", "black", "white", "black", "black",
## "white", "white", "black", "white", NA), bwt = c(1250, 1370,
## 620, 1480, 925, 940, 1255, 600, 700, 1350, 1310, 550, 1110, 1180,
## 970, 770, 1490, 1170, 1360, 1330, 1000, 1120, 740, NA), gest = c(35,
## 32, 23, 32, 28, 28, 29.5, 26, 24, 34, 32, 24, 28, 28, 28, 26,
## 33, 31, 31, 31, 28, 29, 26, NA), inout = c("born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", NA), twn = c(0, 0, 0, 0, 0, 0, 0, NA, NA, 0,
## 0, NA, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, NA), lol = c(NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA), meth = c(0, 1, 0, 1, 0, 1, 1, NA, NA, 1, 0, NA, 0, 0,
## 1, 1, 1, 1, 1, 1, 0, 1, 0, NA), toc = c(0, 0, 1, 0, 0, 0, 0,
## NA, NA, 0, 0, NA, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, NA), delivery = c("abdominal",
## "abdominal", "vaginal", "vaginal", "abdominal", "abdominal",
## "vaginal", NA, NA, "abdominal", "vaginal", NA, "vaginal", "abdominal",
## "vaginal", "vaginal", "abdominal", "vaginal", "vaginal", "vaginal",
## "vaginal", "vaginal", "abdominal", NA), apg1 = c(8, 7, 1, 8,
## 5, 8, 9, NA, NA, 4, 6, NA, 6, 6, 2, 4, 8, 7, 1, 8, 5, 9, 9, NA
## ), vent = c(0, 0, 1, 0, 1, 1, 0, NA, NA, 0, 1, NA, 0, 0, 1, 1,
## 0, 0, 1, 1, 0, 1, 0, NA), pneumo = c(0, 0, 0, 0, 1, 0, 0, NA,
## NA, 0, 0, NA, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, NA), pda = c(0,
## 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,
## 0, 0, NA), cld = c(0, 0, NA, 0, 0, 0, 0, NA, NA, 0, 0, NA, 1,
## 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, NA), sex = c("female", "female",
## "female", "male", "female", "female", "female", NA, NA, "female",
## "male", NA, "male", "male", "female", "male", "male", "female",
## "male", "male", "female", "female", "female", NA), dead = c(0,
## 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
## 0, NA)), class = "data.frame", row.names = c(NA, -24L))
CodePudding user response:
Two solutions:
df <- structure(list(patnum = 1:6, hospstay = c(34L, 9L, -2L, 40L, 2L, 62L), lowph = c(NA, 7.25, 7.059998, 7.25, 6.969997, 7.189999), pltct = c(100L, 244L, 114L, 182L, 54L, NA), race = c("white", "white", "black", "black", "black", "white"), bwt = c(1250L, 1370L, 620L, 1480L, 925L, 940L), gest = c(35L, 32L, 23L, 32L, 28L, 28L), inout = c("born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke"), twn = c(0L, 0L, 0L, 0L, 0L, 0L), lol = c(NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
tools::toTitleCase(df$race)
# [1] "White" "White" "Black" "Black" "Black" "White"
But those are simpler with no spaces, let's create one for this exercise:
vec <- c("white", "black", "native american")
tools::toTitleCase(vec)
# [1] "White" "Black" "Native American"
We can also use gregexpr
/regmatches
to do it:
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre)
# [[1]]
# [1] "w"
# [[2]]
# [1] "b"
# [[3]]
# [1] "n" "a"
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
# [1] "White" "Black" "Native American"
I'm sure there's a stringr
-variant out there as well.
As for substr
, it's feasible to use regex to find all (1) first-chars and (2) all chars that follow a space, then extract each one, then toupper
-them, then put that back into place ... but at that point you're still using regex and effectively doing what toTitleCase
is doing natively and what this gregexpr
/regmatches
code is doing a little more verbosely.
If all you wanted to do was replace the first character, though, and not care about letters after spaces, then
substr(vec, 1, 1) <- toupper(substr(vec, 1, 1))
vec
# [1] "White" "Black" "Native american"
though in this case, I think the lower-case "a"
in "Native american"
is wrong, so I don't think this is the best approach.
Scaling
Since you are concerned about scaling (I'm assuming you're venturing into 100K or more, since less than that is not going to be an issue with any method demonstrated), here's a comparison:
bench::mark(
toTitleCase = tools::toTitleCase(vec),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
}
)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 401us 474us 1735. 4.15KB 0 868 0 500ms <chr [3]> <Rprofmem [9 x 3]> <bench_~ <tibble~
# 2 gregexpr 111us 205us 5240. 24.28KB 2.26 2315 1 442ms <chr [3]> <Rprofmem [6 x 3]> <bench_~ <tibble~
Granted, vec
size 3 is pretty small, let's scale that up a bit.
vec30000 <- rep(vec, 10000) # 30000 length
bench::mark(
toTitleCase = tools::toTitleCase(vec30000),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000
}
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 6.01s 6.01s 0.166 36MB 0.832 1 5 6.01s <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
# 2 gregexpr 773.13ms 773.13ms 1.29 241MB 2.59 1 2 773.13ms <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
Looking at the `itr/sec`
column showing iterations per second, it appears that even at scale, the gregexpr
method works better. (If you look at the source code for toTitleCase
, you'll see why: it's consider a lot more than just space-delimited words, it's also consider linking words, exception-words, etc.)
CodePudding user response:
Another way is to use perl substitution:
gsub('\\b(\\w)', '\\U\\1', vec, perl = TRUE)
[1] "White" "Black" "Native American"
This method is way faster (ie 35 times Faster) than the gregexpr
method mentioned before:
microbenchmark::microbenchmark(
gsub = gsub('\\b(\\w)', '\\U\\1', vec30000, perl = TRUE),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000 },
unit = 'relative', check = 'equal')
Unit: relative
expr min lq mean median uq max neval
gsub 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 5
gregexpr 37.37549 41.10014 29.00345 24.49221 25.39978 25.54325 5