Capitalizing the first letter of characters in a column using substr function-CodePudding

I have this data frame

head(df)
##   patnum hospstay    lowph pltct  race  bwt gest        inout twn lol magsulf
## 1      1       34       NA   100 white 1250   35 born at duke   0  NA      NA
## 2      2        9 7.250000   244 white 1370   32 born at duke   0  NA      NA
## 3      3       -2 7.059998   114 black  620   23 born at duke   0  NA      NA
## 4      4       40 7.250000   182 black 1480   32 born at duke   0  NA      NA
## 5      5        2 6.969997    54 black  925   28 born at duke   0  NA      NA
## 6      6       62 7.189999    NA white  940   28 born at duke   0  NA      NA
##   meth toc  delivery apg1 vent pneumo pda cld    sex dead
## 1    0   0 abdominal    8    0      0   0   0 female    0
## 2    1   0 abdominal    7    0      0   0   0 female    0
## 3    0   1   vaginal    1    1      0   0  NA female    1
## 4    1   0   vaginal    8    0      0   0   0   male    0
## 5    0   0 abdominal    5    1      1   0   0 female    1
## 6    1   0 abdominal    8    1      0   0   0 female    0

The race variable has 4 entries, "white", "black", "native american", "oriental". I am wanting to replace this string with capitalized versions "White", "Black", "Native American", "Oriental". I would like to do this using the substr function. I'm not sure what code to use to accomplish this. I was provided an example below, where the

day_full = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")

substr(day_full_1, nchar(day_full_1)-2, nchar(day_full_1)) = "DAY"

The result is: "SunDAY", "MonDAY", "TuesDAY", "WednesDAY", "ThursDAY", "FriDAY", "SaturDAY", "SunDAY"

This is similar to what I want to do, but I only want to have the first letter of each of the races to be capitalized. How would I translate this to make each first letter of the 4 races capital?

This is the solution I've tried now.

substr(SB_xlsx$race, 1, 1) <- toupper(substr(SB_xlsx$race, 1, 1))

substr(SB_xlsx$race, 1, 1) 

## structure(list(patnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
## 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), hospstay = c(34, 
## 9, -2, 40, 2, 62, 32, NA, NA, 28, 38, NA, 62, 69, 1, 93, 44, 
## 50, 66, 65, 44, 70, 85, NA), lowph = c(NA, 7.25, 7.059998, 7.25, 
## 6.969997, 7.189999, 7.32, NA, NA, 7.16, 7.039997, NA, 7.179996, 
## 7.419998, 7.119999, 7.239998, 7.129997, 7.269997, 7.179996, 7.07, 
## 7.289997, 7.129997, 7.189999, NA), pltct = c(100, 244, 114, 182, 
## 54, NA, 282, NA, NA, 153, 229, NA, 182, 361, 378, 255, 186, NA, 
## 260, 183, 134, 229, 68, NA), race = c("white", "white", "black", 
## "black", "black", "white", "black", NA, NA, "black", "white", 
## NA, "black", "white", "white", "black", "white", "black", "black", 
## "white", "white", "black", "white", NA), bwt = c(1250, 1370, 
## 620, 1480, 925, 940, 1255, 600, 700, 1350, 1310, 550, 1110, 1180, 
## 970, 770, 1490, 1170, 1360, 1330, 1000, 1120, 740, NA), gest = c(35, 
## 32, 23, 32, 28, 28, 29.5, 26, 24, 34, 32, 24, 28, 28, 28, 26, 
## 33, 31, 31, 31, 28, 29, 26, NA), inout = c("born at duke", "born at duke", 
## "born at duke", "born at duke", "born at duke", "born at duke", 
## "born at duke", "born at duke", "born at duke", "born at duke", 
## "born at duke", "born at duke", "born at duke", "born at duke", 
## "born at duke", "born at duke", "born at duke", "born at duke", 
## "born at duke", "born at duke", "born at duke", "born at duke", 
## "born at duke", NA), twn = c(0, 0, 0, 0, 0, 0, 0, NA, NA, 0, 
## 0, NA, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, NA), lol = c(NA, NA, 
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
## NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA, 
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
## NA, NA), meth = c(0, 1, 0, 1, 0, 1, 1, NA, NA, 1, 0, NA, 0, 0, 
## 1, 1, 1, 1, 1, 1, 0, 1, 0, NA), toc = c(0, 0, 1, 0, 0, 0, 0, 
## NA, NA, 0, 0, NA, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, NA), delivery = c("abdominal", 
## "abdominal", "vaginal", "vaginal", "abdominal", "abdominal", 
## "vaginal", NA, NA, "abdominal", "vaginal", NA, "vaginal", "abdominal", 
## "vaginal", "vaginal", "abdominal", "vaginal", "vaginal", "vaginal", 
## "vaginal", "vaginal", "abdominal", NA), apg1 = c(8, 7, 1, 8, 
## 5, 8, 9, NA, NA, 4, 6, NA, 6, 6, 2, 4, 8, 7, 1, 8, 5, 9, 9, NA
## ), vent = c(0, 0, 1, 0, 1, 1, 0, NA, NA, 0, 1, NA, 0, 0, 1, 1, 
## 0, 0, 1, 1, 0, 1, 0, NA), pneumo = c(0, 0, 0, 0, 1, 0, 0, NA, 
## NA, 0, 0, NA, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, NA), pda = c(0, 
## 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
## 0, 0, NA), cld = c(0, 0, NA, 0, 0, 0, 0, NA, NA, 0, 0, NA, 1, 
## 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, NA), sex = c("female", "female", 
## "female", "male", "female", "female", "female", NA, NA, "female", 
## "male", NA, "male", "male", "female", "male", "male", "female", 
## "male", "male", "female", "female", "female", NA), dead = c(0, 
## 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
## 0, NA)), class = "data.frame", row.names = c(NA, -24L))

CodePudding user response：

Two solutions:

df <- structure(list(patnum = 1:6, hospstay = c(34L, 9L, -2L, 40L, 2L, 62L), lowph = c(NA, 7.25, 7.059998, 7.25, 6.969997, 7.189999), pltct = c(100L, 244L, 114L, 182L, 54L, NA), race = c("white", "white", "black", "black", "black", "white"), bwt = c(1250L, 1370L, 620L, 1480L, 925L, 940L), gest = c(35L, 32L, 23L, 32L, 28L, 28L), inout = c("born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke"), twn = c(0L, 0L, 0L, 0L, 0L, 0L), lol = c(NA, NA, NA, NA, NA, NA), magsulf = c(NA,  NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))

tools::toTitleCase(df$race)
# [1] "White" "White" "Black" "Black" "Black" "White"

But those are simpler with no spaces, let's create one for this exercise:

vec <- c("white", "black", "native american")
tools::toTitleCase(vec)
# [1] "White"           "Black"           "Native American"

We can also use gregexpr/regmatches to do it:

gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre)
# [[1]]
# [1] "w"
# [[2]]
# [1] "b"
# [[3]]
# [1] "n" "a"
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
# [1] "White"           "Black"           "Native American"

I'm sure there's a stringr-variant out there as well.

As for substr, it's feasible to use regex to find all (1) first-chars and (2) all chars that follow a space, then extract each one, then toupper-them, then put that back into place ... but at that point you're still using regex and effectively doing what toTitleCase is doing natively and what this gregexpr/regmatches code is doing a little more verbosely.

If all you wanted to do was replace the first character, though, and not care about letters after spaces, then

substr(vec, 1, 1) <- toupper(substr(vec, 1, 1))
vec
# [1] "White"           "Black"           "Native american"

though in this case, I think the lower-case "a" in "Native american" is wrong, so I don't think this is the best approach.

Scaling

Since you are concerned about scaling (I'm assuming you're venturing into 100K or more, since less than that is not going to be an issue with any method demonstrated), here's a comparison:

bench::mark(
  toTitleCase = tools::toTitleCase(vec),
  gregexpr = {
    gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
    regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
    vec
  }
)
# # A tibble: 2 x 13
#   expression       min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result    memory             time     gc      
#   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>    <list>             <list>   <list>  
# 1 toTitleCase    401us    474us     1735.    4.15KB     0      868     0      500ms <chr [3]> <Rprofmem [9 x 3]> <bench_~ <tibble~
# 2 gregexpr       111us    205us     5240.   24.28KB     2.26  2315     1      442ms <chr [3]> <Rprofmem [6 x 3]> <bench_~ <tibble~

Granted, vec size 3 is pretty small, let's scale that up a bit.

vec30000 <- rep(vec, 10000) # 30000 length
bench::mark(
  toTitleCase = tools::toTitleCase(vec30000),
  gregexpr = {
    gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
    regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
    vec30000
  }
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
#   expression       min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result         memory       time      gc      
#   <bch:expr>  <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>         <list>       <list>    <list>  
# 1 toTitleCase    6.01s    6.01s     0.166      36MB    0.832     1     5      6.01s <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
# 2 gregexpr    773.13ms 773.13ms     1.29      241MB    2.59      1     2   773.13ms <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~

Looking at the `itr/sec` column showing iterations per second, it appears that even at scale, the gregexpr method works better. (If you look at the source code for toTitleCase, you'll see why: it's consider a lot more than just space-delimited words, it's also consider linking words, exception-words, etc.)

CodePudding user response：

Another way is to use perl substitution:

gsub('\\b(\\w)', '\\U\\1', vec, perl = TRUE)
[1] "White"           "Black"           "Native American"

This method is way faster (ie 35 times Faster) than the gregexpr method mentioned before:

 microbenchmark::microbenchmark(
     gsub = gsub('\\b(\\w)', '\\U\\1', vec30000, perl = TRUE),
     gregexpr = {
         gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
         regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
         vec30000 },
     unit = 'relative', check = 'equal')

Unit: relative
     expr      min       lq     mean   median       uq      max neval
     gsub  1.00000  1.00000  1.00000  1.00000  1.00000  1.00000     5
 gregexpr 37.37549 41.10014 29.00345 24.49221 25.39978 25.54325     5