Extract maximum number from a text string-CodePudding

I want to extract the highest numeric value from each string considering that sometimes that string will be an NA, which will be copied as NA.

I've tried the following solution:

# Attempt from the question: splitting on runs of non-digits leaves a leading
# "" piece, which as.numeric() turns into NA, so max() returns NA for every row.
df2$val = sapply(strsplit(df2$NomNivelComplej , '\\D+'), function(x) max(as.numeric(x)))

But all I get is NA

Data

# Example data for the question: an integer IdCx column and a character
# NomNivelComplej column whose second entry is NA.
df2 <- structure(
  list(
    IdCx = rep(c(47111L, 47108L, 47107L, 47106L), times = c(4L, 2L, 2L, 2L)),
    NomNivelComplej = c(
      "De 111 Hasta 130 U.V.R.",
      NA,
      "De 111 Hasta 130 U.V.R.",
      "De 111 Hasta 130 U.V.R.",
      "De 91 Hasta 100 U.V.R.",
      "De 131 Hasta 150 U.V.R.",
      "De 31 Hasta 40 U.V.R.",
      "De 71 Hasta 80 U.V.R.",
      "De 111 Hasta 130 U.V.R.",
      "De 111 Hasta 130 U.V.R."
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

CodePudding user response：

Just add `na.rm = TRUE` to your `max()` call, and then replace the infinite values (which `max()` returns for rows that contain no number, such as NA) with NA.

# Split each string on runs of non-digits, convert the pieces to integers and
# keep the largest one, ignoring the NA that comes from the leading empty piece.
# Rows without any number (e.g. NA) give -Inf, with a warning, at this stage.
df2$val <- vapply(
  strsplit(df2$NomNivelComplej, "\\D+"),
  function(x) max(as.integer(x), na.rm = TRUE),
  numeric(1)
)
# Turn the -Inf placeholders into NA.
df2$val <- replace(df2$val, is.infinite(df2$val), NA)

You could also use a small helper function (taken from here) that returns NA directly when there is nothing to take the maximum of.

# Largest non-missing value of x, or NA when x has no non-missing values
# (avoids the -Inf and warning that max(x, na.rm = TRUE) gives in that case).
mymax <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  max(x, na.rm = TRUE)
}
# Split each string on runs of non-digits and take the largest integer per row;
# mymax() returns NA for rows that contain no number.
df2$val <- vapply(
  strsplit(df2$NomNivelComplej, "\\D+"),
  function(x) mymax(as.integer(x)),
  numeric(1)
)

Output:

# > df2
#     IdCx         NomNivelComplej val
# 1  47111 De 111 Hasta 130 U.V.R. 130
# 2  47111                    <NA>  NA
# 3  47111 De 111 Hasta 130 U.V.R. 130
# 4  47111 De 111 Hasta 130 U.V.R. 130
# 5  47108  De 91 Hasta 100 U.V.R. 100
# 6  47108 De 131 Hasta 150 U.V.R. 150
# 7  47107   De 31 Hasta 40 U.V.R.  40
# 8  47107   De 71 Hasta 80 U.V.R.  80
# 9  47106 De 111 Hasta 130 U.V.R. 130
# 10 47106 De 111 Hasta 130 U.V.R. 130

CodePudding user response：

A dplyr and stringr solution that doesn't assume anything about where the number is inside the string:

library(stringr)
library(dplyr)
# Largest number embedded anywhere in the string x.
# Splits x at every non-digit character, drops the empty and missing pieces,
# and returns the maximum of what is left as a double.
# Returns NA when x is NA or contains no digits at all.
get_max <- function(x) {
  pieces <- unlist(str_split(x, "\\D"))
  digits <- pieces[!is.na(pieces) & pieces != ""]
  if (length(digits) == 0L) {
    # Nothing to compare: avoid max() returning -Inf with a warning.
    return(NA_real_)
  }
  max(as.numeric(digits))
}

# Evaluate get_max() once per row and store the result in max_val.
df2 %>%
  rowwise() %>%
  mutate(max_val = get_max(NomNivelComplej))

# A tibble: 10 × 3
# Rowwise: 
    IdCx NomNivelComplej         max_val
   <int> <chr>                     <dbl>
 1 47111 De 111 Hasta 130 U.V.R.     130
 2 47111 NA                           NA
 3 47111 De 111 Hasta 130 U.V.R.     130
 4 47111 De 111 Hasta 130 U.V.R.     130
 5 47108 De 91 Hasta 100 U.V.R.      100
 6 47108 De 131 Hasta 150 U.V.R.     150
 7 47107 De 31 Hasta 40 U.V.R.        40
 8 47107 De 71 Hasta 80 U.V.R.        80
 9 47106 De 111 Hasta 130 U.V.R.     130
10 47106 De 111 Hasta 130 U.V.R.     130

CodePudding user response：

You could pad every split result to length 3 with `length<-`, calculate the column maxima with colMaxs (from the matrixStats package), and replace infinite values with NA.

# Split on runs of non-digits, pad every piece vector to length 3 so sapply()
# returns a 3-row character matrix (one column per input string), convert it to
# numbers, take the maximum of each column, and turn the -Inf of all-NA columns
# into NA.
df2$val <- df2$NomNivelComplej |>
  strsplit("\\D+") |>
  sapply(`length<-`, 3) |>
  type.convert(as.is = TRUE) |>
  matrixStats::colMaxs(na.rm = TRUE) |>
  (\(.) replace(., is.infinite(.), NA))()
df2
#     IdCx         NomNivelComplej val
# 1  47111 De 111 Hasta 130 U.V.R. 130
# 2  47111                    <NA>  NA
# 3  47111 De 111 Hasta 130 U.V.R. 130
# 4  47111 De 111 Hasta 130 U.V.R. 130
# 5  47108  De 91 Hasta 100 U.V.R. 100
# 6  47108 De 131 Hasta 150 U.V.R. 150
# 7  47107   De 31 Hasta 40 U.V.R.  40
# 8  47107   De 71 Hasta 80 U.V.R.  80
# 9  47106 De 111 Hasta 130 U.V.R. 130
# 10 47106 De 111 Hasta 130 U.V.R. 130

Note: R >= 4.1 used

Data:

# Example data: an integer IdCx column and a character NomNivelComplej column
# whose second entry is NA.
df2 <- structure(
  list(
    IdCx = rep(c(47111L, 47108L, 47107L, 47106L), times = c(4L, 2L, 2L, 2L)),
    NomNivelComplej = c(
      "De 111 Hasta 130 U.V.R.",
      NA,
      "De 111 Hasta 130 U.V.R.",
      "De 111 Hasta 130 U.V.R.",
      "De 91 Hasta 100 U.V.R.",
      "De 131 Hasta 150 U.V.R.",
      "De 31 Hasta 40 U.V.R.",
      "De 71 Hasta 80 U.V.R.",
      "De 111 Hasta 130 U.V.R.",
      "De 111 Hasta 130 U.V.R."
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

CodePudding user response：

You could do as follows:

# For each string, pull out every run of digits and keep the largest one.
# Strings without any digits (including NA) yield -Inf, with a warning from
# max(); see the note below the output.
temp <- vapply(
  df2$NomNivelComplej,
  function(i) {
    digits <- unlist(regmatches(i, gregexpr("[[:digit:]]+", i)))
    max(as.numeric(digits), na.rm = TRUE)
  },
  numeric(1),
  USE.NAMES = FALSE
)

df2 <- cbind(df2, temp)

Output:

    IdCx         NomNivelComplej temp
1  47111 De 111 Hasta 130 U.V.R.  130
2  47111                    <NA> -Inf
3  47111 De 111 Hasta 130 U.V.R.  130
4  47111 De 111 Hasta 130 U.V.R.  130
5  47108  De 91 Hasta 100 U.V.R.  100
6  47108 De 131 Hasta 150 U.V.R.  150
7  47107   De 31 Hasta 40 U.V.R.   40
8  47107   De 71 Hasta 80 U.V.R.   80
9  47106 De 111 Hasta 130 U.V.R.  130
10 47106 De 111 Hasta 130 U.V.R.  130

Note: It returns -Inf in the case of NA, but you can always take care of that afterwards, e.g. `df2$temp[is.infinite(df2$temp)] <- NA`.

CodePudding user response：

You can capitalize on the fact that the highest value in the string always follows "Hasta ":

library(stringr)
library(dplyr)
# Take the run of digits that directly follows "Hasta " (NA stays NA).
# The extracted value is a character string.
df2 %>%
  mutate(max = str_extract(NomNivelComplej, "(?<=Hasta )\\d+"))
    IdCx         NomNivelComplej  max
1  47111 De 111 Hasta 130 U.V.R.  130
2  47111                    <NA> <NA>
3  47111 De 111 Hasta 130 U.V.R.  130
4  47111 De 111 Hasta 130 U.V.R.  130
5  47108  De 91 Hasta 100 U.V.R.  100
6  47108 De 131 Hasta 150 U.V.R.  150
7  47107   De 31 Hasta 40 U.V.R.   40
8  47107   De 71 Hasta 80 U.V.R.   80
9  47106 De 111 Hasta 130 U.V.R.  130
10 47106 De 111 Hasta 130 U.V.R.  130

Without dplyr (assigning the column directly, still using stringr):

library(stringr)
# Take the run of digits that directly follows "Hasta " (NA stays NA).
# The extracted value is a character string.
df2$max <- str_extract(df2$NomNivelComplej, "(?<=Hasta )\\d+")

CodePudding user response：

An option with base R

# Replace every run of non-digits with a comma, strip the outer commas, parse
# the result as CSV (one column per number) and take the row-wise maximum.
# A missing string is read as a row of NAs, so its maximum is NA.
df2$Max <- do.call(
  pmax,
  read.csv(
    text = trimws(gsub("\\D+", ",", df2$NomNivelComplej), whitespace = ","),
    header = FALSE
  )
)

-output

> df2
    IdCx         NomNivelComplej Max
1  47111 De 111 Hasta 130 U.V.R. 130
2  47111                    <NA>  NA
3  47111 De 111 Hasta 130 U.V.R. 130
4  47111 De 111 Hasta 130 U.V.R. 130
5  47108  De 91 Hasta 100 U.V.R. 100
6  47108 De 131 Hasta 150 U.V.R. 150
7  47107   De 31 Hasta 40 U.V.R.  40
8  47107   De 71 Hasta 80 U.V.R.  80
9  47106 De 111 Hasta 130 U.V.R. 130
10 47106 De 111 Hasta 130 U.V.R. 130