Home > Software design >  How to count characters in a string based on their content in a vector
How to count characters in a string based on their content in a vector

Time:11-12

I have the following vectors:

# Character groups; the symbol in each trailing comment is the marker used
# for that group in the annotated examples.
v1 <- c("R", "H", "K")  # * (asterisk sign)
v2 <- c("D", "E")       # + (plus sign)
v3 <- c("A")            # - (minus sign)

Given another string, I'd like to count how many of its characters belong to each of v1, v2 and v3. For example:

# The line under each string marks every character by group:
# * = v1, + = v2, - = v3 (unmarked characters belong to no group).
x1 <- "GMRRRARRRS"
#        ***-***
# v1_count = 6
# v2_count = 0
# v3_count = 1

x2 <- "KMRDFRHRAE"
#      * *+ ***-+
# v1_count = 5
# v2_count = 2
# v3_count = 1

So each character that belongs to one of the vectors adds one to that vector's count.

The final output will be a data frame or tibble:

              R,H,K        D,E       A
GMRRRARRRS      6           0        1
KMRDFRHRAE      5           2        1

How can I achieve that with R?

CodePudding user response:

library(tidyverse)

Sample data in a tibble or data frame named df, e.g. df <- tibble(string = c("GMRRRARRRS", "KMRDFRHRAE")):

# A tibble: 2 x 1
  string    
  <chr>     
1 GMRRRARRRS
2 KMRDFRHRAE

Create a function that counts how many characters of a single string fall into each group

#' Count the characters of one string that fall into each fixed group
#'
#' @param string A character string; only its first element is examined.
#' @return A one-row tibble with integer columns `RHK`, `DE` and `A`.
get_count <- function(string) {
  # Fixed character groups to tally.
  groups <- list(
    RHK = c("R", "H", "K"),
    DE = c("D", "E"),
    A = c("A")
  )

  # Break the first string into its individual characters.
  letters_in_string <- str_split(string, "")[[1]]

  # Summing a logical membership vector counts the matching characters.
  tibble(
    "RHK" = sum(letters_in_string %in% groups$RHK),
    "DE" = sum(letters_in_string %in% groups$DE),
    "A" = sum(letters_in_string %in% groups$A)
  )
}

Add a list-column data by mapping the function over string, then expand it with unnest()

# Build one single-row tibble of counts per string, store them in the
# list-column `data`, then spread that column out into regular columns.
df |>
  mutate(data = map(string, get_count)) |>
  unnest(everything())

# A tibble: 2 x 4
  string       RHK    DE     A
  <chr>      <int> <int> <int>
1 GMRRRARRRS     6     0     1
2 KMRDFRHRAE     5     2     1

CodePudding user response:

library(stringr)
library(data.table)

v1 <- c('R', 'H', 'K') 
v2 <- c('D', 'E')   
v3 <- c('A') 

x1 <- 'GMRRRARRRS'
x2 <- 'KMRDFRHRAE'

#' Count how many characters of one string fall into each character group
#'
#' @param input_str A single character string to scan.
#' @param groups List of character vectors, one per group. Defaults to the
#'   `v1`, `v2` and `v3` vectors visible from where this function is defined,
#'   which are the objects the function previously looked up by name.
#' @return A one-row data frame: column `input` holds `input_str`, followed by
#'   one integer count column per group, named by joining the group's
#'   characters with commas (e.g. "R,H,K").
char_counts <- function(input_str, groups = list(v1, v2, v3)) {
  # Fail early with a clear message instead of an obscure data.frame error.
  stopifnot(is.character(input_str), length(input_str) == 1L)

  # Split once; the same characters are checked against every group.
  chars <- strsplit(input_str, "")[[1]]

  # vapply() pins the result type, so each count is always a single integer.
  counts <- vapply(groups, function(set) sum(chars %in% set), integer(1))
  cn <- vapply(groups, paste0, character(1), collapse = ",")

  # t() turns the counts into a 1 x k matrix so each group becomes a column.
  df <- data.frame(input_str, t(counts))
  colnames(df) <- c("input", cn)
  df
}
# Count each input string separately, stack the one-row results into a single
# table, convert it back to a plain data frame, and print it.
results_df <- as.data.frame(
  data.table::rbindlist(lapply(c(x1, x2), char_counts), fill = TRUE)
)
print(results_df)
       input R,H,K D,E A
1 GMRRRARRRS     6   0 1
2 KMRDFRHRAE     5   2 1

CodePudding user response:

I think we can do this a little more simply. Maybe just:

library(tidyverse)

x1 <- "GMRRRARRRS"
x2 <- "KMRDFRHRAE"

#' Count characters from three fixed groups in each string
#'
#' @param string Character vector of strings to scan.
#' @return A tibble with one row per string: `text` holds the string, and
#'   `RHK`, `DE` and `A` hold the number of matches for each group.
count_v <- function(string) {
  # Alternation patterns: a match on any listed character counts once.
  rhk_pattern <- paste(c("R", "H", "K"), collapse = "|")
  de_pattern <- paste(c("D", "E"), collapse = "|")

  tibble(
    text = string,
    RHK = str_count(string, rhk_pattern),
    DE = str_count(string, de_pattern),
    A = str_count(string, "A")
  )
}

count_v(c(x1,x2))
#> # A tibble: 2 x 4
#>   text         RHK    DE     A
#>   <chr>      <int> <int> <int>
#> 1 GMRRRARRRS     6     0     1
#> 2 KMRDFRHRAE     5     2     1

Or, with a little more flexibility, pass the character groups in as a list:

library(tidyverse)

x1 <- "GMRRRARRRS"
x2 <- "KMRDFRHRAE"


#' Count, for each string, the characters belonging to each group in `checks`
#'
#' @param string Character vector of strings to scan.
#' @param checks List of character vectors; each vector is one group of
#'   characters (or substrings) to count. Entries are matched literally.
#' @return A tibble with a `text` column holding `string` plus one count
#'   column per group, named by pasting the group's elements together.
count_v <- function(string, checks) {
  bind_cols(
    tibble(text = string),
    map_dfc(checks, \(x) {
      # Escape regex metacharacters so entries such as "." or "+" are counted
      # as literal characters rather than interpreted as regex syntax.
      literal <- str_replace_all(x, "([.^$\\\\|*+?{}\\[\\]()])", "\\\\\\1")
      tibble(
        "{paste(x, collapse = '')}" := str_count(
          string,
          paste(literal, collapse = "|")
        )
      )
    })
  )
}

count_v(string = c(x1, x2),
        checks = list(c("R", "H", "K"),
                      c("D", "E"),
                      c("A", "S"),
                      c("K", "F", "G")))
#> # A tibble: 2 x 5
#>   text         RHK    DE    AS   KFG
#>   <chr>      <int> <int> <int> <int>
#> 1 GMRRRARRRS     6     0     2     1
#> 2 KMRDFRHRAE     5     2     1     2
  • Related