Multiplying and Adding Values across Rows-CodePudding

I have this data frame:

color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100

sample_data = data.frame(id, color_1)


 id color_1
1  1     KAK
2  2     AKZ
3  3     KAK
4  4     KAK
5  5     AKZ
6  6     ZZA

Suppose there is a legend:

K = 3
A = 4
Z = 6

I want to add two columns to the above data frame:

sample_data$add_score : e.g. KAK = K A K = 3 4 3 = 10
sample_data$multiply_score : e.g. KAK = K * A * K = 3 * 4 * 3 = 36

I thought of solving the problem like this:

sample_data$first = substr(color_1,1,1)
sample_data$second = substr(color_1,2,2)
sample_data$third = substr(color_1,3,3)

sample_data$first_score = ifelse(sample_data$first == "K", 3, ifelse(sample_data$first == "A", 4, 6))
 
sample_data$second_score = ifelse(sample_data$second == "K", 3, ifelse(sample_data$second == "A", 4, 6))

sample_data$third_score = ifelse(sample_data$third == "K", 3, ifelse(sample_data$third == "A", 4, 6))

sample_data$add_score = sample_data$first_score   sample_data$second_score   sample_data$third_score

sample_data$multiply_score = sample_data$first_score * sample_data$second_score * sample_data$third_score

But I think this way would take a long time if the length of "color_1" was longer. Given a scoring legend, is there a faster way to do this?

Thank you!

CodePudding user response：

Here is a way.
The main trick is to strsplit into single characters and match these vectors with the legend. Then add or multiply the matching numbers.

set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id <- 1:100

sample_data = data.frame(id, color_1)

legend <- setNames(c(3, 4, 6), c("K", "A", "Z"))

add_mul <- function(x, l){
  add <- function(y, l){
    i <- match(y, names(l))
    sum(l[i])
  }
  mul <- function(y, l){
    i <- match(y, names(l))
    prod(l[i])
  }
  
  s <- strsplit(x, "")
  add_score <- sapply(s, add, l = l)
  mul_score <- sapply(s, mul, l = l)
  data.frame(add_score, mul_score)
}

sample_data <- cbind(sample_data, add_mul(sample_data$color_1, legend))
head(sample_data)
#>   id color_1 add_score mul_score
#> 1  1     ZZA        16       144
#> 2  2     KAK        10        36
#> 3  3     AKZ        13        72
#> 4  4     KAK        10        36
#> 5  5     AKZ        13        72
#> 6  6     KAK        10        36

^{Created on 2022-03-10 by the}

CodePudding user response：

Here’s an approach based on tidyr::separate_rows() followed by a grouped dplyr::summarize():

library(tidyverse)
set.seed(1)

legend <- c(K = 3, A = 4, Z = 6)

sample_data %>%
  mutate(decoded = color_1) %>%
  separate_rows(decoded, sep = "(?!^)") %>%
  mutate(decoded = legend[decoded]) %>%
  group_by(id, color_1) %>%
  summarize(
    add_score = sum(decoded),
    multiply_score = prod(decoded), 
    .groups = "drop"
  )

Output:

# A tibble: 100 x 4
      id color_1 add_score multiply_score
   <int> <chr>       <dbl>          <dbl>
 1     1 AKZ            13             72
 2     2 AKZ            13             72
 3     3 KAK            10             36
 4     4 ZZA            16            144
 5     5 AKZ            13             72
 6     6 ZZA            16            144
 7     7 ZZA            16            144
 8     8 KAK            10             36
 9     9 KAK            10             36
10    10 AKZ            13             72
# ... with 90 more rows

CodePudding user response：

We can use stri_replace_all_regex to replace your color_1 into integers together with the arithmetic operator.

Here I've stored your values into a vector color_1_convert. We can use this as the input in stri_replace_all_regex for better management of the values.

library(dplyr)
library(stringi)

color_1_convert <- c("K" = "3", "A" = "4", "Z" = "6")

sample_data %>%
  group_by(id) %>%
  mutate(add_score = eval(parse(text = gsub("\\ $", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, " "), vectorize_all = F)))),
         multiply_score = eval(parse(text = gsub("\\*$", "", stri_replace_all_regex(color_1, names(color_1_convert), paste0(color_1_convert, "*"), vectorize_all = F)))))

# A tibble: 100 × 4
# Groups:   id [100]
      id color_1 add_score multiply_score
   <int> <chr>       <dbl>          <dbl>
 1     1 KAK            10             36
 2     2 ZZA            16            144
 3     3 AKZ            13             72
 4     4 ZZA            16            144
 5     5 AKZ            13             72
 6     6 AKZ            13             72
 7     7 AKZ            13             72
 8     8 KAK            10             36
 9     9 ZZA            16            144
10    10 AKZ            13             72
# … with 90 more rows

CodePudding user response：

Yet another alternative, using some libraries optimized for speed, stringi for string manipulation and Rfast for matrix operations. Note that when any NA values are present in your data matrixStats is safer to use than Rfast.

set.seed(2022)
color <- c("AKZ", "ZZA", "KAK")    
color_1 <- sample(color, 100, replace=TRUE, prob=c(0.4, 0.3, 0.3))
id = 1:100
sample_data = data.frame(id, color_1)

m <- strsplit(sample_data[["color_1"]], "") |>
  unlist(use.names = F) |>
  stringi::stri_replace_all_regex(
    c("K", "A", "Z"),
    c("3", "4", "6"), vectorize_all = F) |>
  as.integer() |>
  matrix(ncol = 3, byrow = T)
sample_data$add_score <- Rfast::rowsums(m)
sample_data$mul_score <- Rfast::rowprods(m)

head(sample_data)
  id color_1 add_score mul_score
1  1     ZZA        16       144
2  2     KAK        10        36
3  3     AKZ        13        72
4  4     KAK        10        36
5  5     AKZ        13        72
6  6     KAK        10        36