Home > other >  Count the number of observations across multiple columns and create new categories
Count the number of observations across multiple columns and create new categories

Time:02-01

I have data like these:

ID      color_1    color_2   color_3    color_4
1       blue        NA        NA         NA
5       green       blue      yellow     NA
9       None        NA        NA         NA 
2       blue        pink      NA         NA 
11      green       NA        NA         NA 

I want to add a new column that counts the number of colors (None, 1, or 2 or more):

ID      color_1    color_2   color_3    color_4    colors
1       blue        NA        NA         NA          1
5       green       blue      yellow     NA          2 or more
9       None        NA        NA         NA          None 
2       blue        pink      NA         NA          2 or more 
11      green       NA        NA         NA          1

I am thinking I can do something like this:

library(tidyverse)
data2 <- data %>%
 mutate(colors = case_when(
         color_1 == "None" ~ "None",
         

But I can't figure out how to do the counting part. Any help is appreciated!

CodePudding user response:

You can count using c_across and rowwise:

library(tidyverse)
df %>% 
  na_if("None") %>% 
  rowwise() %>% 
  mutate(count = sum(!is.na(c_across(color_1:color_4))),
         colors = case_when(count >= 2 ~ "2 or more",
                            count == 0 ~ "None",
                            T ~ as.character(count))) %>% 
  select(-count)

Output

     ID color_1 color_2 color_3 color_4 colors   
  <int> <chr>   <chr>   <chr>   <lgl>   <chr>    
1     1 blue    NA      NA      NA      1        
2     5 green   blue    yellow  NA      2 or more
3     9 NA      NA      NA      NA      None     
4     2 blue    pink    NA      NA      2 or more
5    11 green   NA      NA      NA      1        

CodePudding user response:

the approach is similar to @Maël, but using rowSums

df <- data.frame(
  stringsAsFactors = FALSE,
           ID = c(1L, 5L, 9L, 2L, 11L),
           color_1 = c("blue", "green", "None", "blue", "green"),
           color_2 = c(NA, "blue", NA, "pink", NA),
           color_3 = c(NA, "yellow", NA, NA, NA),
           color_4 = c(NA, NA, NA, NA, NA)
)

library(tidyverse)
library(data.table)


df %>%
  na_if("None") %>%
  mutate(
    count = rowSums(across(starts_with("color"), ~ !is.na(.x))),
    colors = case_when(count >= 2 ~ "2 or more",
                       count == 0 ~ "None",
                       T ~ as.character(count))
  ) %>% 
  select(-count)
#>   ID color_1 color_2 color_3 color_4    colors
#> 1  1    blue    <NA>    <NA>      NA         1
#> 2  5   green    blue  yellow      NA 2 or more
#> 3  9    <NA>    <NA>    <NA>      NA      None
#> 4  2    blue    pink    <NA>      NA 2 or more
#> 5 11   green    <NA>    <NA>      NA         1


# data.table

COLS <- grepl("^color", names(df))

setDT(df) %>%
  na_if("None") %>%
  .[, count := rowSums(!is.na(.SD)), .SDcols = COLS] %>%
  .[, colors := fcase(count >= 2, "2 or more",
                      count == 0, "None",
                      default = "1")] %>%
  .[, count := NULL] %>% 
  .[]
#>    ID color_1 color_2 color_3 color_4    colors
#> 1:  1    blue    <NA>    <NA>      NA         1
#> 2:  5   green    blue  yellow      NA 2 or more
#> 3:  9    <NA>    <NA>    <NA>      NA      None
#> 4:  2    blue    pink    <NA>      NA 2 or more
#> 5: 11   green    <NA>    <NA>      NA         1

Created on 2022-01-31 by the reprex package (v2.0.1)

  •  Tags:  
  • Related