Home > database >  How to calculate the sum of distinct observations in R dplyr
How to calculate the sum of distinct observations in R dplyr

Time:03-17

I am quite puzzled. While I know how to count the sum of distinct_values per group with the n_distinct(), its seems challenging to me at the moment to find the sum of the unique of the unique observations.

I want to group by id, and then sum each unique value and.....

library(tidyverse)

df <- tibble(col1 = c("apple","apple","pple", "banana", "banana","bananna"),
             col2 = c("pple","app","app", "bananna", "banan", "banan"), 
             counts_col1 = c(100,100,2,200,200,2),
             counts_col2 = c(2,50,50,2,20,20),
             id=c(1,1,1,2,2,2))

df1 <- df %>% 
  pivot_longer(c(counts_col1:counts_col2),names_to ="strings",values_to = "value") %>% 
  group_by(id,col1,col2) %>% 
  ungroup() %>% 
  group_by(id) 

df1
#> # A tibble: 12 × 5
#> # Groups:   id [2]
#>    col1    col2       id strings     value
#>    <chr>   <chr>   <dbl> <chr>       <dbl>
#>  1 apple   pple        1 counts_col1   100
#>  2 apple   pple        1 counts_col2     2
#>  3 apple   app         1 counts_col1   100
#>  4 apple   app         1 counts_col2    50
#>  5 pple    app         1 counts_col1     2
#>  6 pple    app         1 counts_col2    50
#>  7 banana  bananna     2 counts_col1   200
#>  8 banana  bananna     2 counts_col2     2
#>  9 banana  banan       2 counts_col1   200
#> 10 banana  banan       2 counts_col2    20
#> 11 bananna banan       2 counts_col1     2
#> 12 bananna banan       2 counts_col2    20

Created on 2022-03-16 by the reprex package (v2.0.1)

...end up in something like this


#>    col1    col2       id strings     value    sum_distinct
#>    <chr>   <chr>   <dbl> <chr>       <dbl>      
#>  1 apple   pple        1 counts_col1   100    152
#>  2 apple   pple        1 counts_col2     2    NA
#>  3 apple   app         1 counts_col1   100    NA
#>  4 apple   app         1 counts_col2    50    NA
#>  5 pple    app         1 counts_col1     2    NA
#>  6 pple    app         1 counts_col2    50    NA
#>  7 banana  bananna     2 counts_col1   200    222
#>  8 banana  bananna     2 counts_col2     2    NA
#>  9 banana  banan       2 counts_col1   200    NA
#> 10 banana  banan       2 counts_col2    20    NA
#> 11 bananna banan       2 counts_col1     2    NA
#> 12 bananna banan       2 counts_col2    20    NA

CodePudding user response:

We may use replace with unique

library(dplyr)
library(tidyr)
df %>% 
  pivot_longer(c(counts_col1:counts_col2), 
      names_to ="strings",values_to = "value") %>% 
  group_by(id,col1,col2) %>%    
  group_by(id) %>%
  mutate(sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))) %>%
  ungroup

-output

# A tibble: 12 × 6
   col1    col2       id strings     value sum_distinct
   <chr>   <chr>   <dbl> <chr>       <dbl>        <dbl>
 1 apple   pple        1 counts_col1   100          152
 2 apple   pple        1 counts_col2     2           NA
 3 apple   app         1 counts_col1   100           NA
 4 apple   app         1 counts_col2    50           NA
 5 pple    app         1 counts_col1     2           NA
 6 pple    app         1 counts_col2    50           NA
 7 banana  bananna     2 counts_col1   200          222
 8 banana  bananna     2 counts_col2     2           NA
 9 banana  banan       2 counts_col1   200           NA
10 banana  banan       2 counts_col2    20           NA
11 bananna banan       2 counts_col1     2           NA
12 bananna banan       2 counts_col2    20           NA

CodePudding user response:

Using data.table, you could do:

Reprex

  • Code
library(tidyverse) # to read your tibble
library(data.table)

# 1 - Building your starting data.table
df <- melt(setDT(df),
  id.vars = c("col1", "col2", "id"),
  measure.vars = c("counts_col1", "counts_col2"),
  variable.name = "string")[order(id, col1, -col2)]

df
#>        col1    col2 id      string value
#>  1:   apple    pple  1 counts_col1   100
#>  2:   apple    pple  1 counts_col2     2
#>  3:   apple     app  1 counts_col1   100
#>  4:   apple     app  1 counts_col2    50
#>  5:    pple     app  1 counts_col1     2
#>  6:    pple     app  1 counts_col2    50
#>  7:  banana bananna  2 counts_col1   200
#>  8:  banana bananna  2 counts_col2     2
#>  9:  banana   banan  2 counts_col1   200
#> 10:  banana   banan  2 counts_col2    20
#> 11: bananna   banan  2 counts_col1     2
#> 12: bananna   banan  2 counts_col2    20

# 2 - Computing the 'sum_distinct' column
df[, sum_distinct := sum(unique(value)), by = id
   ][!df[, .I[1], by = id]$V1, sum_distinct := NA_integer_][]
  • Output
#>        col1    col2    id      string value sum_distinct
#>      <char>  <char> <num>      <fctr> <num>        <num>
#>  1:   apple    pple     1 counts_col1   100          152
#>  2:   apple    pple     1 counts_col2     2           NA
#>  3:   apple     app     1 counts_col1   100           NA
#>  4:   apple     app     1 counts_col2    50           NA
#>  5:    pple     app     1 counts_col1     2           NA
#>  6:    pple     app     1 counts_col2    50           NA
#>  7:  banana bananna     2 counts_col1   200          222
#>  8:  banana bananna     2 counts_col2     2           NA
#>  9:  banana   banan     2 counts_col1   200           NA
#> 10:  banana   banan     2 counts_col2    20           NA
#> 11: bananna   banan     2 counts_col1     2           NA
#> 12: bananna   banan     2 counts_col2    20           NA

Created on 2022-03-16 by the reprex package (v2.0.1)

  • Related