I am quite puzzled. While I know how to count the sum of distinct_values per group with the n_distinct(), its seems challenging to me at the moment to find the sum of the unique of the unique observations.
I want to group by id, and then sum each unique value and.....
library(tidyverse)
df <- tibble(col1 = c("apple","apple","pple", "banana", "banana","bananna"),
col2 = c("pple","app","app", "bananna", "banan", "banan"),
counts_col1 = c(100,100,2,200,200,2),
counts_col2 = c(2,50,50,2,20,20),
id=c(1,1,1,2,2,2))
df1 <- df %>%
pivot_longer(c(counts_col1:counts_col2),names_to ="strings",values_to = "value") %>%
group_by(id,col1,col2) %>%
ungroup() %>%
group_by(id)
df1
#> # A tibble: 12 × 5
#> # Groups: id [2]
#> col1 col2 id strings value
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100
#> 2 apple pple 1 counts_col2 2
#> 3 apple app 1 counts_col1 100
#> 4 apple app 1 counts_col2 50
#> 5 pple app 1 counts_col1 2
#> 6 pple app 1 counts_col2 50
#> 7 banana bananna 2 counts_col1 200
#> 8 banana bananna 2 counts_col2 2
#> 9 banana banan 2 counts_col1 200
#> 10 banana banan 2 counts_col2 20
#> 11 bananna banan 2 counts_col1 2
#> 12 bananna banan 2 counts_col2 20
Created on 2022-03-16 by the reprex package (v2.0.1)
...end up in something like this
#> col1 col2 id strings value sum_distinct
#> <chr> <chr> <dbl> <chr> <dbl>
#> 1 apple pple 1 counts_col1 100 152
#> 2 apple pple 1 counts_col2 2 NA
#> 3 apple app 1 counts_col1 100 NA
#> 4 apple app 1 counts_col2 50 NA
#> 5 pple app 1 counts_col1 2 NA
#> 6 pple app 1 counts_col2 50 NA
#> 7 banana bananna 2 counts_col1 200 222
#> 8 banana bananna 2 counts_col2 2 NA
#> 9 banana banan 2 counts_col1 200 NA
#> 10 banana banan 2 counts_col2 20 NA
#> 11 bananna banan 2 counts_col1 2 NA
#> 12 bananna banan 2 counts_col2 20 NA
CodePudding user response:
We may use replace
with unique
library(dplyr)
library(tidyr)
df %>%
pivot_longer(c(counts_col1:counts_col2),
names_to ="strings",values_to = "value") %>%
group_by(id,col1,col2) %>%
group_by(id) %>%
mutate(sum_distinct = replace(rep(NA_real_, n()), 1, sum(unique(value)))) %>%
ungroup
-output
# A tibble: 12 × 6
col1 col2 id strings value sum_distinct
<chr> <chr> <dbl> <chr> <dbl> <dbl>
1 apple pple 1 counts_col1 100 152
2 apple pple 1 counts_col2 2 NA
3 apple app 1 counts_col1 100 NA
4 apple app 1 counts_col2 50 NA
5 pple app 1 counts_col1 2 NA
6 pple app 1 counts_col2 50 NA
7 banana bananna 2 counts_col1 200 222
8 banana bananna 2 counts_col2 2 NA
9 banana banan 2 counts_col1 200 NA
10 banana banan 2 counts_col2 20 NA
11 bananna banan 2 counts_col1 2 NA
12 bananna banan 2 counts_col2 20 NA
CodePudding user response:
Using data.table
, you could do:
Reprex
- Code
library(tidyverse) # to read your tibble
library(data.table)
# 1 - Building your starting data.table
df <- melt(setDT(df),
id.vars = c("col1", "col2", "id"),
measure.vars = c("counts_col1", "counts_col2"),
variable.name = "string")[order(id, col1, -col2)]
df
#> col1 col2 id string value
#> 1: apple pple 1 counts_col1 100
#> 2: apple pple 1 counts_col2 2
#> 3: apple app 1 counts_col1 100
#> 4: apple app 1 counts_col2 50
#> 5: pple app 1 counts_col1 2
#> 6: pple app 1 counts_col2 50
#> 7: banana bananna 2 counts_col1 200
#> 8: banana bananna 2 counts_col2 2
#> 9: banana banan 2 counts_col1 200
#> 10: banana banan 2 counts_col2 20
#> 11: bananna banan 2 counts_col1 2
#> 12: bananna banan 2 counts_col2 20
# 2 - Computing the 'sum_distinct' column
df[, sum_distinct := sum(unique(value)), by = id
][!df[, .I[1], by = id]$V1, sum_distinct := NA_integer_][]
- Output
#> col1 col2 id string value sum_distinct
#> <char> <char> <num> <fctr> <num> <num>
#> 1: apple pple 1 counts_col1 100 152
#> 2: apple pple 1 counts_col2 2 NA
#> 3: apple app 1 counts_col1 100 NA
#> 4: apple app 1 counts_col2 50 NA
#> 5: pple app 1 counts_col1 2 NA
#> 6: pple app 1 counts_col2 50 NA
#> 7: banana bananna 2 counts_col1 200 222
#> 8: banana bananna 2 counts_col2 2 NA
#> 9: banana banan 2 counts_col1 200 NA
#> 10: banana banan 2 counts_col2 20 NA
#> 11: bananna banan 2 counts_col1 2 NA
#> 12: bananna banan 2 counts_col2 20 NA
Created on 2022-03-16 by the reprex package (v2.0.1)