I have data frame like this:
df<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
target=c("A","A","B","G","H","A","B","B","G","HA","B"))
I would like to have a list of common values from target column within groups of id and also between groups of id variable. The result can look like something like below table:
res<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
target=c("A","A","B","G","H","A","B","B","G","HA","B"),
withinGroup=c(T,T,F,F,F,F,F,T,T,F,F),
numberofRepwithinGroup=c(2,2,1,1,1,1,1,2,2,1,1),
betweenGroups=c(T,T,T,T,F,T,T,T,T,F,T),
numberofRepbetweenGroups=c(2,2,3,2,0,3,3,3,2,0,3))
Any idea how to do it?
CodePudding user response:
You can do it with a couple of mutate()
:
library(dplyr)
df |>
# first group by
group_by(id, target) |>
# add the within columns
mutate(numberofRepwithinGroup = length(target),
withinGroup = ifelse(numberofRepwithinGroup > 1,T,F)) |>
# second group by
group_by(target) |>
# add the between columns
mutate(numberofRepbetweenGroups = ifelse(n_distinct(id) == 1, 0, n_distinct(id)),
betweenGroups = ifelse(numberofRepbetweenGroups > 0,T,F)) |>
# reorder columns
select(id,target, withinGroup, numberofRepwithinGroup, betweenGroups, numberofRepbetweenGroups
) |>
# remove useless grouping
ungroup()
# A tibble: 11 x 6
id target withinGroup numberofRepwithinGroup betweenGroups numberofRepbetweenGroups
<chr> <chr> <lgl> <int> <lgl> <dbl>
1 ls1 A TRUE 2 TRUE 2
2 ls1 A TRUE 2 TRUE 2
3 ls1 B FALSE 1 TRUE 3
4 ls2 G FALSE 1 TRUE 2
5 ls2 H FALSE 1 FALSE 0
6 ls3 A FALSE 1 TRUE 2
7 ls5 B TRUE 2 TRUE 3
8 ls5 B TRUE 2 TRUE 3
9 ls10 G FALSE 1 TRUE 2
10 ls10 HA FALSE 1 FALSE 0
11 ls14 B FALSE 1 TRUE 3
CodePudding user response:
Here is an option
library(dplyr)
get_reps <- function(x) as.numeric(table(x)[match(x, names(table(x)))] - 1)
df %>%
group_by(id) %>%
mutate(
withinGroup = duplicated(target) | duplicated(target, fromLast = T),
numberofRepwithinGroup = get_reps(target)) %>%
ungroup() %>%
mutate(
betweenGroups = duplicated(target) | duplicated(target, fromLast = T),
numberofRepbetweenGroups = get_reps(target))
## A tibble: 11 x 6
# id target withinGroup numberofRepwithinGroup betweenGroups numberofRepbetweenGroups
# <chr> <chr> <lgl> <dbl> <lgl> <dbl>
# 1 ls1 A TRUE 1 TRUE 2
# 2 ls1 A TRUE 1 TRUE 2
# 3 ls1 B FALSE 0 TRUE 3
# 4 ls2 G FALSE 0 TRUE 1
# 5 ls2 H FALSE 0 FALSE 0
# 6 ls3 A FALSE 0 TRUE 2
# 7 ls5 B TRUE 1 TRUE 3
# 8 ls5 B TRUE 1 TRUE 3
# 9 ls10 G FALSE 0 TRUE 1
#10 ls10 HA FALSE 0 FALSE 0
#11 ls14 B FALSE 0 TRUE 3