find common values of a column based on group of another a column of data frame in R-CodePudding

I have data frame like this:

df<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
               target=c("A","A","B","G","H","A","B","B","G","HA","B"))

I would like to have a list of common values from target column within groups of id and also between groups of id variable. The result can look like something like below table:

res<-tibble(id=c("ls1","ls1","ls1","ls2","ls2","ls3","ls5","ls5","ls10","ls10","ls14"),
            target=c("A","A","B","G","H","A","B","B","G","HA","B"),
            withinGroup=c(T,T,F,F,F,F,F,T,T,F,F),
            numberofRepwithinGroup=c(2,2,1,1,1,1,1,2,2,1,1),
            betweenGroups=c(T,T,T,T,F,T,T,T,T,F,T),
            numberofRepbetweenGroups=c(2,2,3,2,0,3,3,3,2,0,3))

Any idea how to do it?

CodePudding user response：

You can do it with a couple of mutate():

library(dplyr)

df |>
  # first group by
  group_by(id, target) |>
  # add the within columns
  mutate(numberofRepwithinGroup = length(target),
         withinGroup            = ifelse(numberofRepwithinGroup > 1,T,F)) |> 
  # second group by
  group_by(target) |>
  # add the between columns
  mutate(numberofRepbetweenGroups = ifelse(n_distinct(id) == 1, 0, n_distinct(id)),
         betweenGroups            = ifelse(numberofRepbetweenGroups  > 0,T,F)) |>
   # reorder columns
  select(id,target, withinGroup, numberofRepwithinGroup, betweenGroups, numberofRepbetweenGroups
  ) |> 
  # remove useless grouping
  ungroup()

# A tibble: 11 x 6
   id    target withinGroup numberofRepwithinGroup betweenGroups numberofRepbetweenGroups
   <chr> <chr>  <lgl>                        <int> <lgl>                            <dbl>
 1 ls1   A      TRUE                             2 TRUE                                 2
 2 ls1   A      TRUE                             2 TRUE                                 2
 3 ls1   B      FALSE                            1 TRUE                                 3
 4 ls2   G      FALSE                            1 TRUE                                 2
 5 ls2   H      FALSE                            1 FALSE                                0
 6 ls3   A      FALSE                            1 TRUE                                 2
 7 ls5   B      TRUE                             2 TRUE                                 3
 8 ls5   B      TRUE                             2 TRUE                                 3
 9 ls10  G      FALSE                            1 TRUE                                 2
10 ls10  HA     FALSE                            1 FALSE                                0
11 ls14  B      FALSE                            1 TRUE                                 3

CodePudding user response：

Here is an option

library(dplyr)
get_reps <- function(x) as.numeric(table(x)[match(x, names(table(x)))] - 1)
df %>%
    group_by(id) %>%
    mutate(
        withinGroup = duplicated(target) | duplicated(target, fromLast = T),
        numberofRepwithinGroup = get_reps(target)) %>%
    ungroup() %>%
    mutate(
        betweenGroups = duplicated(target) | duplicated(target, fromLast = T),
        numberofRepbetweenGroups = get_reps(target))
## A tibble: 11 x 6
#   id    target withinGroup numberofRepwithinGroup betweenGroups numberofRepbetweenGroups
#   <chr> <chr>  <lgl>                        <dbl> <lgl>                            <dbl>
# 1 ls1   A      TRUE                             1 TRUE                                 2
# 2 ls1   A      TRUE                             1 TRUE                                 2
# 3 ls1   B      FALSE                            0 TRUE                                 3
# 4 ls2   G      FALSE                            0 TRUE                                 1
# 5 ls2   H      FALSE                            0 FALSE                                0
# 6 ls3   A      FALSE                            0 TRUE                                 2
# 7 ls5   B      TRUE                             1 TRUE                                 3
# 8 ls5   B      TRUE                             1 TRUE                                 3
# 9 ls10  G      FALSE                            0 TRUE                                 1
#10 ls10  HA     FALSE                            0 FALSE                                0
#11 ls14  B      FALSE                            0 TRUE                                 3