Handling duplicated entries-CodePudding

I would like to reassign records to a single group if they are duplicated. In the dataset below, I would like all of the 12-4 records to be assigned to either group A or group B, but not both. Is there a way to go about it?

library(tidyverse)

# Example data: record "12-4" is listed under both group A and group B
dat <- tribble(
  ~group, ~assigned,
  "A",    "12-1",
  "A",    "12-2",
  "A",    "12-3",
  "A",    "12-4",
  "B",    "12-4",
  "B",    "12-5",
  "B",    "12-6",
  "B",    "12-7",
  "B",    "12-8"
)

# Attempt to tease out the records of each group into its own column
pivot_wider(dat, names_from = group, values_from = assigned)

CodePudding user response：

You can group by record and reassign all to the same group, chosen at random from the available groups:

# Collapse each duplicated record onto a single group, picked at random from
# the groups that record currently appears in. The pick changes between runs;
# call set.seed() beforehand if a reproducible result is needed.
dat %>%
  group_by(assigned) %>%
  # draw one row position per record directly (no full permutation needed);
  # the chosen group is recycled across all rows of that record
  mutate(group = group[sample.int(n(), 1L)]) %>%
  ungroup()

#> # A tibble: 9 x 2
#>   group assigned
#>   <chr> <chr>   
#> 1 A     12-1    
#> 2 A     12-2    
#> 3 A     12-3    
#> 4 A     12-4    
#> 5 A     12-4    
#> 6 B     12-5    
#> 7 B     12-6    
#> 8 B     12-7    
#> 9 B     12-8

CodePudding user response：

library(tidyverse)

# Example data: four records in group A, five in group B, with "12-4"
# appearing in both groups.
dat <- tibble(
  group = rep(c("A", "B"), times = c(4, 5)),
  assigned = paste0("12-", c(1:4, 4:8))
)

# Give every record exactly one group: when a record appears in several
# groups, all of its rows are moved to the smallest of those groups.
dat %>%
  select(-group) %>%
  left_join(
    # lookup table with one row (and therefore one group) per record
    dat %>%
      # attach each group's size to its rows
      add_count(group) %>%
      # reassign to the smallest group: its rows sort first, so distinct()
      # below keeps that group for a duplicated record
      arrange(n) %>%
      select(-n) %>%
      distinct(assigned, .keep_all = TRUE),
    # explicit key: do not rely on whichever column names happen to match
    by = "assigned"
  )
#> # A tibble: 9 × 2
#>   assigned group
#>   <chr>    <chr>
#> 1 12-1     A    
#> 2 12-2     A    
#> 3 12-3     A    
#> 4 12-4     A    
#> 5 12-4     A    
#> 6 12-5     B    
#> 7 12-6     B    
#> 8 12-7     B    
#> 9 12-8     B

Created on 2022-04-04 by the reprex package (v2.0.0)