Home > Blockchain >  Select unique cases of a column from a list of lists with tidyverse (R)
Select unique cases of a column from a list of lists with tidyverse (R)

Time:07-26

In the example below, I select the unique cases of a specific column from a list of lists. I think I have successfully done what I want, but I would like to know whether the datasets can be organised in a tidier way (e.g., without using nested lists to organise the data like this) and whether the same output can be achieved using the tidyverse throughout (e.g., with purrr or similar). As you can see, I already use the tidyverse in part, but I'm interested in seeing whether it's possible to avoid the for loop and the lapply, and perhaps to avoid using lists to organise the data like this.


#Load libraries
library(tidyverse)
library(stringi)

# Generate four example datasets. Each is a 10-row tibble with a random
# numeric column (col1) and a column of random 5-character strings (col2).
# No seed is set here, so the values differ from run to run.
dat1 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat2 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat3 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat4 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))

# Group the datasets into named lists (three of them for this example):
# list1 holds two datasets, list2 and list3 hold one each.
list1 <- list(id1 = dat1, id2 = dat2)
list2 <- list(id1 = dat3)
list3 <- list(id1 = dat4)
# Combine the three lists into a single named list of lists
final_list <- list(dataset1 = list1, dataset2 = list2, dataset3 = list3)

# Select the unique values of col2 from the list of lists.
# The outer for loop visits each inner list (dataset1, dataset2, dataset3);
# inside it, lapply() reduces every tibble to its distinct col2 values.

data <- NULL

for (i in final_list) {
  # distinct(x, col2) returns only col2, so no separate select() is needed
  single_cases <- i %>%
    lapply(function(x) distinct(x, col2)) %>%
    bind_rows()

  # Append this inner list's unique cases to the running result
  data <- rbind(data, single_cases)
}

CodePudding user response:

You could do:

# Flatten the list of lists by one level so that every tibble sits in a
# single flat list. Starting from final_list (instead of spelling out
# c(list1, list2, list3)) keeps this correct when inner lists are added.
data1 <- unlist(final_list, recursive = FALSE) %>%
  # Relabel the tibbles id1, id2, ... so each one has a unique label
  # regardless of how the original lists were named
  setNames(paste0("id", seq_along(.))) %>%
  # Stack the tibbles, recording each row's source label in an "id" column
  bind_rows(.id = "id") %>%
  # De-duplicate col2 within each source tibble (not across tibbles)
  distinct(id, col2) %>%
  # Drop the helper column so that only col2 remains
  select(-id)

Here is the complete reproducible example, showing that your `data` and my `data1` are identical:

library(dplyr)
library(stringi)

# Fix the seed so the randomly generated example data is repeatable
set.seed(123)

# Four example tibbles: 10 rows each, with a random numeric column (col1)
# and a column of random 5-character strings (col2)
dat1 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat2 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat3 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))
dat4 <- tibble(col1 = runif(10), col2 = stri_rand_strings(10, 5))

# Named inner lists: list1 holds two tibbles, list2 and list3 one each
list1 <- list(id1 = dat1, id2 = dat2)
list2 <- list(id1 = dat3)
list3 <- list(id1 = dat4)

# The list of lists that both approaches below operate on
final_list <- list(dataset1 = list1, dataset2 = list2, dataset3 = list3)

# Baseline: the loop-based approach from the question, kept as-is so it can
# serve as the reference in the comparison below
data <- NULL
for (i in final_list) {
  # Within one inner list, reduce each tibble to its distinct col2 values
  # and stack the results
  single_cases <- bind_rows(lapply(i, function(x) x %>% select(col2) %>% distinct(col2)))
  # Append this inner list's unique cases to the running result
  data <- rbind(data, single_cases)  
}

# Loop-free alternative. Flatten the list of lists by one level so that every
# tibble sits in a single flat list; starting from final_list (instead of
# spelling out c(list1, list2, list3)) keeps this correct when inner lists
# are added.
data1 <- unlist(final_list, recursive = FALSE) %>%
  # Relabel the tibbles id1, id2, ... so each one has a unique label
  # regardless of how the original lists were named
  setNames(paste0("id", seq_along(.))) %>%
  # Stack the tibbles, recording each row's source label in an "id" column
  bind_rows(.id = "id") %>%
  # De-duplicate col2 within each source tibble (not across tibbles)
  distinct(id, col2) %>%
  # Drop the helper column so that only col2 remains
  select(-id)

# Confirm that the loop-based result and the pipeline result match exactly
identical(data, data1)
#> [1] TRUE
  •  Tags:  
  • r
  • Related