Home > Enterprise >  Add column to a dataframe depending on presence within a list in R
Add column to a dataframe depending on presence within a list in R

Time:09-17

I have a dataframe such as

GroupsA GroupsB 
G1      G2    
G1      G3     
G1      G4      
G1      G5      
G1      G6      
G2      G3    
G2      G4
G2      G5
G2      G6
G3      G4
G3      G5
G3      G6
G4      G5
G4      G6
G5      G6

And I have 3 lists such as :

list1<-c("G1","G3")
list2<-c("G2")
list3<-c("G4","G5","G6")

and I would like to add a new column to the dataframe by adding the name of the list from which the GroupA and B come from such as

GroupsA GroupsB Lists
G1      G2      list1&list2
G1      G3      list1&list1
G1      G4      list1&list3
G1      G5      list1&list3
G1      G6      list1&list3
G2      G3      list2&list1
G2      G4      list2&list3
G2      G5      list2&list3
G2      G6      list2&list3
G3      G4      list1&list3
G3      G5      list1&list3
G3      G6      list1&list3
G4      G5      list3&list3
G4      G6      list3&list3
G5      G6      list3&list3

Here is the dput format of the df if it can helps :

structure(list(GroupsA = c("G1", "G1", "G1", "G1", "G1", "G2", 
"G2", "G2", "G2", "G3", "G3", "G3", "G4", "G4", "G5"), GroupsB = c("G2", 
"G3", "G4", "G5", "G6", "G3", "G4", "G5", "G6", "G4", "G5", "G6", 
"G5", "G6", "G6")), class = "data.frame", row.names = c(NA, -15L
))

CodePudding user response:

A base R solution can be to obtain the lists from your global environment, stack them in a data frame and match with your original data frame, i.e.

d1 <- stack(mget(grep('list', ls(), value = TRUE)))
df$Lists <- do.call(paste, 
                    c(data.frame(sapply(df, function(i)d1$ind[match(i, d1$values)])), 
                      sep = '&'))


   GroupsA GroupsB       Lists
1       G1      G2 list1&list2
2       G1      G3 list1&list1
3       G1      G4 list1&list3
4       G1      G5 list1&list3
5       G1      G6 list1&list3
6       G2      G3 list2&list1
7       G2      G4 list2&list3
8       G2      G5 list2&list3
9       G2      G6 list2&list3
10      G3      G4 list1&list3
11      G3      G5 list1&list3
12      G3      G6 list1&list3
13      G4      G5 list3&list3
14      G4      G6 list3&list3
15      G5      G6 list3&list3

DATA

df <- structure(list(GroupsA = c("G1", "G1", "G1", "G1", "G1", "G2", 
"G2", "G2", "G2", "G3", "G3", "G3", "G4", "G4", "G5"), GroupsB = c("G2", 
"G3", "G4", "G5", "G6", "G3", "G4", "G5", "G6", "G4", "G5", "G6", 
"G5", "G6", "G6")), class = "data.frame", row.names = c(NA, -15L
))

CodePudding user response:

In tidyverse:

library(tidyverse)
lists <- stack(lst(list1, list2, list3))
bind_cols(df, 
          df %>% 
            map_dfc(~ lists$ind[match(.x, lists$values)]) %>% 
            unite(col = "Lists", sep = "&"))
   GroupsA GroupsB       Lists
1       G1      G2 list1&list2
2       G1      G3 list1&list1
3       G1      G4 list1&list3
4       G1      G5 list1&list3
5       G1      G6 list1&list3
6       G2      G3 list2&list1
7       G2      G4 list2&list3
8       G2      G5 list2&list3
9       G2      G6 list2&list3
10      G3      G4 list1&list3
11      G3      G5 list1&list3
12      G3      G6 list1&list3
13      G4      G5 list3&list3
14      G4      G6 list3&list3
15      G5      G6 list3&list3

CodePudding user response:

With data.table sqldf

library(sqldf)
library(data.table)
X   = list(list1 = list1, list2 = list2, list3 = list3)
dt  = rbindlist(lapply(X, as.data.table), idcol = 'list_from')
sqldf("SELECT x.V1 AS GroupsA, 
              y.V1 AS GroupsB, 
              (x.list_from ||'&'|| y.list_from) AS Lists
       FROM dt AS x CROSS JOIN dt AS y 
       WHERE x.V1 < y.V1")

#    GroupsA GroupsB       Lists
# 1       G1      G3 list1&list1
# 2       G1      G2 list1&list2
# 3       G1      G4 list1&list3
# 4       G1      G5 list1&list3
# 5       G1      G6 list1&list3
# 6       G3      G4 list1&list3
# 7       G3      G5 list1&list3
# 8       G3      G6 list1&list3
# 9       G2      G3 list2&list1
# 10      G2      G4 list2&list3
# 11      G2      G5 list2&list3
# 12      G2      G6 list2&list3
# 13      G4      G5 list3&list3
# 14      G4      G6 list3&list3
# 15      G5      G6 list3&list3

Or just data.table (won't scale well)

out = dt[, CJ(GA = V1, GB = V1)][GA < GB]
out[, froma := dt[.SD, on = .(V1 = GA), list_from]]
out[, fromb := dt[.SD, on = .(V1 = GB), list_from]]
out[, c('froma', 'fromb', 'lists') := .(NULL, NULL, paste(froma, fromb, sep = "&"))]

#         GA     GB       lists
#     <char> <char>      <char>
#  1:     G1     G2 list1&list2
#  2:     G1     G3 list1&list1
#  3:     G1     G4 list1&list3
#  4:     G1     G5 list1&list3
#  5:     G1     G6 list1&list3
#  6:     G2     G3 list2&list1
#  7:     G2     G4 list2&list3
#  8:     G2     G5 list2&list3
#  9:     G2     G6 list2&list3
# 10:     G3     G4 list1&list3
# 11:     G3     G5 list1&list3
# 12:     G3     G6 list1&list3
# 13:     G4     G5 list3&list3
# 14:     G4     G6 list3&list3
# 15:     G5     G6 list3&list3
  • Related