Home > Enterprise >  Add column from a character vector if missing from df in a list of dfs
Add column from a character vector if missing from df in a list of dfs

Time:08-27

I have a list of dataframes that contain a subset of columns. Excluding the identifier columns (id and n), all possible column names are included in a character vector called colors. I want to make so that if a dataframe, in this list of dataframes, is missing a column from colors, I want to put those missing columns into that dataframe and make those originally-missing columns NA.

This SO post demonstrates how to do this one column at a time, but considering the size of my real dataframe, I ideally would like to do it for all columns at once. I have included a reprex below with a working function that only does one color at a time and a non-working function that contains a loop.

set.seed(123)
library(tidyverse)

n <- c(1:10)
red <- sample(1:10, 10, replace = TRUE)
blue <- sample(1:10, 10, replace = TRUE)
yellow <- sample(1:10, 10, replace = TRUE)
green <- sample(1:10, 10, replace = TRUE)
pink <- sample(1:10, 10, replace = TRUE)

df1 <- data.frame(id = "1", n, red, blue, green)
df2 <- data.frame(id = "2", n, red, blue, green, pink)
df3 <- data.frame(id = "3", n, blue, yellow, green, pink)
df4 <- data.frame(id = "4", n, blue, yellow, pink)
df5 <- data.frame(id = "5", n, green)

lst <- list(df1, df2, df3, df4, df5)

# this works but only does red
red_func <- function(x) {
  if (!'red' %in% names(x)) x <- x %>% add_column(red = NA) 
  else x
}

# failed attempted at loop
color_func <- function(z) {
  colors <- c("red", "blue", "yellow", "green", "pink")
  for (x in colors) {
    if (!x %in% names(z)) z <- z %>% add_column(x = NA) 
    else z 
    }
}

test <- lapply(lst, color_func)
print(test[[1]])
#> NULL

CodePudding user response:

We could use := with !!

color_func <- function(z) {
  colors <- c("red", "blue", "yellow", "green", "pink")
  for (x in colors) {
    if (!x %in% names(z)) z <- z %>% add_column(!!x := NA) 
    else z 
    }
    
    return(z)
}

-testing

> lapply(lst, color_func)
[[1]]
   id  n red blue green yellow pink
1   1  1   3    5    10     NA   NA
2   1  2   3    3     7     NA   NA
3   1  3  10    9     5     NA   NA
4   1  4   2    9     7     NA   NA
5   1  5   6    9     5     NA   NA
6   1  6   5    3     6     NA   NA
7   1  7   4    8     9     NA   NA
8   1  8   6   10     2     NA   NA
9   1  9   9    7     5     NA   NA
10  1 10  10   10     8     NA   NA

[[2]]
   id  n red blue green pink yellow
1   2  1   3    5    10    2     NA
2   2  2   3    3     7    1     NA
3   2  3  10    9     5    9     NA
4   2  4   2    9     7    9     NA
5   2  5   6    9     5    6     NA
6   2  6   5    3     6    5     NA
7   2  7   4    8     9    9     NA
8   2  8   6   10     2   10     NA
9   2  9   9    7     5    4     NA
10  2 10  10   10     8    6     NA

[[3]]
   id  n blue yellow green pink red
1   3  1    5      9    10    2  NA
2   3  2    3      3     7    1  NA
3   3  3    9      4     5    9  NA
4   3  4    9      1     7    9  NA
5   3  5    9      7     5    6  NA
6   3  6    3      5     6    5  NA
7   3  7    8     10     9    9  NA
8   3  8   10      7     2   10  NA
9   3  9    7      9     5    4  NA
10  3 10   10      9     8    6  NA

[[4]]
   id  n blue yellow pink red green
1   4  1    5      9    2  NA    NA
2   4  2    3      3    1  NA    NA
3   4  3    9      4    9  NA    NA
4   4  4    9      1    9  NA    NA
5   4  5    9      7    6  NA    NA
6   4  6    3      5    5  NA    NA
7   4  7    8     10    9  NA    NA
8   4  8   10      7   10  NA    NA
9   4  9    7      9    4  NA    NA
10  4 10   10      9    6  NA    NA

[[5]]
   id  n green red blue yellow pink
1   4  1    10  NA   NA     NA   NA
2   4  2     7  NA   NA     NA   NA
3   4  3     5  NA   NA     NA   NA
4   4  4     7  NA   NA     NA   NA
5   4  5     5  NA   NA     NA   NA
6   4  6     6  NA   NA     NA   NA
7   4  7     9  NA   NA     NA   NA
8   4  8     2  NA   NA     NA   NA
9   4  9     5  NA   NA     NA   NA
10  4 10     8  NA   NA     NA   NA

It can be done more easily, i.e. without a loop

colors <- c("red", "blue", "yellow", "green", "pink")
lapply(lst, function(x) {x[setdiff(colors, names(x))] <- NA; x})

CodePudding user response:

You can use data.table::rbindlist with split()

library(data.table)
split(rbindlist(lst,fill=TRUE), by="id"))

or dplyr::bind_rows() with group_split()

library(dplyr)
group_split(bind_rows(lst), id)

Output:

$`1`
    id  n red blue green pink yellow
 1:  1  1  10    4     8   NA     NA
 2:  1  2   5    5     1   NA     NA
 3:  1  3   5    7     5   NA     NA
 4:  1  4   7   10     2   NA     NA
 5:  1  5   2    5     1   NA     NA
 6:  1  6   5    3     7   NA     NA
 7:  1  7   8    6     2   NA     NA
 8:  1  8   3    1     5   NA     NA
 9:  1  9   4   10     4   NA     NA
10:  1 10   1   10     5   NA     NA

$`2`
    id  n red blue green pink yellow
 1:  2  1  10    4     8    6     NA
 2:  2  2   5    5     1   10     NA
 3:  2  3   5    7     5    4     NA
 4:  2  4   7   10     2    2     NA
 5:  2  5   2    5     1    7     NA
 6:  2  6   5    3     7    8     NA
 7:  2  7   8    6     2    7     NA
 8:  2  8   3    1     5    7     NA
 9:  2  9   4   10     4    9     NA
10:  2 10   1   10     5    8     NA

$`3`
    id  n red blue green pink yellow
 1:  3  1  NA    4     8    6      4
 2:  3  2  NA    5     1   10      6
 3:  3  3  NA    7     5    4     10
 4:  3  4  NA   10     2    2      1
 5:  3  5  NA    5     1    7      1
 6:  3  6  NA    3     7    8      7
 7:  3  7  NA    6     2    7      6
 8:  3  8  NA    1     5    7     10
 9:  3  9  NA   10     4    9      3
10:  3 10  NA   10     5    8      2

$`4`
    id  n red blue green pink yellow
 1:  4  1  NA    4    NA    6      4
 2:  4  2  NA    5    NA   10      6
 3:  4  3  NA    7    NA    4     10
 4:  4  4  NA   10    NA    2      1
 5:  4  5  NA    5    NA    7      1
 6:  4  6  NA    3    NA    8      7
 7:  4  7  NA    6    NA    7      6
 8:  4  8  NA    1    NA    7     10
 9:  4  9  NA   10    NA    9      3
10:  4 10  NA   10    NA    8      2

$`5`
    id  n red blue green pink yellow
 1:  5  1  NA   NA     8   NA     NA
 2:  5  2  NA   NA     1   NA     NA
 3:  5  3  NA   NA     5   NA     NA
 4:  5  4  NA   NA     2   NA     NA
 5:  5  5  NA   NA     1   NA     NA
 6:  5  6  NA   NA     7   NA     NA
 7:  5  7  NA   NA     2   NA     NA
 8:  5  8  NA   NA     5   NA     NA
 9:  5  9  NA   NA     4   NA     NA
10:  5 10  NA   NA     5   NA     NA

CodePudding user response:

What about completely base R approach?:

First, let's create a function that does the job for a single dataframe:

colorize_columns = function(df) {
        
        cols = c("red", "blue", "yellow", "green", "pink")
        cols_df = names(df)[-c(1, 2)] # ignore the id and n columns
        
        if (all(cols %in% cols_df)) { # do nothing if the color names match
                
                return(df)
                
        } else {

                cols_unmatched = cols[!(cols %in% cols_df)] # colors that do not match
                
                for (col in cols_unmatched) {
                        
                                df[[col]] = NA
                                
                }
                
                return(df)
        }
}

Then, we can do:

lapply(lst, colorize_columns)

Output:

[[1]]
   id  n red blue green yellow pink
1   1  1   3    5    10     NA   NA
2   1  2   3    3     7     NA   NA
3   1  3  10    9     5     NA   NA
4   1  4   2    9     7     NA   NA
5   1  5   6    9     5     NA   NA
6   1  6   5    3     6     NA   NA
7   1  7   4    8     9     NA   NA
8   1  8   6   10     2     NA   NA
9   1  9   9    7     5     NA   NA
10  1 10  10   10     8     NA   NA

[[2]]
   id  n red blue green pink yellow
1   2  1   3    5    10    2     NA
2   2  2   3    3     7    1     NA
3   2  3  10    9     5    9     NA
4   2  4   2    9     7    9     NA
5   2  5   6    9     5    6     NA
6   2  6   5    3     6    5     NA
7   2  7   4    8     9    9     NA
8   2  8   6   10     2   10     NA
9   2  9   9    7     5    4     NA
10  2 10  10   10     8    6     NA

[[3]]
   id  n blue yellow green pink red
1   3  1    5      9    10    2  NA
2   3  2    3      3     7    1  NA
3   3  3    9      4     5    9  NA
4   3  4    9      1     7    9  NA
5   3  5    9      7     5    6  NA
6   3  6    3      5     6    5  NA
7   3  7    8     10     9    9  NA
8   3  8   10      7     2   10  NA
9   3  9    7      9     5    4  NA
10  3 10   10      9     8    6  NA

[[4]]
   id  n blue yellow pink red green
1   4  1    5      9    2  NA    NA
2   4  2    3      3    1  NA    NA
3   4  3    9      4    9  NA    NA
4   4  4    9      1    9  NA    NA
5   4  5    9      7    6  NA    NA
6   4  6    3      5    5  NA    NA
7   4  7    8     10    9  NA    NA
8   4  8   10      7   10  NA    NA
9   4  9    7      9    4  NA    NA
10  4 10   10      9    6  NA    NA

[[5]]
   id  n green red blue yellow pink
1   5  1    10  NA   NA     NA   NA
2   5  2     7  NA   NA     NA   NA
3   5  3     5  NA   NA     NA   NA
4   5  4     7  NA   NA     NA   NA
5   5  5     5  NA   NA     NA   NA
6   5  6     6  NA   NA     NA   NA
7   5  7     9  NA   NA     NA   NA
8   5  8     2  NA   NA     NA   NA
9   5  9     5  NA   NA     NA   NA
10  5 10     8  NA   NA     NA   NA
  • Related