Home > front end >  Combine list of monthly named list of dataframes into one yearly name list of dataframes using purrr
Combine list of monthly named list of dataframes into one yearly name list of dataframes using purrr

Time:03-15

Every month I create a named list of dataframes with the same names.

I would like to use purrr to combine the lists from several months so as to get one list of dataframes, where each dataframe contains the data for all months.

(In the reprex below, the data in the different dataframes is the same for all months; that is obviously not the case in the real data.)

#create some data

# Sample data: one small data frame per method, parsed from inline CSV text.
# header/sep are spelled out explicitly; they are read.csv()'s defaults.

method1 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,1
A,b,3
B,b,8
A,d,11
")

method2 <- read.csv(header = TRUE, sep = ",", text = "rainy_days, count
1,4
3,3
5,8
7,10
")


method3 <- read.csv(header = TRUE, sep = ",", text = "in,out,rslt
A,a,6
A,b,5
B,b,1
A,d,12
")

method4 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,1
C,c,4
")

The monthly named lists

# One named list per month; each element is the data frame of a method that
# was run in that month. Not every method is present in every month.
month1 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month2 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month3 <- list(
  method1 = method1,
  method3 = method3
)
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

# > str(month1)
# List of 3
# $ method1:'data.frame':   4 obs. of  3 variables:
#   ..$ input : chr [1:4] "A" "A" "B" "A"
# ..$ ouput : chr [1:4] "a" "b" "b" "d"
# ..$ result: int [1:4] 1 3 8 11
# $ method2:'data.frame':   4 obs. of  3 variables:
#   ..$ input : chr [1:4] "A" "A" "B" "A"
# ..$ ouput : chr [1:4] "a" "b" "b" "d"
# ..$ result: int [1:4] 4 3 8 10
# $ method3:'data.frame':   4 obs. of  3 variables:
#   ..$ input : chr [1:4] "A" "A" "B" "A"
# ..$ ouput : chr [1:4] "a" "b" "b" "d"
# ..$ result: int [1:4] 6 5 1 12

I would like to replace

# Build one yearly data frame per method by stacking that method's monthly
# data frames.
# Elements are looked up by NAME rather than by position: month3 contains no
# method2, so its positions are shifted (month3[[2]] is method3 and
# month3[[3]] does not exist and would raise "subscript out of bounds").
current_year_method1 <- bind_rows(
  month1[["method1"]], month2[["method1"]],
  month3[["method1"]], month4[["method1"]]
)
current_year_method2 <- bind_rows(
  month1[["method2"]], month2[["method2"]], month4[["method2"]]
)
current_year_method3 <- bind_rows(
  month1[["method3"]], month2[["method3"]],
  month3[["method3"]], month4[["method3"]]
)
current_year_method4 <- bind_rows(month4[["method4"]])

# Collect the yearly data frames in a list keyed by method name.
year_all_data <- list(
  method1 = current_year_method1,
  method2 = current_year_method2,
  method3 = current_year_method3,
  method4 = current_year_method4
)

with a more generic solution starting from a list of months.

One workaround to get there is to flatten the list and then use one of the solutions proposed here

# Pool the data frames of all months into one flat list (the month level is
# dropped), then group entries sharing a method name and row-bind each group.
year_list_flat <- flatten(list(month1, month2, month3, month4))

all_year <- map(
  split(year_list_flat, names(year_list_flat)),
  bind_rows
)

# > str(all_year)
# List of 4
# $ method1:'data.frame':   16 obs. of  3 variables:
#   ..$ input : chr [1:16] "A" "A" "B" "A" ...
# ..$ ouput : chr [1:16] "a" "b" "b" "d" ...
# ..$ result: int [1:16] 1 3 8 11 1 3 8 11 1 3 ...
# $ method2:'data.frame':   12 obs. of  3 variables:
#   ..$ input : chr [1:12] "A" "A" "B" "A" ...
# ..$ ouput : chr [1:12] "a" "b" "b" "d" ...
# ..$ result: int [1:12] 4 3 8 10 4 3 8 10 4 3 ...
# $ method3:'data.frame':   16 obs. of  3 variables:
#   ..$ input : chr [1:16] "A" "A" "B" "A" ...
# ..$ ouput : chr [1:16] "a" "b" "b" "d" ...
# ..$ result: int [1:16] 6 5 1 12 6 5 1 12 6 5 ...
# $ method4:'data.frame':   2 obs. of  3 variables:
#   ..$ input : chr [1:4] "A" "C"
# ..$ ouput : chr [1:4] "a" "c"
# ..$ result: int [1:4] 1 4

but then I lose the possibility to use the name of the monthly file as an indicator variable due to the use of flatten.

Conceptually it is also different because you combine first all lists into a single list and next combine the dataframes.

Is there an elegant way that does not use flatten?

CodePudding user response:

Here's a solution with bind_rows and map(x, bind_rows)

library(tidyverse)
  
# create some data

# Sample data: four per-method data frames sharing the same three columns.
# header/sep are spelled out explicitly; they are read.csv()'s defaults.
method1 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,1
A,b,3
B,b,8
A,d,11
")

method2 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,4
A,b,3
B,b,8
A,d,10
")


method3 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,6
A,b,5
B,b,1
A,d,12
")

method4 <- read.csv(header = TRUE, sep = ",", text = "input,output,result
A,a,1
A,b,2
B,b,3
C,c,4
")

# The monthly named lists: one element per method run in that month.

month1 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month2 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month3 <- list(
  method1 = method1,
  method3 = method3
)
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

# for one of these lists, can bind_rows
# Stack the data frames of a single month. `.id` stores the name of each list
# element in a new column; the elements of a month are methods, so the column
# is named "method" (not "month").
bind_rows(month1, .id = "method")
#>     method input output result
#> 1  method1     A      a      1
#> 2  method1     A      b      3
#> 3  method1     B      b      8
#> 4  method1     A      d     11
#> 5  method2     A      a      4
#> 6  method2     A      b      3
#> 7  method2     B      b      8
#> 8  method2     A      d     10
#> 9  method3     A      a      6
#> 10 method3     A      b      5
#> 11 method3     B      b      1
#> 12 method3     A      d     12

# for a list of lists, bind rows within each element (methods in months), then bind again (months)
# Inner step: stack the methods of each month, recording the method name.
# Outer step: stack the months, recording the month name.
df <- bind_rows(
  map(
    lst(month1, month2, month3, month4), # lst() names elements after the variables
    function(methods) bind_rows(methods, .id = "method")
  ),
  .id = "month"
)
print(df)
#>     month  method input output result
#> 1  month1 method1     A      a      1
#> 2  month1 method1     A      b      3
#> 3  month1 method1     B      b      8
#> 4  month1 method1     A      d     11
#> 5  month1 method2     A      a      4
#> 6  month1 method2     A      b      3
#> 7  month1 method2     B      b      8
#> 8  month1 method2     A      d     10
#> 9  month1 method3     A      a      6
#> 10 month1 method3     A      b      5
#> 11 month1 method3     B      b      1
#> 12 month1 method3     A      d     12
#> 13 month2 method1     A      a      1
#> 14 month2 method1     A      b      3
#> 15 month2 method1     B      b      8
#> 16 month2 method1     A      d     11
#> 17 month2 method2     A      a      4
#> 18 month2 method2     A      b      3
#> 19 month2 method2     B      b      8
#> 20 month2 method2     A      d     10
#> 21 month2 method3     A      a      6
#> 22 month2 method3     A      b      5
#> 23 month2 method3     B      b      1
#> 24 month2 method3     A      d     12
#> 25 month3 method1     A      a      1
#> 26 month3 method1     A      b      3
#> 27 month3 method1     B      b      8
#> 28 month3 method1     A      d     11
#> 29 month3 method3     A      a      6
#> 30 month3 method3     A      b      5
#> 31 month3 method3     B      b      1
#> 32 month3 method3     A      d     12
#> 33 month4 method1     A      a      1
#> 34 month4 method1     A      b      3
#> 35 month4 method1     B      b      8
#> 36 month4 method1     A      d     11
#> 37 month4 method2     A      a      4
#> 38 month4 method2     A      b      3
#> 39 month4 method2     B      b      8
#> 40 month4 method2     A      d     10
#> 41 month4 method3     A      a      6
#> 42 month4 method3     A      b      5
#> 43 month4 method3     B      b      1
#> 44 month4 method3     A      d     12
#> 45 month4 method4     A      a      1
#> 46 month4 method4     A      b      2
#> 47 month4 method4     B      b      3
#> 48 month4 method4     C      c      4

Created on 2022-03-14 by the reprex package (v2.0.1)

CodePudding user response:

If I understood your problem correctly, one possible solution is to use the plyr package in combination with the purrr package. I take this part as given:

# Monthly named lists, taken as given: one element per method run that month.
month1 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month2 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3
)
month3 <- list(
  method1 = method1,
  method3 = method3
)
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

Which we can combine to a named list (in your monthly run code you would just increment a list item):

# Two-level named list: month names on the outer level, method names inside.
d <- setNames(
  list(month1, month2, month3, month4),
  c("month1", "month2", "month3", "month4")
)

This gives you a two-level-deep named list. It is important for the list to be named after the method on the lowest level and after the month on the highest level for the ldply function to work (the names become the new id field):

# Attach plyr BEFORE dplyr: both export verbs with the same names (e.g.
# mutate, summarise, arrange), and whichever package is attached last wins.
# Attaching plyr last would mask the dplyr versions (plyr warns about this).
library(plyr)
library(dplyr)

# Inner call: use purrr to turn each month's sub-list into one data frame,
# with the method name as a new "method" column.
# Outer call: turn the resulting one-level list into a single data frame,
# with the month name as a new "month" column.
ir <- plyr::ldply(
  purrr::map(d, function(methods) plyr::ldply(methods, .id = "method")),
  .id = "month"
)

    month  method input output result
1  month1 method1     A      a      1
2  month1 method1     A      b      3
3  month1 method1     B      b      8
4  month1 method1     A      d     11
5  month1 method2     A      a      4
6  month1 method2     A      b      3
7  month1 method2     B      b      8
(some lines were omitted)

# Group the combined data frame by method, then split it into a list holding
# one tibble per group.
dplyr::group_split(
  dplyr::group_by(ir, method)
)

[[1]]
# A tibble: 16 x 5
   month  method  input output result
   <fct>  <fct>   <chr> <chr>   <int>
 1 month1 method1 A     a           1
 2 month1 method1 A     b           3
 3 month1 method1 B     b           8
 4 month1 method1 A     d          11
 5 month2 method1 A     a           1
 6 month2 method1 A     b           3
 7 month2 method1 B     b           8
 8 month2 method1 A     d          11
 (lines and further list items omitted)

EDIT

Since the input data was altered from a list of dfs with the same columns to a list of dfs with different columns, here is one approach using the logic shown above. This time, though, we have to transform all columns to character for it to work, meaning the output will be all character as well, but that should be easily solvable given the data structure. The modification to the first version is transforming all dfs to long format, so that the column names become a new column and we have only one column of values. In the end we have to make the data wider again to get the original format. All we need is a little helper to identify df rows, which is the row number:

# Combine data frames whose columns differ between methods.
# 1. For every data frame in every month: cast all columns to character, add
#    a row-number helper column `rn`, and pivot everything except `rn` into
#    long form (pivot_longer's default name/value columns are used below).
# 2. Stack the long data frames of each month (id column "method"), then
#    stack the months (id column "month").
# 3. Split by method and pivot each piece back to wide form, using `name` for
#    the column names and `value` for the cells, then drop the helper `rn`.
# Note: because of step 1, every value column in the result is character.
res <- purrr::map(d, ~purrr::map(.x, ~ dplyr::mutate(.x,
                                                     dplyr::across(dplyr::everything(), ~ as.character(.x)), 
                                                     rn = dplyr::row_number()) %>% 
                                           tidyr::pivot_longer(-rn)) %>%
                                           plyr::ldply(.id = "method")) %>% 
    plyr::ldply(.id = "month") %>%  
    dplyr::group_by(method) %>% 
    dplyr::group_split() %>%
    purrr::map(~ .x %>% 
                     tidyr::pivot_wider(names_from = "name", values_from = "value") %>%
                     dplyr::select(-rn))
# Name the list items according to the unique value of the `method` column in
# each list item's data frame.
names(res) <- purrr::map_chr(res, ~ unique(as.character(.x$method)))

res

$method1
# A tibble: 16 x 5
   month  method  input output result
   <fct>  <fct>   <chr> <chr>  <chr> 
 1 month1 method1 A     a      1     
 2 month1 method1 A     b      3     
 3 month1 method1 B     b      8     
 4 month1 method1 A     d      11    
(many lines and other list items omitted)
  • Related