How to row bind data frames with same name from different subfolders in working directory-CodePudding

I have set my working directory to a directory containing two folders with three files each, and they have the same names as each other: SFM01_2, SFM05_2, 02M08_2. I want to row bind the files with the same name and bring all three combined data frames into the environment. The code I have tried so far works for the first file name in the list, but stops after that. Here is the code

# Point the session at the folder that holds the two subfolders of CSV files.
setwd("C:/Users/Christopher/Desktop/R_Directory/processing/test")
# recursive = TRUE also lists the files inside the subfolders.
filelist <- list.files(recursive=TRUE)
list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")

# NOTE(review): str_detect(), read_csv() and bind_rows() are presumably from
# stringr, readr and dplyr; no library() call is visible in this snippet.
for (j in list_of_all_sites) {
  # Positions in filelist whose path matches the site name j.
  i_files <- which(str_detect(filelist, j))
  
  for (index in i_files){
    df_index <- read_csv(filelist[index])
    
    # The first matching file starts a fresh df_site; later files are appended.
    # df_site is a single variable, so this restart happens for every site and
    # after the outer loop it only holds the rows of the last site processed.
    if (index == i_files[1]){
      df_site <- df_index
    } else {
      df_site <- bind_rows(df_site, df_index)
    }
    
  }

}

CodePudding user response：

I suggest one of two paths, both predicated on using list.files(.., full.names=TRUE).

First, reproducible setup:

# Reproducible setup: two sibling folders that hold identically named CSV
# files, each filled with a different two-row slice of mtcars.
# showWarnings = FALSE lets the setup be re-run without "already exists"
# warnings from dir.create().
dir.create("dir1", showWarnings = FALSE)
dir.create("dir2", showWarnings = FALSE)
write.csv(mtcars[1:2,], "dir1/SFM01_2.csv", row.names=FALSE)
write.csv(mtcars[3:4,], "dir1/SFM05_2.csv", row.names=FALSE)
write.csv(mtcars[5:6,], "dir1/02M08_2.csv", row.names=FALSE)
write.csv(mtcars[7:8,], "dir2/SFM01_2.csv", row.names=FALSE)
write.csv(mtcars[9:10,], "dir2/SFM05_2.csv", row.names=FALSE)
write.csv(mtcars[11:12,], "dir2/02M08_2.csv", row.names=FALSE)

# Collect every CSV under both folders. full.names = TRUE keeps the folder in
# each path, so the paths can be passed straight to read.csv() and files with
# the same name in different folders stay distinguishable.
files <- list.files(c("dir1", "dir2"), "\\.csv$", full.names = TRUE, recursive = TRUE)
files
# [1] "dir1/02M08_2.csv" "dir1/SFM01_2.csv" "dir1/SFM05_2.csv" "dir2/02M08_2.csv"
# [5] "dir2/SFM01_2.csv" "dir2/SFM05_2.csv"

One Frame To Rule Them All

If all of the files have the same structure (same column names, regardless of the number of rows), then I suggest putting everything in one frame and doing the analysis with grouped operations (e.g., dplyr::group_by or data.table's [, by=]).

# Read every file into a list whose element names are the file paths.
alldat <- lapply(setNames(nm = files), read.csv)
# Tag each frame with the path it was read from, then stack all the frames.
out1 <- do.call(
  rbind,
  lapply(names(alldat), function(nm) transform(alldat[[nm]], filename = nm))
)
rownames(out1) <- NULL # replace the default row names with plain 1..n
out1
#     mpg cyl  disp  hp drat    wt  qsec vs am gear carb         filename
# 1  18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2 dir1/02M08_2.csv
# 2  18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1 dir1/02M08_2.csv
# 3  21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4 dir1/SFM01_2.csv
# 4  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4 dir1/SFM01_2.csv
# 5  22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1 dir1/SFM05_2.csv
# 6  21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1 dir1/SFM05_2.csv
# 7  17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4 dir2/02M08_2.csv
# 8  16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3 dir2/02M08_2.csv
# 9  14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4 dir2/SFM01_2.csv
# 10 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2 dir2/SFM01_2.csv
# 11 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2 dir2/SFM05_2.csv
# 12 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4 dir2/SFM05_2.csv

If the file's base name (the name without the folder) is important, you can add it to the frame as a column as well.

# Preview the file name of every row with the folder part removed.
basename(out1[["filename"]])
#  [1] "02M08_2.csv" "02M08_2.csv" "SFM01_2.csv" "SFM01_2.csv" "SFM05_2.csv" "SFM05_2.csv" "02M08_2.csv" "02M08_2.csv" "SFM01_2.csv"
# [10] "SFM01_2.csv" "SFM05_2.csv" "SFM05_2.csv"
# Store those names as a new column of out1.
out1[["basename"]] <- basename(out1[["filename"]])

Alternatives to the do.call(rbind, ...) are available, use either if you prefer.

# Either line alone rebuilds out1 from alldat; both assign to out1, so run only
# the one you prefer. The element names of alldat (the file paths) end up in
# the "filename" column.
out1 <- dplyr::bind_rows(alldat, .id = "filename")
# NOTE(review): rbindlist() presumably returns a data.table rather than a
# plain data.frame - confirm before mixing it with base-R-only code.
out1 <- data.table::rbindlist(alldat, idcol = "filename")

Split By Name

If they are not the same structure, then let's split them before reading them in.

# Group the paths by file name, ignoring the folder they live in.
split(files, basename(files))
# $`02M08_2.csv`
# [1] "dir1/02M08_2.csv" "dir2/02M08_2.csv"
# $SFM01_2.csv
# [1] "dir1/SFM01_2.csv" "dir2/SFM01_2.csv"
# $SFM05_2.csv
# [1] "dir1/SFM05_2.csv" "dir2/SFM05_2.csv"
# Read and stack one group of same-named files at a time.
lapply(split(files, basename(files)), function(paths) {
  frames <- lapply(setNames(nm = paths), read.csv)
  stacked <- do.call(rbind, frames)
  rownames(stacked) <- NULL
  stacked
})
# $`02M08_2.csv`
#    mpg cyl  disp  hp drat   wt  qsec vs am gear carb
# 1 18.7   8 360.0 175 3.15 3.44 17.02  0  0    3    2
# 2 18.1   6 225.0 105 2.76 3.46 20.22  1  0    3    1
# 3 17.8   6 167.6 123 3.92 3.44 18.90  1  0    4    4
# 4 16.4   8 275.8 180 3.07 4.07 17.40  0  0    3    3
# $SFM01_2.csv
#    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
# 1 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
# 2 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
# 3 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
# 4 24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
# $SFM05_2.csv
#    mpg cyl  disp  hp drat    wt  qsec vs am gear carb
# 1 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
# 2 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
# 3 22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
# 4 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4

CodePudding user response：

Have you tried the {purrr} package? It would look something like this:

library(purrr)
library(readr) # provides read_csv(); attaching purrr alone does not

list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")

# Build one combined data frame per site. set_names() names each list element
# after its site, so a single frame can be retrieved with e.g.
# list_of_all_dataframes[["SFM01_2"]]; positional access still works.
list_of_all_dataframes <- map(
  set_names(list_of_all_sites),
  # list.files() treats the site name as a regular expression and searches
  # the working directory recursively; the matching files are read and
  # row-bound into one frame.
  ~ list.files(pattern = .x, recursive = TRUE) %>%
    map_df(read_csv)
)

CodePudding user response：

It is hard to reproduce your example so I am taking a guess without being able to verify.

In your code, the variable df_site is overwritten on every iteration of the outer for loop. I would therefore expect that, in the end, it holds the combined data for the third (last) site name rather than for the first.

To fix your code, insert the command df_site <- list() before the outer for loop. This initializes df_site as an empty list that can then be populated step by step in the for loop. To achieve this, replace df_site with df_site[[j]] everywhere inside the outer for loop.

# Collect one combined data frame per site in a list keyed by site name.
df_site <- list()
setwd("C:/Users/Christopher/Desktop/R_Directory/processing/test")
filelist <- list.files(recursive = TRUE)
list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")

for (j in list_of_all_sites) {
  # Positions in filelist whose path matches the site name j.
  i_files <- which(str_detect(filelist, j))

  for (index in i_files) {
    df_index <- read_csv(filelist[index])

    # The first file of a site starts its entry; later files are appended.
    df_site[[j]] <- if (index == i_files[1]) {
      df_index
    } else {
      bind_rows(df_site[[j]], df_index)
    }
  }
}

After this, df_site will be a list with three components, each holding the combined contents of the files for one site.