I have set my working directory to a directory containing two folders with three files each, and the files have the same names in both folders: SFM01_2, SFM05_2, 02M08_2. I want to row-bind the files with the same name and bring all three combined data frames into the environment. The code I have tried so far works for the first file name in the list, but stops after that. Here is the code:
# Attempt to build one combined data frame per site by looping over site
# names and row-binding every file whose path matches the name.
setwd("C:/Users/Christopher/Desktop/R_Directory/processing/test")
# Every file found recursively below the working directory.
filelist <- list.files(recursive=TRUE)
# Site identifiers; each one is matched against the entries of filelist.
list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")
for (j in list_of_all_sites) {
# Positions in filelist that match the current site name.
i_files <- which(str_detect(filelist, j))
for (index in i_files){
df_index <- read_csv(filelist[index])
# The first matching file starts df_site; later files are appended to it.
# NOTE: df_site is a single variable that is restarted for each site, so
# after the outer loop it holds only the most recently processed site.
if (index == i_files[1]){
df_site <- df_index
} else {
df_site <- bind_rows(df_site, df_index)
}
}
}
CodePudding user response:
I suggest one of two paths, both predicated on using `list.files(.., full.names=TRUE)`.
First, reproducible setup:
# Two directories that will each hold files with the same three names.
dir.create("dir1")
dir.create("dir2")
# Each file receives a distinct two-row slice of mtcars.
write.csv(mtcars[1:2,], "dir1/SFM01_2.csv", row.names=FALSE)
write.csv(mtcars[3:4,], "dir1/SFM05_2.csv", row.names=FALSE)
write.csv(mtcars[5:6,], "dir1/02M08_2.csv", row.names=FALSE)
write.csv(mtcars[7:8,], "dir2/SFM01_2.csv", row.names=FALSE)
write.csv(mtcars[9:10,], "dir2/SFM05_2.csv", row.names=FALSE)
write.csv(mtcars[11:12,], "dir2/02M08_2.csv", row.names=FALSE)
# Collect the CSV paths from both directories, keeping the directory prefix
# so that same-named files stay distinguishable.
files <- list.files(c("dir1", "dir2"), "\\.csv$", full.names = TRUE, recursive = TRUE)
files
# [1] "dir1/02M08_2.csv" "dir1/SFM01_2.csv" "dir1/SFM05_2.csv" "dir2/02M08_2.csv"
# [5] "dir2/SFM01_2.csv" "dir2/SFM05_2.csv"
One Frame To Rule Them All
If all of the files have the same structure (same column names, regardless of the number of rows), then I suggest putting it all in one frame and doing the analysis with grouped operations (e.g., `dplyr::group_by` or data.table's `[, by=]`).
# Read every file into a list whose names are the file paths.
alldat <- lapply(setNames(nm = files), read.csv)
# Tag each frame with the path it was read from, then stack all frames.
# The column is assigned directly rather than via transform(): transform()
# evaluates its arguments inside the data frame, so a CSV that happens to
# contain a column called `nm` would silently supply the wrong values, and a
# header-only (zero-row) CSV would raise a row-count error.
out1 <- do.call(rbind, Map(function(x, nm) {
  x[["filename"]] <- rep(nm, nrow(x))
  x
}, alldat, names(alldat)))
rownames(out1) <- NULL # I dislike the default row names here
out1
# mpg cyl disp hp drat wt qsec vs am gear carb filename
# 1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 dir1/02M08_2.csv
# 2 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 dir1/02M08_2.csv
# 3 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 dir1/SFM01_2.csv
# 4 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 dir1/SFM01_2.csv
# 5 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 dir1/SFM05_2.csv
# 6 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 dir1/SFM05_2.csv
# 7 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 dir2/02M08_2.csv
# 8 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 dir2/02M08_2.csv
# 9 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 dir2/SFM01_2.csv
# 10 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 dir2/SFM01_2.csv
# 11 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 dir2/SFM05_2.csv
# 12 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 dir2/SFM05_2.csv
If the filename "base" name is important, then you can reassign this into the frame as well.
# Strip the directory part of each row's source path, leaving the file name.
basename(out1$filename)
# [1] "02M08_2.csv" "02M08_2.csv" "SFM01_2.csv" "SFM01_2.csv" "SFM05_2.csv" "SFM05_2.csv" "02M08_2.csv" "02M08_2.csv" "SFM01_2.csv"
# [10] "SFM01_2.csv" "SFM05_2.csv" "SFM05_2.csv"
# Keep that file name as its own column so rows can be grouped by it.
out1$basename <- basename(out1$filename)
Alternatives to the `do.call(rbind, ...)` approach are available; use either of the following if you prefer.
# Run only ONE of the two lines below; each overwrites out1.
# Option 1 (dplyr): stack the list and record each element's name (the file
# path) in a "filename" column.
out1 <- dplyr::bind_rows(alldat, .id = "filename")
# Option 2 (data.table): same idea, with the list names stored in "filename".
out1 <- data.table::rbindlist(alldat, idcol = "filename")
Split By Name
If they are not the same structure, then let's split them before reading them in.
# Group the file paths by base name; each element lists every path that
# shares that file name.
split(files, basename(files))
# $`02M08_2.csv`
# [1] "dir1/02M08_2.csv" "dir2/02M08_2.csv"
# $SFM01_2.csv
# [1] "dir1/SFM01_2.csv" "dir2/SFM01_2.csv"
# $SFM05_2.csv
# [1] "dir1/SFM05_2.csv" "dir2/SFM05_2.csv"
# For every base name, read each file that carries it and stack the rows
# into one data frame; the result is a list keyed by base name.
lapply(split(files, basename(files)), function(same_name_paths) {
  frames <- lapply(same_name_paths, read.csv)
  stacked <- do.call(rbind, frames)
  rownames(stacked) <- NULL
  stacked
})
# $`02M08_2.csv`
# mpg cyl disp hp drat wt qsec vs am gear carb
# 1 18.7 8 360.0 175 3.15 3.44 17.02 0 0 3 2
# 2 18.1 6 225.0 105 2.76 3.46 20.22 1 0 3 1
# 3 17.8 6 167.6 123 3.92 3.44 18.90 1 0 4 4
# 4 16.4 8 275.8 180 3.07 4.07 17.40 0 0 3 3
# $SFM01_2.csv
# mpg cyl disp hp drat wt qsec vs am gear carb
# 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# 3 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 4 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# $SFM05_2.csv
# mpg cyl disp hp drat wt qsec vs am gear carb
# 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 2 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# 4 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
CodePudding user response:
Have you tried the {purrr} package? It would be something like this:
library(purrr)
library(readr) # provides read_csv(); attaching purrr alone does not

list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")

# Build one combined data frame per site. set_names() names the input vector
# after itself, so the resulting list is keyed by site and each frame can be
# retrieved with e.g. list_of_all_dataframes[["SFM01_2"]].
list_of_all_dataframes <- map(
  set_names(list_of_all_sites),
  # Find every file whose name matches the site, read each one, and row-bind
  # them into a single data frame.
  ~ list.files(pattern = .x, recursive = TRUE) %>%
    map_df(read_csv)
)
CodePudding user response:
It is hard to reproduce your example so I am taking a guess without being able to verify.
In your code the variable `df_site` gets overwritten in every iteration of the outer for loop. I would therefore expect that in the end it contains the content not of the first site but of the third.
To fix your code, insert the command `df_site <- list()` before the outer for loop. This initializes `df_site` as an empty list that can then be populated step by step in the for loop. To achieve this, replace `df_site` with `df_site[[j]]` everywhere inside the outer for loop.
# Start with an empty list; it receives one element per site, keyed by the
# site name, so results for earlier sites are no longer overwritten.
df_site <- list()
setwd("C:/Users/Christopher/Desktop/R_Directory/processing/test")
# Every file found recursively below the working directory.
filelist <- list.files(recursive=TRUE)
list_of_all_sites <- c("SFM01_2", "SFM05_2", "02M08_2")
for (j in list_of_all_sites) {
# Positions in filelist that match the current site name.
i_files <- which(str_detect(filelist, j))
for (index in i_files){
df_index <- read_csv(filelist[index])
if (index == i_files[1]){
# First file for this site: create the site's entry.
df_site[[j]] <- df_index
} else {
# Later files: append their rows to the site's existing entry.
df_site[[j]] <- bind_rows(df_site[[j]], df_index)
}
}
}
After this, `df_site` will be a list with three components, each holding the combined content of the files for one site.