How can I apply a complete workflow, much like a for loop, to every data file in my working directory in R?
My workflow has the following components:
# Load libraries
library(tidyr)
library(ggplot2)
library(dplyr)
# Import data: tab-separated file without a header row, so read.table names
# the columns V1, V2, V3, ...
File_1 <- as.data.frame(read.table("file_1.bed", header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = ""))
# Separate one of the columns into 2 new columns (V3 is split on the "|")
filtered_File_1 <- separate(data = File_1, col = V3, into = c("end position", "Methylation"), sep = "\\|")
# Copy of the data with the "NAN" rows dropped (not used further below)
new_File_1 <- filtered_File_1[- grep("NAN", filtered_File_1$Methylation), ]
# Change NAN values to zero
filtered_File_1$Methylation[filtered_File_1$Methylation == "NAN"] <- '0'
# Change values into numeric
filtered_File_1$Methylation <- as.numeric(as.character(filtered_File_1$Methylation))
# Add New column
filtered_File_1$ID <- c("1")
# Take the average of numerical values, one row per distinct value of V1.
# Columns that are not numeric come out as NA (with a warning).
Average <- filtered_File_1 %>%
  group_by(V1) %>%
  summarise(across(everything(), list(mean)))
Essentially, how would I repeat the above process for all 100 data files? Copying and pasting these lines for each file individually is tedious, and it would make the R script very long as well. I know how to use lapply on lists, but what would work for something more complicated that involves importing and processing files? There must be an easier way.
Thanks
CodePudding user response:
If the files fit in working memory, we can use `lapply` to achieve the same as a for-loop or, since the post uses the tidyverse, `purrr::map`. This results in a list of data frames with the summary statistics returned by the function `get_means`.
library(tidyverse)
# Collect every file in the working directory whose name ends in ".bed".
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end; a bare ".bed" would also match e.g. "embedded.txt".
# Each list element is then named after its file, minus the extension.
files <- as.list(list.files(pattern = "\\.bed$")) %>%
  set_names(nm = sub(pattern = "\\.bed$", "", .))
#' Summarise one methylation .bed file.
#'
#' Reads a tab-separated file without a header row, splits column V3 on "|"
#' into `end position` and `Methylation`, recodes "NAN" as 0, converts
#' `Methylation` to numeric and takes the mean of every remaining column
#' within each V1 group.
#'
#' @param x Path to a single file.
#' @return A tibble with one row per distinct value of V1.
get_means <- function(x) {
  # Same import settings as the single-file workflow: with the defaults
  # (any whitespace as separator, quote characters active) fields that
  # contain spaces or quotes would be split or merged incorrectly.
  read.table(x, header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "") %>%
    separate(col = V3, into = c("end position", "Methylation"), sep = "\\|") %>%
    mutate(
      Methylation = ifelse(Methylation == "NAN", "0", Methylation),
      Methylation = as.numeric(Methylation)
    ) %>%
    group_by(V1) %>%
    # Columns that are not numeric (e.g. `end position`, which is still
    # character after separate()) come out as NA with a warning.
    summarise(across(everything(), list(mean)))
}
# Run get_means on every file; the result is a list named like `files`.
# purrr equivalent: map(files, get_means)
lapply(X = files, FUN = get_means)
or in a for-loop:
# Named character vector of all files ending in ".bed". The pattern is a
# regular expression, so the dot is escaped and the match anchored at the
# end; a bare ".bed" would also match names such as "embedded.txt".
files <- list.files(pattern = "\\.bed$") %>%
  set_names(sub(pattern = "\\.bed$", "", .))
# Preallocate the result list (one slot per file) instead of growing it
# inside the loop.
result <- vector(mode = "list", length = length(files)) %>%
  set_names(names(files))
for (i in seq_along(files)) {
  # `[[` extracts the bare path string for this file.
  result[[i]] <- get_means(files[[i]])
}
To write new files of summary statistics to disk directly, say with name filename_stats.csv:
# Named character vector of all files ending in ".bed". The pattern is a
# regular expression, so the dot is escaped and the match anchored at the
# end; a bare ".bed" would also match names such as "embedded.txt".
files <- list.files(pattern = "\\.bed$") %>%
  set_names(sub(pattern = "\\.bed$", "", .))
# One output file per input file: <input name without extension>_stats.csv
new_file_names <- paste0(names(files), "_stats.csv")
for (i in seq_along(files)) {
  # Summarise the i-th file and write the result straight to disk.
  write_csv(get_means(files[[i]]), file = new_file_names[i])
}
Example:
Consider the following reproducible example, using `mtcars` and `iris`, where we create a named list with summary statistics:
# Write two built-in data sets to disk. write.csv() also writes the row
# names, which read.csv() later reads back as a column named X.
write.csv(mtcars, file = "mtcars.csv")
write.csv(iris, file = "iris.csv")
# Named list of all files ending in ".csv". The pattern is a regular
# expression, so the dot is escaped and the match anchored at the end.
files <- as.list(list.files(pattern = "\\.csv$")) %>%
  set_names(nm = sub(pattern = "\\.csv$", "", .))
# Read each file and take the mean of every numeric column.
lapply(files, \(x) {
  read.csv(x) %>%
    summarise(across(where(is.numeric), mean))
})
$iris
X Sepal.Length Sepal.Width Petal.Length Petal.Width
1 75.5 5.843333 3.057333 3.758 1.199333
$mtcars
mpg cyl disp hp drat wt qsec vs am gear carb
1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875 2.8125
I'm not sure what `new_File_1` in the original code is supposed to do, as it is not used.
CodePudding user response:
A snippet of code I've used before in a similar situation:
library(readr)
library(dplyr)
# Folder that holds the input .bed files.
infolder <- "C:\\Users\\name\\in"
# Bare file names (no directory part) of every file ending in ".bed".
csvfiles <- dir(path = infolder, pattern = "\\.bed$")
for (i in csvfiles) {
  # Build the full path instead of changing the working directory with
  # setwd(), so the rest of the session is not affected.
  in_path <- file.path(infolder, i)
  print(in_path)
  #your code...#
  File_i <- read.table(in_path, header = FALSE, sep = "\t", stringsAsFactors = FALSE, quote = "")
  # tidyr is called by namespace because this snippet only attaches readr
  # and dplyr.
  filtered_File_i <- tidyr::separate(data = File_i, col = V3, into = c("end position", "Methylation"), sep = "\\|")
  # Recode "NAN" as zero, then convert the column to numeric.
  filtered_File_i$Methylation[filtered_File_i$Methylation == "NAN"] <- "0"
  filtered_File_i$Methylation <- as.numeric(filtered_File_i$Methylation)
  # Tag every row with the file it came from.
  filtered_File_i$ID <- i
  # Mean of every column within each V1 group; columns that are not numeric
  # come out as NA (with a warning).
  Average <- filtered_File_i %>%
    group_by(V1) %>%
    summarise(across(everything(), list(mean)))
  ### Assign the final value to a new variable named "X_i"
  ### (i is the file name, extension included); see ?assign for more information
  assign(x = paste0("X_", i), value = Average)
}