I would like to:
Read the list of *.txt files from the directory
For every .txt file in my folder, I would like to use the id=NAME value found in the fifth column of each row (e.g., HOX.txt and zinc.txt below)
Determine which family the given file belongs to (e.g cram-2) using a separate lookup table linking id values to a Family value (e.g., Lookup Table below)
Combine/concatenate all the files with the same family (e.g HOX.txt and zinc.txt) into one .txt file.
Save the linked file with the name of the column Family (e.g cram-2.txt).
!! REMINDER !! R will have to process many large .txt files - is a temporary directory needed? !!
Example:
The HOX.txt file rows :
ma reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
se reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
to reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
pa reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
The zinc.txt file rows :
ma reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
se reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
to reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
pa reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
The lookup table :
Name Family
HOX cram-2
zinc cram-2
fire sf.xr
fire ra.XS-2
...continues...
The final output I am trying to obtain:
a) File name = cram-2.txt
b) Concatenate HOX.txt and zinc.txt because both are from Family cram-2!
ma reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
se reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
to reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
pa reg out fim id=HOX;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
ma reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
se reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
to reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
pa reg out fim id=zinc;seq=AGCAGGAAATA;score=12.1915;pval=4.97e-05
My code (below) works on only a few .bed files. I tried using tempfile() and tempdir(), but it still didn't work.
# Lookup table linking each id value (Name) to its Family.
# NOTE(review): `df` is not defined in this snippet; it is assumed to be the
# lookup table, already loaded, with `Name` and `Family` columns.
dfNameFamily = tibble(
  Name = as.character(df$Name),
  Family = as.character(df$Family)
)  # this closing parenthesis was missing, so the script could not be parsed

# Directory that holds the input .txt files.
dir = "~/textfiles"
# List every entry in `dir` whose path ends in ".txt".
TxtFile = function(dir) {
  dir_ls(path = dir, regexp = "\\.txt$")
}
# Read all lines of a text file.
#
# FileName: path of the file to read.
# Returns a character vector with one element per line, or an empty
# character vector when the file does not exist.
readTxt = function(FileName){
  if (!file.exists(FileName)) {
    return(character())
  }
  con = file(FileName, open = "r")
  # Release the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con)
}
# Extract the id value (e.g. "HOX") from the first line of a file.
#
# l: lines of the file (character vector, or anything unlist() can flatten
#    to one); only the first element is inspected.
# Returns the text between "id=" and ";seq", or NA when the first line is
# missing or does not contain that pattern.
GetName = function(l) {
  first = as.character(unlist(l, use.names = FALSE))[1]
  if (is.na(first)) {
    return(NA_character_)
  }
  # The id is preceded by a space in the data rows, so the pattern must not
  # require a leading ";". [^;]+ stops at the first ";" after "id=".
  hit = regmatches(first, regexec("id=([^;]+);seq", first))[[1]]
  # hit is c(<whole match>, <id>) on success and character(0) otherwise,
  # so hit[2] is NA when nothing matched.
  hit[2]
}
# Write all lines of one family group to "<dir>/<name>.txt".
#
# l:    the rows of the group; `l$lines` is a list of character vectors
#       (one per input file) that are concatenated in order.
# name: the group key, either a plain string or a one-row table whose first
#       column holds the family (the form group_walk() passes).
# dir:  output directory.
# Returns NULL invisibly; called for its side effect.
SaveFile = function(l, name, dir){
  family = as.character(name[[1]])
  # The requested output name is "<Family>.txt", so add the extension.
  con = file(file.path(dir, paste0(family, ".txt")), open = "w")
  # Close the connection even when writing fails.
  on.exit(close(con), add = TRUE)
  # as.character() turns the NULL of an empty group into character(0).
  writeLines(as.character(unlist(l$lines)), con)
  invisible(NULL)
}
# Read every .txt file, derive its Name from the file's lines, look up the
# matching Family and hand each Family group to SaveFile.
tibble(FileName = TxtFile(dir)) %>%
  mutate(lines = map(FileName, readTxt)) %>%
  mutate(Name = map_chr(lines, GetName)) %>%
  left_join(dfNameFamily, by = "Name") %>%
  group_by(Family) %>%
  group_walk(SaveFile, dir)
CodePudding user response:
Try this code.
Gather the files to be processed in the bedDir directory. Create a directory bedOut for the output files. It's best to keep everything in one RStudio project in which you create these directories.
Extend the Names and Families vectors according to your needs.
library(data.table)
library(tidyverse)
library(fs)
library(utils)
# Lookup table held as two parallel vectors of equal length.
Names = c(
  "HOX",
  "zinc",
  "fire",
  "fire2"
)
Families = c(
  "cram-2",
  "cram-2",
  "sf.xr",
  "ra.XS-2"
)
# Extract the id value (e.g. "HOX") from the first line of a file.
#
# l: lines of the file as a character vector, or a table/list that unlist()
#    can flatten to one; only the first element is inspected.
# Returns the text between "id=" and ";seq", or NA when the first line is
# missing or does not contain that pattern.
GetName = function(l) {
  first = as.character(unlist(l, use.names = FALSE))[1]
  if (is.na(first)) {
    return(NA_character_)
  }
  # [^;]+ captures the id up to the first ";" (the original capture group
  # had lost its quantifier and could only match a single character + space).
  hit = regmatches(first, regexec("id=([^;]+);seq", first))[[1]]
  # hit is c(<whole match>, <id>) on success and character(0) otherwise,
  # so hit[2] is NA when nothing matched.
  hit[2]
}
# Look up the family for the id found in `l`: every entry of Families whose
# position matches an entry of Names equal to GetName(l).
GetFamily = function(l) {
  id = GetName(l)
  hits = which(Names == id)
  Families[hits]
}
# List every entry in `dir` whose path ends in ".bed".
BedFile = function(dir) {
  dir_ls(path = dir, regexp = "\\.bed$")
}
# Print a one-line timing report such as "Read 3 files (1.5 s)".
#
# txt:        description of what was read.
# start_time: start timestamp, in milliseconds.
# end_time:   end timestamp, in milliseconds.
# Durations of 1000 ms or more are reported in seconds, shorter ones in ms.
info = function(txt, start_time, end_time){
  elapsed = end_time - start_time
  in_seconds = elapsed >= 1000
  shown = if (in_seconds) elapsed / 1000 else elapsed
  suffix = if (in_seconds) " s" else " ms"
  cat(paste0("Read ", txt, " (", round(shown, 2), suffix, ")\n"))
}
# Current system time as a number of milliseconds (Sys.time() in seconds,
# multiplied by 1000).
time_ms = function() {
  seconds = as.numeric(Sys.time())
  seconds * 1000
}
# Build the progress-bar title for position `i` in `files`.
#
# path:  directory name shown in the message.
# files: vector of file paths being processed.
# i:     index used both for the percentage (i / length(files)) and for the
#        file name shown.
pbTitle = function(path, files, i){
  percent = round(i / length(files) * 100, 0)
  paste(
    "[", percent, "%]",
    "files have been read from the", path, "directory.",
    "The file being read:", files[i]
  )
}
# Input directory with the .bed files and output directory for the combined
# per-family files (both relative to the working directory).
bedDir = "bedDir"
bedOut = "bedOut"
files = BedFile(bedDir)
if(length(files)>0){
  # fwrite() cannot create the output directory itself.
  if (!dir.exists(bedOut)) {
    dir.create(bedOut, recursive = TRUE)
  }
  start_time = time_ms()
  # winProgressBar() exists only in the Windows build of R, so the bar is
  # shown there and skipped elsewhere.
  showBar = .Platform$OS.type == "windows"
  if (showBar) {
    pb = winProgressBar(max = length(files), width = 500)
  }
  for(i in seq_along(files)){
    if (showBar) {
      setWinProgressBar(pb, i-1, pbTitle(bedDir, files, i-1))
    }
    # `file =` makes fread() read the file; `text =` would parse the path
    # string itself as the data. sep = "|" is presumably chosen so that each
    # line stays in a single column (assumes the data contain no "|").
    lines = fread(file = as.character(files[i]), sep = "|", header=FALSE)
    # First line of the file as a plain string (NA for an empty file).
    firstLine = if (length(lines) > 0) as.character(lines[[1]][1]) else NA_character_
    Family = GetFamily(firstLine)
    # Skip files whose id is missing, unknown or mapped to several families
    # instead of writing them to a malformed file name.
    if (length(Family) != 1 || is.na(Family)) {
      warning("No unique family found for ", files[i], "; file skipped.",
              call. = FALSE)
      next
    }
    # NOTE(review): ".bad" looks like a typo for ".bed" - confirm the
    # intended output extension.
    fileName = paste0(bedOut, "/", Family, ".bad")
    # col.names = FALSE keeps the "V1" header out of the output and
    # quote = FALSE writes every line back verbatim.
    fwrite(lines, fileName, append = file_exists(fileName),
           col.names = FALSE, quote = FALSE)
  }
  if (showBar) {
    close(pb)
  }
  info(paste(length(files), "files"), start_time, time_ms())
}
P.S.
Note that I used fread and fwrite from the data.table package here; they are the fastest functions I know of for reading and writing text files.
Let me know if that's it and how fast it works.