I want to keep only the paths that match a given set of values ("chr1", "chr11", "chr16", "chr17", "chr2", "chr5", "chr6", "chr7") from a vector of paths. However, my result also includes additional chromosomes (e.g. chr10, chr12) that I did not ask for.
These are the items I want to filter on:
> sort(chrm_to_filter$chr)
"chr1" "chr11" "chr16" "chr17" "chr2" "chr5" "chr6" "chr7"
My data looks something like this:
print(path_per_chr_tabix)
"/path_to_file/merged_modified_per_base_calling.chr1.bgz"
"/path_to_file/merged_modified_per_base_calling.chr10.bgz"
"/path_to_file/merged_modified_per_base_calling.chr11.bgz"
"/path_to_file/merged_modified_per_base_calling.chr12.bgz"
"/path_to_file/merged_modified_per_base_calling.chr13.bgz"
"/path_to_file/merged_modified_per_base_calling.chr14.bgz"
"/path_to_file/merged_modified_per_base_calling.chr15.bgz"
"/path_to_file/merged_modified_per_base_calling.chr16.bgz"
"/path_to_file/merged_modified_per_base_calling.chr17.bgz"
"/path_to_file/merged_modified_per_base_calling.chr18.bgz"
"/path_to_file/merged_modified_per_base_calling.chr19.bgz"
"/path_to_file/merged_modified_per_base_calling.chr2.bgz"
"/path_to_file/merged_modified_per_base_calling.chr3.bgz"
"/path_to_file/merged_modified_per_base_calling.chr4.bgz"
"/path_to_file/merged_modified_per_base_calling.chr5.bgz"
"/path_to_file/merged_modified_per_base_calling.chr6.bgz"
"/path_to_file/merged_modified_per_base_calling.chr7.bgz"
"/path_to_file/merged_modified_per_base_calling.chr8.bgz"
"/path_to_file/merged_modified_per_base_calling.chr9.bgz"
"/path_to_file/merged_modified_per_base_calling.chrm.bgz"
"/path_to_file/merged_modified_per_base_calling.chrX.bgz"
"/path_to_file/merged_modified_per_base_calling.chrY.bgz"
# Find which files to load, so that only the needed data is read (saves memory).
# The chr values are sorted and joined with "|" into a single regular
# expression, and grep() returns the indices of the paths matching it.
# NOTE(review): the pattern is an unanchored alternation ("chr1|chr11|..."),
# so "chr1" also matches inside "chr10" ... "chr19"; this is why the printed
# result contains more chromosomes than requested.
subset_tabix_paths_to_load <- path_per_chr_tabix[
grep(
paste0(sort(chrm_to_filter$chr), collapse="|"),
path_per_chr_tabix)
]
message("these are the files we will be workign with for now- ")
print(subset_tabix_paths_to_load)
"/paths/merged_modified_per_base_calling.chr1.bgz"
"/paths/merged_modified_per_base_calling.chr10.bgz"
"/paths/merged_modified_per_base_calling.chr11.bgz"
"/paths/merged_modified_per_base_calling.chr12.bgz"
"/paths/merged_modified_per_base_calling.chr13.bgz"
"/paths/merged_modified_per_base_calling.chr14.bgz"
"/paths/merged_modified_per_base_calling.chr15.bgz"
"/paths/merged_modified_per_base_calling.chr16.bgz"
"/paths/merged_modified_per_base_calling.chr17.bgz"
"/paths/merged_modified_per_base_calling.chr18.bgz"
"/paths/merged_modified_per_base_calling.chr19.bgz"
"/paths/merged_modified_per_base_calling.chr2.bgz"
"/paths/merged_modified_per_base_calling.chr5.bgz"
"/paths/merged_modified_per_base_calling.chr6.bgz"
"/paths/merged_modified_per_base_calling.chr7.bgz"
CodePudding user response:
The problem is that one of your search strings is 'chr1', which is found in 'chr1', 'chr10', 'chr11', etc. If you change your search strings to include the file suffix ('chr1.bgz' etc.), it should work, because 'chr10.bgz' does not contain 'chr1.bgz'.
CodePudding user response:
Here's an approach using `sub` and `which`:
# Chromosome labels whose files should be kept.
filter_chr <- c("chr1", "chr11", "chr16", "chr17", "chr2", "chr5", "chr6", "chr7")
# Extract the complete "chr<digits>" token that sits between two dots in each
# path, then keep only the paths whose token is exactly one of filter_chr.
# Comparing the whole token with %in% (instead of substring matching) is what
# stops "chr1" from also selecting "chr10" ... "chr19". Paths without a
# numeric token (chrm, chrX, chrY) are returned unchanged by sub() and
# therefore never equal an entry of filter_chr.
string[which(sub(".*\\.(chr\\d+)\\..*$", "\\1", string) %in% filter_chr)]
[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz"
[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz"
[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz"
[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz"
[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz"
where `string` has the following structure:
# Literal character vector with the 22 example paths (chr1-chr19, chrm, chrX,
# chrY); assign it to `string` to reproduce the call above.
c("/path_to_file/merged_modified_per_base_calling.chr1.bgz",
"/path_to_file/merged_modified_per_base_calling.chr10.bgz", "/path_to_file/merged_modified_per_base_calling.chr11.bgz",
"/path_to_file/merged_modified_per_base_calling.chr12.bgz", "/path_to_file/merged_modified_per_base_calling.chr13.bgz",
"/path_to_file/merged_modified_per_base_calling.chr14.bgz", "/path_to_file/merged_modified_per_base_calling.chr15.bgz",
"/path_to_file/merged_modified_per_base_calling.chr16.bgz", "/path_to_file/merged_modified_per_base_calling.chr17.bgz",
"/path_to_file/merged_modified_per_base_calling.chr18.bgz", "/path_to_file/merged_modified_per_base_calling.chr19.bgz",
"/path_to_file/merged_modified_per_base_calling.chr2.bgz", "/path_to_file/merged_modified_per_base_calling.chr3.bgz",
"/path_to_file/merged_modified_per_base_calling.chr4.bgz", "/path_to_file/merged_modified_per_base_calling.chr5.bgz",
"/path_to_file/merged_modified_per_base_calling.chr6.bgz", "/path_to_file/merged_modified_per_base_calling.chr7.bgz",
"/path_to_file/merged_modified_per_base_calling.chr8.bgz", "/path_to_file/merged_modified_per_base_calling.chr9.bgz",
"/path_to_file/merged_modified_per_base_calling.chrm.bgz", "/path_to_file/merged_modified_per_base_calling.chrX.bgz",
"/path_to_file/merged_modified_per_base_calling.chrY.bgz")
CodePudding user response:
# For every requested chromosome (in sorted order), pick the paths that
# contain the literal text "<chr>." and flatten the per-chromosome results
# into one vector. The trailing dot, matched literally via fixed = TRUE, is
# what keeps "chr1" from also selecting "chr10" ... "chr19".
unlist(
  lapply(
    sort(chrm_to_filter$chr),
    \(chr_label) {
      has_label <- grepl(paste0(chr_label, "."), path_per_chr_tabix, fixed = TRUE)
      path_per_chr_tabix[has_label]
    }
  )
)
#[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz"
#[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
#[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
#[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
#[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz"
#[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz"
#[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz"
#[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz"