Home > Enterprise >  grep not giving specific results in R
grep not giving specific results in R

Time:12-01

I want to keep only the paths that match certain values ("chr1" "chr11" "chr16" "chr17" "chr2" "chr5" "chr6" "chr7") from a list of paths. However, my results include additional chromosomes (e.g. chr10, chr12) that I did not ask for.

These are the items I want to filter on:

> sort(chrm_to_filter$chr)
 "chr1"  "chr11" "chr16" "chr17" "chr2"  "chr5"  "chr6"  "chr7" 

My data looks something like this

print(path_per_chr_tabix)
"/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr10.bgz"
"/path_to_file/merged_modified_per_base_calling.chr11.bgz"
"/path_to_file/merged_modified_per_base_calling.chr12.bgz"
"/path_to_file/merged_modified_per_base_calling.chr13.bgz"
"/path_to_file/merged_modified_per_base_calling.chr14.bgz"
"/path_to_file/merged_modified_per_base_calling.chr15.bgz"
"/path_to_file/merged_modified_per_base_calling.chr16.bgz"
"/path_to_file/merged_modified_per_base_calling.chr17.bgz"
"/path_to_file/merged_modified_per_base_calling.chr18.bgz"
"/path_to_file/merged_modified_per_base_calling.chr19.bgz"
"/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr3.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr4.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr7.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr8.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr9.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrm.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrX.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrY.bgz" 

#find which data to load to save memory

# Keep only the paths that match any of the requested chromosome names.
# NOTE(review): the names are joined into a single unanchored regex
# alternation ("chr1|chr11|..."), so "chr1" also matches inside "chr10",
# "chr12", etc. -- this is why the printed result contains extra files.
subset_tabix_paths_to_load <- path_per_chr_tabix[
    grep(
        # pattern: sorted chromosome names separated by "|" (regex OR)
        paste0(sort(chrm_to_filter$chr), collapse="|"), 
        path_per_chr_tabix)
    ]
message("these are the files we will be workign with for now- ")
print(subset_tabix_paths_to_load)

"/paths/merged_modified_per_base_calling.chr1.bgz" 
"/paths/merged_modified_per_base_calling.chr10.bgz"
"/paths/merged_modified_per_base_calling.chr11.bgz"
"/paths/merged_modified_per_base_calling.chr12.bgz"
"/paths/merged_modified_per_base_calling.chr13.bgz"
"/paths/merged_modified_per_base_calling.chr14.bgz"
"/paths/merged_modified_per_base_calling.chr15.bgz"
"/paths/merged_modified_per_base_calling.chr16.bgz"
"/paths/merged_modified_per_base_calling.chr17.bgz"
"/paths/merged_modified_per_base_calling.chr18.bgz"
"/paths/merged_modified_per_base_calling.chr19.bgz"
"/paths/merged_modified_per_base_calling.chr2.bgz" 
"/paths/merged_modified_per_base_calling.chr5.bgz" 
"/paths/merged_modified_per_base_calling.chr6.bgz" 
"/paths/merged_modified_per_base_calling.chr7.bgz" 

CodePudding user response:

The problem is that one of your search strings is 'chr1', and grep matches substrings, so it is found in 'chr1', 'chr10', 'chr11', etc. If you change your search strings to 'chr1.bgz', etc., it should work.

CodePudding user response:

Here's an approach using sub and which:

> filter_chr <- c("chr1",  "chr11", "chr16", "chr17", "chr2",  "chr5",  "chr6",  "chr7")
> # Reduce each path to its "chr<digits>" token (the text between the last
> # two dots), then keep only the paths whose token is exactly in filter_chr.
> # "\\d+" (one or more digits) is required: without the quantifier the
> # pattern does not match and sub() returns the path unchanged.
> string[which(sub(".*\\.(chr\\d+)\\..*$", "\\1", string) %in% filter_chr)]
[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz" 

string has the following structure:

# Contents of `string`: the 22 per-chromosome file paths being filtered.
c("/path_to_file/merged_modified_per_base_calling.chr1.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr10.bgz", "/path_to_file/merged_modified_per_base_calling.chr11.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr12.bgz", "/path_to_file/merged_modified_per_base_calling.chr13.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr14.bgz", "/path_to_file/merged_modified_per_base_calling.chr15.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr16.bgz", "/path_to_file/merged_modified_per_base_calling.chr17.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr18.bgz", "/path_to_file/merged_modified_per_base_calling.chr19.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr2.bgz", "/path_to_file/merged_modified_per_base_calling.chr3.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr4.bgz", "/path_to_file/merged_modified_per_base_calling.chr5.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr6.bgz", "/path_to_file/merged_modified_per_base_calling.chr7.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr8.bgz", "/path_to_file/merged_modified_per_base_calling.chr9.bgz", 
"/path_to_file/merged_modified_per_base_calling.chrm.bgz", "/path_to_file/merged_modified_per_base_calling.chrX.bgz", 
"/path_to_file/merged_modified_per_base_calling.chrY.bgz")

CodePudding user response:

# For every requested chromosome, collect the paths that contain the literal
# text "<chr>." (fixed = TRUE, so "." is not treated as a regex wildcard),
# then flatten the per-chromosome results into one character vector.
sort(chrm_to_filter$chr) |>
  lapply(\(chr_name) {
    needle <- paste0(chr_name, ".")
    path_per_chr_tabix[grepl(needle, path_per_chr_tabix, fixed = TRUE)]
  }) |>
  unlist()

#[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
#[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
#[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
#[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
#[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
#[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
#[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
#[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz" 

  • Related