I've essentially created a function which inputs a string, and then adds the integers which appear before each unique letter. (e.g. 21M4D35M, would provide 56M and 4D).
import re
import itertools
# example input: X is "21M4D35M1I84M9S15=92X"
def CIGAR(X):
m = list(map(int, (re.findall(r"(\d )M", X)))) #need to create a loop for these.
d = list(map(int, (re.findall(r"(\d )D", X))))
n = list(map(int, (re.findall(r"(\d )N", X))))
i = list(map(int, (re.findall(r"(\d )I", X))))
s = list(map(int, (re.findall(r"(\d )S", X))))
h = list(map(int, (re.findall(r"(\d )H", X))))
x = list(map(int, (re.findall(r"(\d )X", X))))
equals = list(map(int, (re.findall(r"(\d )=", X))))
query = itertools.chain(m,n,d,i,s,h,x,equals)
reference = itertools.chain(m,n,d,i,s,h,x,equals)
query = m i s equals x #according to samtools, these operations cause the alignment to step along the query sequence
reference = m d n equals x
print(sum(m), "Exact Matches\n",
sum(d), "Deletions\n",
sum(n), "Skipped region from the reference\n",
sum(i), "Insertions\n",
sum(s), "Soft Clippings\n",
sum(h), "Hard Clippings\n",
sum(x), "sequence match\n",
sum(equals), "sequence mismatch\n",
sum(query), "bases in query sequence\n",
sum(reference), "bases in the reference sequence")
However, I'm trying to implement a more efficient way to perform the first part of the function (lines 7-14). I'm trying to do something like, but don't know exactly how to implement this
acronyms = [m,d,i,s,h,x,"="]
for i in acronyms:
i = list(map(int, (re.findall(r"(\d )capitalize[i]", X))))
As Always, any help is greatly appreciated and if something isn't clear let me know!
CodePudding user response:
An efficient solution would be to used collections.Counter
:
def CIGAR(cigar_string):
from collections import Counter
c = Counter()
for n, k in re.findall('(\d )(\D)', cigar_string):
c[k] = int(n)
keys = {'M': 'Exact Matches',
'D': 'Deletions',
'N': 'Skipped region from the reference',
'I': 'Insertions',
'S': 'Soft Clippings',
'H': 'Hard Clippings',
'X': 'sequence match',
'=': 'sequence mismatch',
}
for k in keys:
print(f'{c[k]:>5}: {keys[k]}')
query = sum(c[x] for x in 'MIS=X')
ref = sum(c[x] for x in 'MDN=X')
print(f'{query:>5}: bases in query sequence')
print(f'{ref:>5}: bases in reference sequence')
return c
Example:
>>> CIGAR('21M4D35M')
56: Exact Matches
4: Deletions
0: Skipped region from the reference
0: Insertions
0: Soft Clippings
0: Hard Clippings
0: sequence match
0: sequence mismatch
56: bases in query sequence
60: bases in reference sequence
Counter({'M': 56, 'D': 4})