Both degeneracy1 and protein_ls are not being reassigned in the nested while loops I am using, I can't figure out why this. This program is designed to find the best protein motif to create an oligo for genetic engineering. Both degeneracy1 and protein_ls are listed near the bottom of the python code.
import itertools
from collections import Counter
degen = {
"A": 4,"R": 6,"N": 2,"D": 2,"C": 2,
"E": 2,"Q": 2,"G": 2,"H": 2,"I": 3,
"L": 6,"K": 2,"M": 1,"F": 2,"P": 4,
"S": 6,"T": 4,"W": 1, "Y": 2,
"V": 4}
d= {
'A': ['GCA', 'GCC', 'GCG', 'GCT'],
'C': ['TGC', 'TGT'],
'D': ['GAC', 'GAT'],
'E': ['GAA', 'GAG'],
'F': ['TTC', 'TTT'],
'G': ['GGA', 'GGC', 'GGG', 'GGT'],
'H': ['CAC', 'CAT'],
'I': ['ATA', 'ATC', 'ATT'],
'K': ['AAA', 'AAG'],
'L': ['CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'],
'M': ['ATG'],
'N': ['AAC', 'AAT'],
'P': ['CCA', 'CCC', 'CCG', 'CCT'],
'Q': ['CAA', 'CAG'],
'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'],
'S': ['AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'],
'T': ['ACA', 'ACC', 'ACG', 'ACT'],
'V': ['GTA', 'GTC', 'GTG', 'GTT'],
'W': ['TGG'],
'Y': ['TAC', 'TAT'],
'_': ['TAA', 'TAG', 'TGA'],
}
def generator(protein):
l = [d[aa] for aa in protein]
for comb in itertools.product(*l):
yield "".join(comb)
if __name__ == '__main__':
import sys
sequence= 'MQLRNPELHLGCALALRFLALVSWDIPGARALDNGLARTPTMGWLHWERFMCNLDCQEEPDSCISEKLFMEMAELMVSEGWKDAGYEYLCIDDCWMAPQRDSEGRLQADPQRFPHGIRQLANYVHSKGLKLGIYADVGNKTCAGFPGSFGYYDIDAQTFADWGVDLLKFDGCYCDSLENLADGYKHMSLALNRTGRSIVYSCEWPLYMWPFQKPNYTEIRQYCNHWRNFADIDDSWKSIKSILDWTSFNQERIVDVAGPGGWNDPDMLVIGNFGLSWNQQVTQMALWAIMAAPLFMSNDLRHISPQAKALLQDKDVIAINQDPLGKQGYQLRQGDNFEVWERPLSGLAWAVAMINRQEIGGPRSYTIAVASLGKGVACNPACFITQLLPVKRKLGFYEWTSRLRSHINPTGTVLLQLENTMQMSLKDLL'
GCminbound= 35
GCmaxbound= 65
print('best sequences are as follows')
ls_seq=list(sequence)
degeneracy1 = 10000000000
degeneracy = 0
i=0
while i<len(ls_seq):
degeneracy=0
j=0
ls1_seq=ls_seq[i:i 7]
if len(ls1_seq)==7:
g = generator(ls1_seq)
GCmax=0
GCmin=100
for rna_seq in g:
seq_list = list(rna_seq)
GCcount=100*(seq_list.count('G') seq_list.count('C'))/len(seq_list)
if GCcount>GCmax:
GCmax=GCcount
elif GCmin>GCcount:
GCmin=GCcount
if GCmin>GCminbound and GCmax<GCmaxbound:
while j<7:
result=degen[ls1_seq[j]]
degeneracy=result degeneracy
j=j 1
if degeneracy<degeneracy1:
degeneracy1 = degeneracy
protein_ls=ls1_seq
i = i 1
print('best degeneracy is: ', degeneracy1)
print('the sequence is', ''.join(protein_ls))
CodePudding user response:
I did some refactoring. Can you try the following code?
import itertools
import re
degen = {"A": 4,"R": 6,"N": 2,"D": 2,"C": 2, "E": 2,"Q": 2,"G": 2,"H": 2,"I": 3, "L": 6,"K": 2,"M": 1,"F": 2,"P": 4, "S": 6,"T": 4,"W": 1, "Y": 2, "V": 4}
d= {'A': ['GCA', 'GCC', 'GCG', 'GCT'], 'C': ['TGC', 'TGT'], 'D': ['GAC', 'GAT'], 'E': ['GAA', 'GAG'], 'F': ['TTC', 'TTT'], 'G': ['GGA', 'GGC', 'GGG', 'GGT'], 'H': ['CAC', 'CAT'], 'I': ['ATA', 'ATC', 'ATT'], 'K': ['AAA', 'AAG'], 'L': ['CTA', 'CTC', 'CTG', 'CTT', 'TTA', 'TTG'], 'M': ['ATG'], 'N': ['AAC', 'AAT'], 'P': ['CCA', 'CCC', 'CCG', 'CCT'], 'Q': ['CAA', 'CAG'], 'R': ['AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGT'], 'S': ['AGC', 'AGT', 'TCA', 'TCC', 'TCG', 'TCT'], 'T': ['ACA', 'ACC', 'ACG', 'ACT'], 'V': ['GTA', 'GTC', 'GTG', 'GTT'], 'W': ['TGG'], 'Y': ['TAC', 'TAT'], '_': ['TAA', 'TAG', 'TGA'],}
def rna_seq_generator(protein):
for comb in itertools.product(*[d[s] for s in protein]):
yield "".join(comb)
if __name__ == '__main__':
print('best sequences are as follows')
sequence= 'MQLRNPELHLGCALALRFLALVSWDIPGARALDNGLARTPTMGWLHWERFMCNLDCQEEPDSCISEKLFMEMAELMVSEGWKDAGYEYLCIDDCWMAPQRDSEGRLQADPQRFPHGIRQLANYVHSKGLKLGIYADVGNKTCAGFPGSFGYYDIDAQTFADWGVDLLKFDGCYCDSLENLADGYKHMSLALNRTGRSIVYSCEWPLYMWPFQKPNYTEIRQYCNHWRNFADIDDSWKSIKSILDWTSFNQERIVDVAGPGGWNDPDMLVIGNFGLSWNQQVTQMALWAIMAAPLFMSNDLRHISPQAKALLQDKDVIAINQDPLGKQGYQLRQGDNFEVWERPLSGLAWAVAMINRQEIGGPRSYTIAVASLGKGVACNPACFITQLLPVKRKLGFYEWTSRLRSHINPTGTVLLQLENTMQMSLKDLL'
n_char_read = 7
GC_min_baound, GC_max_bound = 35, 65
degeneracy_best = 10000000000
seq_protein = ''
for i in range(len(sequence)):
seq_target=sequence[i:i n_char_read]
if len(seq_target)==n_char_read:
GC_min, GC_max = 100, 0
for rna_seq in rna_seq_generator(seq_target):
GC_occupancy = 100 * len(re.findall('G|C', rna_seq)) / len(rna_seq)
GC_min, GC_max = min(GC_min, GC_occupancy), max(GC_max, GC_occupancy)
if GC_min_baound < GC_min and GC_max < GC_max_bound:
degeneracy = sum([degen[c] for c in seq_target])
if degeneracy < degeneracy_best:
degeneracy_best = degeneracy
seq_protein = seq_target
print(f'best degeneracy is: {degeneracy_best}')
print(f'the sequence is: {seq_protein}')