How to translate multiple strings from a list using a dictionary - CodePudding

I am trying to translate all the open reading frames (ORFs) into amino acid sequences using aa_dict in Python, and I am wondering how I can go over every string in my list and translate each one. I appreciate your help, thank you!

long_ORFs list is something like this:

['ATGAGAGAG', 'AGAGAAGTCT', 'ACACTAGAGAGAGA',......]

and here is my code:

def translate(long_ORFs):
    """Translate a nucleotide sequence into one-letter amino-acid codes.

    This is the attempt from the question, with only the ``+`` operators
    that were lost in formatting restored. ``long_ORFs`` is sliced directly
    in steps of three, so the function translates a single DNA string; it
    does not loop over the individual strings of a list. When
    ``len(long_ORFs)`` is not a multiple of 3 nothing is translated and an
    empty string is returned.
    """
    aa_dict = {
        'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T',
        'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
        'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H',
        'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L',
        'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G',
        'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S',
        'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
        'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'
    }
    aaList = []  # an empty list

    # The length test is applied to the argument as a whole, not to each ORF.
    if len(long_ORFs) % 3 == 0:
        for i in range(0, len(long_ORFs), 3):
            codon = long_ORFs[i:i + 3]
            aaList += aa_dict[codon]

    return ''.join(aaList)

translate(long_ORFs)

This returned an empty string.

CodePudding user response：

It appears that rather than processing the contents of long_ORFs the way you are, you instead want to iterate over long_ORFs and process each item in that list individually. So I think you want something like this:

long_ORFs = ['ATGAGAGAG', 'AGAGAAGTCT', 'ACACTAGAGAGAGA']

def translate(long_ORFs):
    """Translate every complete ORF in ``long_ORFs`` into a protein string.

    Args:
        long_ORFs: iterable of DNA strings made of the letters A, C, G, T.

    Returns:
        A list with one amino-acid string (one-letter codes, ``*`` for a
        stop codon) per ORF whose length is a multiple of 3. ORFs of any
        other length are skipped, so the result may be shorter than the
        input.

    Raises:
        KeyError: if a 3-letter slice is not a key of the codon table.
    """
    aa_dict = {
        'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T',
        'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
        'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H',
        'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L',
        'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G',
        'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S',
        'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
        'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'
    }
    aaList = []  # one translated sequence per accepted ORF

    for orf in long_ORFs:
        # Only sequences that split cleanly into codons are translated.
        if len(orf) % 3 == 0:
            aaList.append(
                ''.join(aa_dict[orf[i:i + 3]] for i in range(0, len(orf), 3))
            )
    return aaList

print(translate(long_ORFs))

Result:

['MRE']

You are only getting a single entry in the resulting list because only one of the three input sequences in long_ORFs has a length that is an exact multiple of three. Since you're testing for that, only that one entry is processed and added to the final output. If you instead move that constraint into the inner loop and simply ignore any trailing characters that don't form a complete group of 3, like this:

for orf in long_ORFs:
    buf = ''
    for i in range(0, len(orf), 3):
        codon = orf[i:i + 3]
        # A trailing slice shorter than 3 bases is not a codon; ignore it.
        if len(codon) == 3:
            buf += aa_dict[codon]
    aaList.append(buf)

Then you get the result:

['MRE', 'REV', 'TLER']

CodePudding user response：

Code:-

def translate(long_ORFs):
    """Translate each ORF whose length is divisible by 3 into amino acids.

    Args:
        long_ORFs: iterable of DNA strings made of the letters A, C, G, T.

    Returns:
        A list of amino-acid strings (one-letter codes, ``*`` for a stop
        codon), one per ORF whose length is a multiple of 3. Other ORFs are
        skipped.

    Raises:
        KeyError: if a 3-letter slice is not a key of the codon table.
    """
    aa_dict = {
        'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T',
        'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
        'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H',
        'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
        'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L',
        'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
        'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G',
        'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
        'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S',
        'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
        'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F',
    }
    aaList = []  # translated sequences, in input order
    for word in long_ORFs:
        # Words that do not split evenly into codons are left out entirely.
        if len(word) % 3 == 0:
            temp = ""
            for i in range(0, len(word), 3):
                codon = word[i:i + 3]
                temp += aa_dict[codon]
            aaList.append(temp)
    return aaList

print(translate(['ATGAGAGAG','AGAGAAGTC','ACACTAGAGAGAGA']))

Output:-

['MRE', 'REV']

These 2 outputs come from the first 2 words; there is no 3rd output because the length of the 3rd word is not divisible by 3.

CodePudding user response：

from itertools import zip_longest

# Codon -> one-letter amino-acid code; '*' marks a stop codon.
aa_dict = {
    'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T', 'ACC': 'T',
    'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S', 'AGG': 'R', 'AGT': 'S',
    'ATA': 'I', 'ATC': 'I', 'ATG': 'M', 'ATT': 'I', 'CAA': 'Q', 'CAC': 'H',
    'CAG': 'Q', 'CAT': 'H', 'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L',
    'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E', 'GAT': 'D',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A', 'GGA': 'G', 'GGC': 'G',
    'GGG': 'G', 'GGT': 'G', 'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'TAA': '*', 'TAC': 'Y', 'TAG': '*', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S',
    'TCG': 'S', 'TCT': 'S', 'TGA': '*', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
    'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'
}


def translate(long_ORFs):
    """Translate each ORF in ``long_ORFs`` into an amino-acid string.

    Every ORF is cut into consecutive 3-character chunks. Chunks found in
    ``aa_dict`` are translated; any other chunk (including an incomplete
    trailing chunk, which is padded with ``'_'``) is silently skipped.

    Args:
        long_ORFs: iterable of DNA strings.

    Returns:
        A list of translated sequences. ORFs that yield no amino acids at
        all are omitted from the list.
    """
    aa_list = []
    for orf in long_ORFs:
        seq = ''
        # Three references to ONE iterator: each zip_longest step pulls the
        # next three characters, producing non-overlapping chunks.
        args = [iter(orf)] * 3
        for chunk in zip_longest(*args, fillvalue='_'):
            codon = ''.join(chunk)
            if codon in aa_dict:
                seq += aa_dict[codon]
        if seq:
            aa_list.append(seq)
    return aa_list


print(translate(['ATGAGAGAG', 'AGAGAAGTCT', 'ACACTAGAGAGAGA']))

Output:

['MRE', 'REV', 'TLER']

The above translate() function uses zip_longest() to split every ORF in long_ORFs into 3-character tuples. Each tuple is joined into a string named codon, translated via the aa_dict dictionary, and concatenated to the seq string to build the sequence. The result is a list containing the translated sequence of each ORF, even if the ORF's length is not divisible by 3 (an incomplete trailing chunk is padded with '_', is not found in aa_dict, and is therefore skipped). If you wish to translate only those ORFs whose length is divisible by 3, you should add a check at the beginning of the first for loop.

The aa_dict is put outside of the function, as it's static and need not be defined every time the function is running (you may pass it as an argument to the function in order to make the function stateless).