I need help creating a program that calculates the molecular weight of each protein sequence in a FASTA file - CodePudding

How can I write a loop that connects each sequence's Counter dictionary to the amino-acid (aa) dictionary?

from Bio import SeqIO
from collections import Counter 
import pandas as pd
# Table loaded from CSV; nothing in this snippet reads it afterwards.
df = pd.read_csv("H:\\proteins_15_22535.csv")

# Parallel lists filled in a single pass over the FASTA file:
# A[i] is the id of record i and counter[i] holds its per-letter counts.
# They are created once, before the loop, so they are not reset per record.
A = []
counter = []

for sequence in SeqIO.parse("H:\\yeast.fasta", "fasta"):
    ID = sequence.id
    protein_seq = sequence.seq
    # One Counter per record: {residue letter: number of occurrences}.
    pcnt = Counter(protein_seq)
    A.append(ID)
    counter.append(pcnt)

CodePudding user response：

OK, here is a short example; from it you should be able to see how to calculate the Mw for your sequences. Remember to convert the sequence to a string first: analysed_seq = ProteinAnalysis(str(sequence.seq)) [I lost a couple of minutes on this]:

from Bio import SeqIO
#from Bio.SeqRecord import SeqRecord # not needed

from Bio.SeqUtils.ProtParam import ProteinAnalysis 

from io import StringIO



# Two-record FASTA text used as in-memory demo input.
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"

# Echo the raw FASTA text, followed by blank lines.
print(sequenz, '\n\n')

# File-like buffer pre-loaded with the FASTA text; its read position starts
# at the beginning, so it can be handed straight to a parser.
seq_stringio = StringIO(sequenz)



# For every FASTA record, let ProteinAnalysis compute the molecular weight
# and print it next to the record id and sequence.
for record in SeqIO.parse(seq_stringio, "fasta"):
    # ProteinAnalysis is given the sequence as a plain string.
    residues = str(record.seq)
    mol_weight = ProteinAnalysis(residues).molecular_weight()
    print(record.id, record.seq, 'Mw : ', mol_weight)

output:

>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH 


id_1 PPPGGGTTTWWWQQNN Mw :  1826.9198000000001
id_2 SSSSCCCCSSSPPPMMMHHH Mw :  2136.479499999999

See the examples and explanation of the Biopython ProtParam module here: https://biopython.org/wiki/ProtParam

CodePudding user response：

OK, I tried using Counter(); code:

from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord #not necessary

from io import StringIO

from collections import Counter

# Mass contributed by each residue, keyed by one-letter amino-acid code.
# NOTE(review): the values look like monoisotopic residue masses in Da,
# rounded to two decimals (no terminal water) — confirm against the source.
weights = dict(
    A=71.04, C=103.01, D=115.03, E=129.04,
    F=147.07, G=57.02, H=137.06, I=113.08,
    K=128.09, L=113.08, M=131.04, N=114.04,
    P=97.05, Q=128.06, R=156.10, S=87.03,
    T=101.05, V=99.07, W=186.08, Y=163.06,
)


# Demo input: FASTA text containing two short protein records.
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"

# Show the raw text before parsing it.
print(sequenz, '\n\n')

# Wrap the text in a file-like object; a StringIO built from an initial
# value is already positioned at offset 0, ready to be read.
seq_stringio = StringIO(sequenz)


# Residue counts per record: {record id: Counter({residue letter: count})}.
dictio = {}

for sequence in SeqIO.parse(seq_stringio, "fasta"):
    # Counter tallies the letters of the sequence text directly.
    dictio[sequence.id] = Counter(str(sequence.seq))

print(dictio)

# Weight of a record = sum over its residue letters of (mass * occurrences).
for seq_id, counts in dictio.items():
    mw = 0
    for residue, count in counts.items():
        # Accumulate with +=; a plain assignment here would keep only the
        # contribution of the last residue letter visited.
        # A letter missing from `weights` raises KeyError.
        mw += weights[residue] * count
    print(seq_id, mw)

output:

>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH 


{'id_1': Counter({'P': 3, 'G': 3, 'T': 3, 'W': 3, 'Q': 2, 'N': 2}), 'id_2': Counter({'S': 7, 'C': 4, 'P': 3, 'M': 3, 'H': 3})}
id_1 1807.7999999999997
id_2 2116.7

The same result is obtained using the approach from "The Sum of molecular weight of proteins in python":

from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord #not necessary


from io import StringIO


# Lookup table: one-letter amino-acid code -> mass added by that residue.
# NOTE(review): presumably monoisotopic residue masses in Da, rounded to two
# decimals — verify against the original data source.
weights = {
    'A': 71.04,
    'C': 103.01,
    'D': 115.03,
    'E': 129.04,
    'F': 147.07,
    'G': 57.02,
    'H': 137.06,
    'I': 113.08,
    'K': 128.09,
    'L': 113.08,
    'M': 131.04,
    'N': 114.04,
    'P': 97.05,
    'Q': 128.06,
    'R': 156.10,
    'S': 87.03,
    'T': 101.05,
    'V': 99.07,
    'W': 186.08,
    'Y': 163.06,
}


# In-memory FASTA sample with two records (id_1 and id_2).
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"

# Print the sample text as-is, then some blank lines.
print(sequenz, '\n\n')

# Readable text buffer holding the sample; it starts at position 0, so the
# parser sees the text from its first character.
seq_stringio = StringIO(sequenz)

  
# Walk the FASTA records and print each one's weight, obtained by looking up
# every residue letter in `weights` and adding the values together.
for record in SeqIO.parse(seq_stringio, "fasta"):
    # One mass per residue, in sequence order.
    masses = [weights[letter] for letter in record.seq]
    weight = sum(masses)
    print(record.id, record.seq, 'Mw : ', weight)

output :

>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH 


id_1 PPPGGGTTTWWWQQNN Mw :  1807.7999999999995
id_2 SSSSCCCCSSSPPPMMMHHH Mw :  2116.7