How to write a loop that connects a dictionary of Counters to an amino-acid (aa) dictionary
from Bio import SeqIO
from collections import Counter
import pandas as pd
# Collect, for every protein in the FASTA file, its record id and a Counter of
# its residues. `A` and `counter` are parallel lists: counter[i] belongs to
# A[i], so dict(zip(A, counter)) gives an id -> Counter mapping if needed.
df = pd.read_csv("H:\\proteins_15_22535.csv")

A = []        # record ids, in file order
counter = []  # one residue Counter per record, same order as A

# Parse the file once and fill both lists in the same pass. Using a second
# loop with `sequence.id` as the loop target would bind each parsed record to
# that attribute (not the id string) and would append one and the same
# Counter object for every record.
for sequence in SeqIO.parse("H:\\yeast.fasta", "fasta"):
    ID = sequence.id
    protein_seq = sequence.seq
    pcnt = Counter(protein_seq)  # residue letter -> number of occurrences
    A.append(ID)
    counter.append(pcnt)
CodePudding user response:
OK, here is a short example; from it you should be able to see how to calculate the Mw for your sequences. Remember: analysed_seq = ProteinAnalysis(str(sequence.seq))
[lost a couple of minutes here] :
from Bio import SeqIO
#from Bio.SeqRecord import SeqRecord # not needed
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from io import StringIO
# Two-record FASTA text, used as an in-memory stand-in for a FASTA file.
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"
print(sequenz, '\n\n')

# A StringIO built from an initial value starts at position 0, so it can be
# handed to the parser directly.
seq_stringio = StringIO(sequenz)

# For each parsed record, print its id, its sequence and the value returned by
# ProteinAnalysis.molecular_weight() for that sequence.
for record in SeqIO.parse(seq_stringio, "fasta"):
    residues = str(record.seq)
    analysis = ProteinAnalysis(residues)
    print(record.id, record.seq, 'Mw : ', analysis.molecular_weight())
output:
>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH
id_1 PPPGGGTTTWWWQQNN Mw : 1826.9198000000001
id_2 SSSSCCCCSSSPPPMMMHHH Mw : 2136.479499999999
See the examples and explanation of the Biopython ProtParam module
here: https://biopython.org/wiki/ProtParam
CodePudding user response:
ok tried using Counter(), code :
from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord #not necessary
from io import StringIO
from collections import Counter
# Mass per residue, keyed by one-letter amino-acid code.
# NOTE(review): these look like rounded residue masses without the terminal
# water, which would explain totals lower than the value reported by
# ProteinAnalysis.molecular_weight() -- confirm before relying on them.
weights = {'A': 71.04, 'C': 103.01, 'D': 115.03, 'E': 129.04, 'F': 147.07,
           'G': 57.02, 'H': 137.06, 'I': 113.08, 'K': 128.09, 'L': 113.08,
           'M': 131.04, 'N': 114.04, 'P': 97.05, 'Q': 128.06, 'R': 156.10,
           'S': 87.03, 'T': 101.05, 'V': 99.07, 'W': 186.08, 'Y': 163.06}

# Two-record FASTA text, fed to the parser through an in-memory file object.
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"
print(sequenz, '\n\n')
seq_stringio = StringIO()
seq_stringio.write(sequenz)
seq_stringio.seek(0)  # rewind so the parser reads from the beginning

# Map each record id to a Counter of its residues (letter -> occurrences).
dictio = dict()
for sequence in SeqIO.parse(seq_stringio, "fasta"):
    dictio[sequence.id] = Counter(str(sequence.seq))
print(dictio)

# Total mass per record: sum of (residue mass * occurrences) over the letters
# in its Counter. A letter that is missing from `weights` raises KeyError.
for i, counts in dictio.items():
    mw = 0
    for y, z in counts.items():
        # Accumulate; a plain assignment here would keep only the last
        # residue's contribution instead of the total.
        mw += weights[y] * z
    print(i, mw)
output:
>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH
{'id_1': Counter({'P': 3, 'G': 3, 'T': 3, 'W': 3, 'Q': 2, 'N': 2}), 'id_2': Counter({'S': 7, 'C': 4, 'P': 3, 'M': 3, 'H': 3})}
id_1 1807.7999999999997
id_2 2116.7
The same result is obtained using the approach from "The Sum of molecular weight of proteins in python":
from Bio import SeqIO
# from Bio.SeqRecord import SeqRecord #not necessary
from io import StringIO
# Mass per residue, keyed by one-letter amino-acid code.
weights = {
    'A': 71.04, 'C': 103.01, 'D': 115.03, 'E': 129.04,
    'F': 147.07, 'G': 57.02, 'H': 137.06, 'I': 113.08,
    'K': 128.09, 'L': 113.08, 'M': 131.04, 'N': 114.04,
    'P': 97.05, 'Q': 128.06, 'R': 156.10, 'S': 87.03,
    'T': 101.05, 'V': 99.07, 'W': 186.08, 'Y': 163.06,
}

# Two-record FASTA text, used as an in-memory stand-in for a FASTA file.
sequenz = ">id_1\nPPPGGGTTTWWWQQNN\n>id_2\nSSSSCCCCSSSPPPMMMHHH"
print(sequenz, '\n\n')

# A StringIO built from an initial value starts at position 0, so it can be
# handed to the parser directly.
seq_stringio = StringIO(sequenz)

# For each record, look up the mass of every residue in order and print the
# record id, its sequence and the summed mass.
for record in SeqIO.parse(seq_stringio, "fasta"):
    residue_masses = [weights[letter] for letter in record.seq]
    weight = sum(residue_masses)
    print(record.id, record.seq, 'Mw : ', weight)
output :
>id_1
PPPGGGTTTWWWQQNN
>id_2
SSSSCCCCSSSPPPMMMHHH
id_1 PPPGGGTTTWWWQQNN Mw : 1807.7999999999995
id_2 SSSSCCCCSSSPPPMMMHHH Mw : 2116.7