Create set from python loop output and remove duplicates-CodePudding

I've researched the options already available on stack overflow, but none have helped given my lack of understanding of python. I have the following code, which gives the below output.

I would like to understand how to remove the duplicate lines from the output itself and save these to a file.

code:

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
with open(output.gff, "w") as ofile, open(input.gbk, "r ", encoding="utf-8") as gbkfile:
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start) 1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            ofile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id))

output:

GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']

Expected output:

GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']

Thank you for your help!

CodePudding user response：

You can collect lines that you have already written to a set, and then check that same set on each iteration to make sure you are not writing a duplicate line. This method is likely to require the least amount of changes to your code.

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
written = set()             # create an empty set
with open('output.gff', "w") as ofile, open('input.gbk', "r ", encoding="utf-8") as gbkfile:
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start) 1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            output = "{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id)
            if output not in written:   # if output not in set
                written.add(output)      # add the output to the set
                ofile.write(output)     # then write the line

CodePudding user response：

I've commented all alterations in the code

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
with open(output.gff, "w") as ofile, open(input.gbk, "r ", encoding="utf-8") as gbkfile:
    iterable_to_write = []  #this list will be written into 'ofile'
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start) 1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            # here new items will be added to the list
    #         iterable_to_write.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id))

                thing_to_add = "{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id)

                #if order is important
                if thing_to_add not in iterable_to_write:
                    iterable_to_write.append(thing_to_add)

    
    # iterable_to_write=set(iterable_to_write) #getting rid of dublicates

    #final writing 
    ofile.writelines(iterable_to_write) #it's iterable so there should be 'writelines()' but not 'write()'