Create set from python loop output and remove duplicates

149 Views Asked by At

I've researched the options already available on stack overflow, but none have helped given my lack of understanding of python. I have the following code, which gives the below output.

I would like to understand how to remove the duplicate lines from the output itself and save these to a file.

code:

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
with open(output.gff, "w") as ofile, open(input.gbk, "r+", encoding="utf-8") as gbkfile:
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start)+1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            ofile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id))

output:

GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']

Expected output:

GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_3'] 2215 2487 ['terpene'] ['Glycosyl transferase family 2'] ['PF00535.26']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_1'] 58 705 ['terpene'] ['Lycopene cyclase protein'] ['PF05834.12']
GL_R68_GL53_UP_2_contig_68150 c00202_GL_R68_.. ['ctg202_2'] 1054 1503 ['terpene'] ['Beta-lactamase'] ['PF00144.24']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_1'] 218 1225 ['terpene'] ['Flavin containing amine oxidoreductase'] ['PF01593.24', 'GO:0016491', 'GO:0055114']
GL_R68_GL53_UP_2_contig_311481 c00750_GL_R68_.. ['ctg750_2'] 1346 2065 ['terpene'] ['Squalene/phytoene synthase'] ['PF00494.19']

Thank you for your help!

2

There are 2 best solutions below

7
Alexander On BEST ANSWER

You can collect lines that you have already written to a set, and then check that same set on each iteration to make sure you are not writing a duplicate line. This method is likely to require the least amount of changes to your code.

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
written = set()             # create an empty set
with open('output.gff', "w") as ofile, open('input.gbk', "r+", encoding="utf-8") as gbkfile:
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start)+1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            output = "{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id)
            if output not in written:   # if output not in set
                written.add(output)      # add the output to the set
                ofile.write(output)     # then write the line
4
Dmitriy Neledva On

I've commented all alterations in the code

product = contig = id = label = start = end = description = db_ref = feature_strand = cds_start = cds_end = 'NA'
with open(output.gff, "w") as ofile, open(input.gbk, "r+", encoding="utf-8") as gbkfile:
    iterable_to_write = []  #this list will be written into 'ofile'
    for seq_record in SeqIO.parse(gbkfile, "genbank"):
        for feature in seq_record.features:
            if feature.type=='region':
                product=feature.qualifiers['product']
            if feature.type=='PFAM_domain':
                contig=seq_record.description
                id=seq_record.id
                label=feature.qualifiers['label']
                start=int(feature.location.start)+1
                end=int(feature.location.end)
                description=feature.qualifiers['description']
                db_ref=feature.qualifiers['db_xref']
            # here new items will be added to the list
    #         iterable_to_write.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id))

                thing_to_add = "{}\t{}\t{}\t{}\t{}\t{}\t{}\tID={}\n".format(contig, label, start, end, product, description, db_ref, id)

                #if order is important
                if thing_to_add not in iterable_to_write:
                    iterable_to_write.append(thing_to_add)

    
    # iterable_to_write=set(iterable_to_write) #getting rid of dublicates

    #final writing 
    ofile.writelines(iterable_to_write) #it's iterable so there should be 'writelines()' but not 'write()'