Downloading genes or Genomes from NCBI

67 Views Asked by At

get gene ids

symbols = ['gyrB', 'atpD', 'lepA']


def get_gene_ids(symbols, taxon):
    """Map gene symbols to NCBI gene IDs for a single taxon.

    Calls ``GeneApi.gene_metadata_by_tax_and_symbol`` and builds a
    ``{symbol: gene_id}`` dict from the returned matches.

    Args:
        symbols: Gene symbols to look up, e.g. ``['gyrB', 'atpD']``.
        taxon: Taxon, passed unchanged to the NCBI Datasets API.

    Returns:
        dict mapping each symbol that was found to its integer gene ID.
        Matches that carry no gene data are skipped instead of raising
        ``AttributeError``.
    """
    api = gene_api.GeneApi()
    gene_metadata = api.gene_metadata_by_tax_and_symbol(symbols, taxon)

    gene_id_dict = {}
    # NOTE(review): presumably a symbol that cannot be resolved for the taxon
    # comes back as a match without a ``gene`` attribute (or the response has
    # no ``genes`` at all) -- confirm against the API model.  ``getattr`` with
    # a default covers both an unset attribute and an explicit ``None``.
    for gene_data in getattr(gene_metadata, "genes", None) or []:
        gene = getattr(gene_data, "gene", None)
        if gene is None:
            continue
        symbol = getattr(gene, "symbol", None)
        gene_id = getattr(gene, "gene_id", None)
        if symbol is None or gene_id is None:
            continue
        gene_id_dict[symbol] = int(gene_id)
    return gene_id_dict

I have written a function to get the gene IDs of a few genes for bacterial taxa. The function fails to retrieve these gene IDs for some of the bacterial species I request, even though the genes can be found for most bacterial taxa with the NCBI web search tools.

For example: taxon = Chryseobacterium indologenes succeeds, but taxon = Pseudomonas putida raises an AttributeError.

1

There is 1 best solution below

0
Umar On

Check the installation, update the library, and set an API key if required.

# Check Installation
pip show ncbi-datasets-pylib

# Install if Needed
pip install ncbi-datasets-pylib
from ncbi.datasets import GeneApi

Finally, try it this way:

from Bio import Entrez

# Replace the placeholder below with your actual email address.
Entrez.email = "[email protected]"
# NCBI Gene IDs to fetch; replace with the IDs you need.  In the sample
# output these three are atpD, lepA and gyrB of Chryseobacterium indologenes.
gene_ids = [56897904, 56899873, 56901385]

def download_gene_sequences(gene_ids):
    """Fetch the text record of each gene ID from the NCBI ``gene`` database.

    NOTE: despite the function name, the sample output for these requests is
    a short gene summary (symbol, description, aliases, ID), not a nucleotide
    sequence.

    Args:
        gene_ids: Iterable of NCBI Gene IDs.

    Returns:
        dict mapping each gene ID to the raw text returned by
        ``Entrez.efetch``.  If any request raises, the error is printed and
        an empty dict is returned (results fetched so far are discarded).
    """
    gene_sequences = {}
    try:
        for gene_id in gene_ids:
            handle = Entrez.efetch(db="gene", id=gene_id, rettype="gb", retmode="text")
            try:
                gene_sequences[gene_id] = handle.read()
            finally:
                # Always release the underlying connection, even if read() fails.
                handle.close()
    except Exception as ex:
        print(f"Error in API call: {ex}")
        return {}
    return gene_sequences

# Fetch the text record for every ID in gene_ids; the result is an empty
# dict if any request raised an exception.
gene_sequences = download_gene_sequences(gene_ids)

# Example usage: print each gene ID followed by the text fetched for it.
for gene_id, sequence in gene_sequences.items():
    print(f"Gene ID {gene_id} Sequence:\n{sequence}")

which gives you the following output:

Gene ID 56897904 Sequence:

1. atpD
F0F1 ATP synthase subunit beta [Chryseobacterium indologenes]
Other Aliases: DY225_RS04435, NCTC10796_00901
Other Designations: F0F1 ATP synthase subunit beta
ID: 56897904


Gene ID 56899873 Sequence:

1. lepA
translation elongation factor 4 [Chryseobacterium indologenes]
Other Aliases: DY225_RS14410, NCTC10796_02952
Other Designations: translation elongation factor 4
ID: 56899873


Gene ID 56901385 Sequence:

1. gyrB
DNA topoisomerase (ATP-hydrolyzing) subunit B [Chryseobacterium indologenes]
Other Aliases: DY225_RS22075, NCTC10796_04522
Other Designations: DNA topoisomerase (ATP-hydrolyzing) subunit B
ID: 56901385

You can also try another way:

from Bio import Entrez

# Set your email address
Entrez.email = "[email protected]"  # Replace with your actual email

# Define symbols and taxon
symbols = ['gyrB', 'atpD', 'lepA']
taxon = "taxon_id"  # Replace with the actual numeric NCBI Taxonomy ID

# Search one symbol at a time.  A single combined search returns IDs in an
# order unrelated to `symbols`, so zipping the two lists would pair symbols
# with the wrong gene IDs.
gene_id_dict = {}
for symbol in symbols:
    # Entrez query syntax: field tags go in square brackets and a numeric
    # taxonomy ID is written with the "txid" prefix.
    query = f"{symbol}[Gene Name] AND txid{taxon}[Organism]"
    handle = Entrez.esearch(db="gene", term=query, retmax=1)
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    # Keep only the first hit; symbols with no hit are left out of the dict.
    gene_ids = record["IdList"]
    if gene_ids:
        gene_id_dict[symbol] = gene_ids[0]

# Process the results
print(gene_id_dict)

or

from ete3 import NCBITaxa
from Bio import Entrez

def get_gene_ids(symbols, taxon):
    """Look up NCBI Gene IDs for gene symbols within a named taxon.

    The taxon name is translated to a taxonomy ID with ete3's ``NCBITaxa``,
    then each symbol is searched in the Entrez ``gene`` database and the
    first hit is kept.

    Args:
        symbols: Gene symbols to search for.
        taxon: Scientific name of the taxon, e.g. ``'Pseudomonas putida'``.

    Returns:
        dict mapping each symbol that produced a hit to its gene ID (a
        string, as returned in the search result's ``IdList``).  Returns an
        empty dict if the taxon name cannot be translated; symbols without
        a hit are reported with ``print`` and omitted.
    """
    ncbi = NCBITaxa()
    name_to_taxids = ncbi.get_name_translator([taxon])

    if not name_to_taxids:
        print(f"Taxon '{taxon}' not found.")
        return {}

    taxon_id = name_to_taxids[taxon][0]

    gene_id_dict = {}

    for symbol in symbols:
        # A numeric taxonomy ID must carry the "txid" prefix to be matched by
        # the [Organism] field; [Gene Name] is the Gene field for symbols.
        term = f"{symbol}[Gene Name] AND txid{taxon_id}[Organism]"
        handle = Entrez.esearch(db="gene", term=term, retmax=1)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even if parsing the response fails.
            handle.close()

        try:
            gene_id_dict[symbol] = record["IdList"][0]
        except (IndexError, KeyError):
            print(f"Gene ID not found for symbol '{symbol}' in taxon '{taxon}'.")

    return gene_id_dict

# Example usage: look up the same three symbols in two different taxa.
symbols = ['GyrB', 'AtpD', 'LepA']
taxon_chryseobacterium = 'Chryseobacterium indologenes'
taxon_pseudomonas = 'Pseudomonas putida'

# Each call returns a {symbol: gene_id} dict; symbols without a hit are omitted.
gene_ids_chryseobacterium = get_gene_ids(symbols, taxon_chryseobacterium)
print(f"Gene IDs for {taxon_chryseobacterium}: {gene_ids_chryseobacterium}")

gene_ids_pseudomonas = get_gene_ids(symbols, taxon_pseudomonas)
print(f"Gene IDs for {taxon_pseudomonas}: {gene_ids_pseudomonas}")