Entering edit mode
4 weeks ago
lily
•
0
Hello please could anyone help me I'm trying to use accession numbers that I have in a table like around 3000 named like GCF_036600915.1, GCF_900128725.1. To obtain the cds from NCBI. I am getting errors like no records found or could not access ncbi and match. I'm new to code so am struggling so an help would be appreciated thanks.
Here's my code:
import time
import pandas as pd
from Bio import Entrez, SeqIO
# Set email address
Entrez.email = "email"
# Read the CSV file containing assembly accession numbers
csv_file = "fully_cleaned.csv"
data = pd.read_csv(csv_file)
accession_numbers = data["assembly_accession"]
# Function to fetch nucleotide accession number from assembly accession number
def fetch_nucleotide_accession(assembly_accession):
try:
# Use Entrez to fetch the assembly details
handle = Entrez.efetch(db="assembly", id=assembly_accession, rettype="xml", retmode="text")
records = SeqIO.read(handle, "genbank")
handle.close()
# Extract nucleotide accession number
nucleotide_accessions = []
for record in records['ASSEMBLY']['ASSEMBLY_ANNOTATION']['ASSEMBLY_ACCESSION']:
nucleotide_accessions.append(record)
return nucleotide_accessions
except Exception as e:
print(f"Error occurred while fetching nucleotide accession for {assembly_accession}: {e}")
return []
# Function to fetch CDS from NCBI using nucleotide accession numbers
def fetch_cds(nucleotide_accessions):
for accession in nucleotide_accessions:
try:
handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
record = SeqIO.read(handle, "genbank")
handle.close()
# Define the output file name for this accession's CDS sequences
output_file = f"{accession}_cds.fasta"
# Write each CDS sequence to its own file
with open(output_file, "w") as output_handle:
for i, feature in enumerate(record.features):
if feature.type == "CDS":
cds_seq = feature.extract(record.seq)
SeqIO.write(
SeqIO.SeqRecord(cds_seq, id=f"{accession}_CDS{i + 1}", description="CDS"),
output_handle,
"fasta"
)
print(f"CDS for {accession} is saved to {output_file}")
except Exception as e:
print(f"Error occurred while fetching CDS for {accession}: {e}")
# Loop through each assembly accession number and fetch its corresponding nucleotide accessions and CDS
for acc in accession_numbers:
nucleotide_accessions = fetch_nucleotide_accession(acc)
if nucleotide_accessions:
fetch_cds(nucleotide_accessions)
time.sleep(1)
print("CDS fetching and file creation completed")
It may be simpler to download the CDS file after getting the location of the assembly you are looking at. One way to do this would be to use
EntrezDirect
(LINK).(You will need to replace
ftp
withhttps
).That directory contains the file you need: https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/036/600/915/GCF_036600915.1_ASM3660091v1/GCF_036600915.1_ASM3660091v1_cds_from_genomic.fna.gz
You should be able to reconstruct that URL programmatically.