Hello! Per the title, I'm trying to determine the transcript type of each gene across 15 different .csv files. I've been messing around with both chatgpt and copilot due to a lack of personal experience, and as my coder friends are super busy.
So far, I've gotten it to the point where I can query both databases (separate codes, tried successively), but keep running into issues. I've posted the ensembl code below, could anyone help me troubleshoot please? Using VScode & autopep8 at the moment. The code seems to have pasted very weirdly (from VScode), so apologies for that!!
Alternatively, if anyone has a code that works for either API that I can then modify to search for gene symbols from my .csv files?
Issues I'm encountering:
- Gene not found in database, despite the entry being there when I manually navigate to the website (previous version of the code, denoted as #17 in case I forget).
- Current code keeps hanging up on the reiterative request for the first gene. Generates the corresponding files, but they all have the same, much truncated list of genes (around 10 from over 1,000). None of these genes are the first gene queried.
Stuff I've tried:
- Switching to a different API (mygene to ensembl).
- Changing the size of the call to 100 entries (mygene) down to 1 (ensembl).
- Manually rewriting the code in both Notepad and VSCode, and formatting it using autopep8.
- Trying reiterative and non-reiterative approaches to requesting the database (10, 5, 1 request per gene before moving to the next).
Code:
import os
import pandas as pd import requests import json import time
Set the working directory
os.chdir(r"C:\Users\maret\OneDrive - The University of Queensland\PhD QBI\Vaultrc5\Data\Omic Sequencing\Illumina NovaSeq\Results\Galaxy imports and results\IDs annotated")
Verify the current working directory
print("Current working directory:", os.getcwd())
Column name in the CSV that contains gene IDs
gene_id_column = "gene_symbol"
Output folder for results
output_folder = os.path.join(os.getcwd(), "output_results") os.makedirs(output_folder, exist_ok=True)
Log file for API responses
log_file = os.path.join(output_folder, "api_responses.log")
Function to query Ensembl REST API for gene information with retry mechanism
def query_ensembl(gene_ids): url = "https://useast.ensembl.org/lookup/symbol/mus_musculus/" results = [] headers = { 'Content-Type': 'application/json' }
for gene_id in gene_ids:
retries = 5
for attempt in range(retries):
try:
# Ensure the gene_id is correctly formatted
gene_id = gene_id.strip()
full_url = f"{url}{gene_id}?expand=1"
print(f"Querying URL: {full_url}")
response = requests.get(full_url, headers=headers)
if response.status_code == 404:
print(f"Gene ID {gene_id} not found. URL: {full_url}")
with open(log_file, "a") as log:
log.write(
f"Gene ID {gene_id} not found. URL: {full_url}\n")
break
response.raise_for_status()
print(f"API response status: {response.status_code}")
result = response.json()
print(f"Result for ID {gene_id}: {result}")
results.append(result)
# Log the full API response
with open(log_file, "a") as log:
log.write(f"Gene ID: {gene_id}\n")
log.write(json.dumps(result, indent=4) + "\n")
break # Exit the retry loop if the request is successful
except requests.exceptions.RequestException as e:
print(f"Error querying Ensembl for {gene_id}: {e}")
with open(log_file, "a") as log:
log.write(f"Error querying Ensembl for {gene_id}: {e}\n")
if attempt < retries - 1:
wait_time = 1 # Fixed wait time of 1 second
print(f"Retrying in {
wait_time} seconds... (Attempt {attempt + 1}/{retries})")
time.sleep(wait_time)
else:
print(f"Failed to query Ensembl for {
gene_id} after {retries} attempts.")
with open(log_file, "a") as log:
log.write(f"Failed to query Ensembl for {
gene_id} after {retries} attempts.\n")
return results
Process each CSV file in the folder
for file_name in os.listdir(os.getcwd()): if file_name.endswith(".csv"): file_path = os.path.join(os.getcwd(), file_name) print(f"Processing file: {file_name}") try:
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, dtype=str)
print(f"DataFrame columns: {df.columns}")
# Check if the gene ID column exists
if gene_id_column in df.columns:
# Extract unique gene IDs
gene_ids = df[gene_id_column].dropna().unique().tolist()
print(f"Gene IDs: {gene_ids}")
# Query Ensembl
results = query_ensembl(gene_ids)
print(f"Query results: {results}")
# Prepare the results for saving
parsed_results = []
for result in results:
parsed_results.append({
"id": result.get("id", "Unknown ID"),
"display_name": result.get("display_name", "Unknown name"),
"biotype": result.get("biotype", "Unknown type")
})
print(f"Parsed results: {parsed_results}")
# Save the results to a new CSV file
output_file = os.path.join(
output_folder, f"{os.path.splitext(
file_name)[0]}_ensembl_results.csv"
)
results_df = pd.DataFrame(parsed_results)
results_df.to_csv(output_file, index=False)
print(f"Processed {file_name}, results saved to {output_file}")
else:
print(f"Column '{gene_id_column}' not found in {file_name}")
except Exception as e:
print(f"Error processing {file_name} ({file_path}): {e}")
print("Processing complete!")