Set the working directory

Question

Search and return transcript types from gene symbols by querying mygene.info or ensembl rest APIs

0

Entering edit mode

8 months ago

m.musgrove • 0

Hello! Per the title, I'm trying to determine the transcript type of each gene across 15 different .csv files. I've been messing around with both chatgpt and copilot due to a lack of personal experience, and as my coder friends are super busy.

So far, I've gotten it to the point where I can query both databases (separate codes, tried successively), but keep running into issues. I've posted the ensembl code below, could anyone help me troubleshoot please? Using VScode & autopep8 at the moment. The code seems to have pasted very weirdly (from VScode), so apologies for that!!

Alternatively, does anyone have working code for either API that I could modify to look up the gene symbols in my .csv files?

Issues I'm encountering:

Gene not found in database, despite the entry being there when I manually navigate to the website (previous version of the code, denoted as #17 in case I forget).
Current code keeps hanging up on the reiterative request for the first gene. Generates the corresponding files, but they all have the same, much truncated list of genes (around 10 from over 1,000). None of these genes are the first gene queried.

Stuff I've tried:

Switching to a different API (mygene to ensembl).
Changing the size of the call to 100 entries (mygene) down to 1 (ensembl).
Manually rewriting the code in both Notepad and VSCode, and formatting it using autopep8.
Trying reiterative and non-reiterative approaches to requesting the database (10, 5, 1 request per gene before moving to the next).

Code:

import json
import os
import time
from urllib.parse import quote

import pandas as pd
import requests

# Set the working directory
os.chdir(r"C:\Users\maret\OneDrive - The University of Queensland\PhD QBI\Vaultrc5\Data\Omic Sequencing\Illumina NovaSeq\Results\Galaxy imports and results\IDs annotated")

# Verify the current working directory
print("Current working directory:", os.getcwd())

# Column name in the CSV that contains gene IDs
gene_id_column = "gene_symbol"

# Output folder for results (created inside the working directory if missing)
output_folder = os.path.join(os.getcwd(), "output_results")
os.makedirs(output_folder, exist_ok=True)

# Log file for API responses
log_file = os.path.join(output_folder, "api_responses.log")

# Function to query Ensembl REST API for gene information with retry mechanism

def query_ensembl(gene_ids):
    """Look up gene symbols in the Ensembl REST API, retrying failed requests.

    Each symbol is sent to the mouse (``mus_musculus``) ``lookup/symbol``
    endpoint with ``expand=1``. Outcomes are printed and appended to the
    module-level ``log_file``.

    Args:
        gene_ids: Iterable of gene symbol strings. Surrounding whitespace is
            stripped and blank entries are skipped.

    Returns:
        A list with the decoded JSON object of every symbol that was found,
        in input order. Symbols that are not found, or that still fail after
        all retries, are omitted.
    """
    # The REST API is served from rest.ensembl.org; useast.ensembl.org is a
    # mirror of the website and does not answer JSON lookups.
    url = "https://rest.ensembl.org/lookup/symbol/mus_musculus/"
    headers = {"Content-Type": "application/json"}
    retries = 5
    default_wait = 1  # seconds between attempts unless the server says otherwise
    request_timeout = 30  # seconds; without it a stalled connection hangs the run
    results = []

    def record(message):
        # Echo a message to the console and append it to the log file.
        print(message)
        with open(log_file, "a", encoding="utf-8") as log:
            log.write(message + "\n")

    for gene_id in gene_ids:
        gene_id = gene_id.strip()
        if not gene_id:
            continue
        # Percent-encode the symbol so unusual characters cannot break the URL.
        encoded_symbol = quote(gene_id, safe="")
        full_url = f"{url}{encoded_symbol}?expand=1"

        for attempt in range(retries):
            print(f"Querying URL: {full_url}")
            try:
                response = requests.get(
                    full_url, headers=headers, timeout=request_timeout
                )
                # Unknown symbols are reported as 400 (or 404); retrying them
                # only wastes time, so give up on this symbol immediately.
                if response.status_code in (400, 404):
                    record(f"Gene ID {gene_id} not found. URL: {full_url}")
                    break
                response.raise_for_status()
                result = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                record(f"Error querying Ensembl for {gene_id}: {e}")
                if attempt == retries - 1:
                    record(
                        f"Failed to query Ensembl for {gene_id} "
                        f"after {retries} attempts."
                    )
                    break

                # Honour the server's Retry-After hint when rate limited (429).
                wait_time = default_wait
                failed = getattr(e, "response", None)
                if failed is not None and failed.status_code == 429:
                    try:
                        wait_time = float(
                            failed.headers.get("Retry-After", default_wait)
                        )
                    except ValueError:
                        wait_time = default_wait
                print(
                    f"Retrying in {wait_time} seconds... "
                    f"(Attempt {attempt + 1}/{retries})"
                )
                time.sleep(wait_time)
                continue

            print(f"API response status: {response.status_code}")
            print(f"Result for ID {gene_id}: {result}")
            results.append(result)

            # Log the full API response
            with open(log_file, "a", encoding="utf-8") as log:
                log.write(f"Gene ID: {gene_id}\n")
                log.write(json.dumps(result, indent=4) + "\n")
            break  # Success: move on to the next symbol

    return results

# Process each CSV file in the folder

# Walk the working directory, query Ensembl for the unique gene symbols of
# every CSV file, and write one "<name>_ensembl_results.csv" per input file
# into output_folder. A failure in one file is reported and does not stop the
# remaining files from being processed.
working_dir = os.getcwd()
for file_name in os.listdir(working_dir):
    if not file_name.endswith(".csv"):
        continue

    file_path = os.path.join(working_dir, file_name)
    print(f"Processing file: {file_name}")
    try:
        # Read the CSV file into a DataFrame (everything as text)
        df = pd.read_csv(file_path, dtype=str)
        print(f"DataFrame columns: {df.columns}")

        # Check if the gene ID column exists
        if gene_id_column not in df.columns:
            print(f"Column '{gene_id_column}' not found in {file_name}")
            continue

        # Extract unique gene IDs
        gene_ids = df[gene_id_column].dropna().unique().tolist()
        print(f"Gene IDs: {gene_ids}")

        # Query Ensembl
        results = query_ensembl(gene_ids)
        print(f"Query results: {results}")

        # Keep only the fields of interest from each API record
        parsed_results = [
            {
                "id": result.get("id", "Unknown ID"),
                "display_name": result.get("display_name", "Unknown name"),
                "biotype": result.get("biotype", "Unknown type"),
            }
            for result in results
        ]
        print(f"Parsed results: {parsed_results}")

        # Save the results to a new CSV file named after the input file
        base_name = os.path.splitext(file_name)[0]
        output_file = os.path.join(
            output_folder, f"{base_name}_ensembl_results.csv"
        )
        results_df = pd.DataFrame(parsed_results)
        results_df.to_csv(output_file, index=False)
        print(f"Processed {file_name}, results saved to {output_file}")
    except Exception as e:
        # Top-level boundary: report the problem and carry on with the next file.
        print(f"Error processing {file_name} ({file_path}): {e}")

print("Processing complete!")

Gene Ensembl MyGene Transcriptomics Identification • 367 views

ADD COMMENT • link 8 months ago by m.musgrove • 0