Entering edit mode
                    18 months ago
        Adyasha
        
    
        •
    
    0
    Hello everyone,
I have annotation file like this
less -S Sars_cov_2.ASM985889v3.101.gtf | head -20
#!genome-build ASM985889v3
#!genome-version ASM985889v3
#!genome-date 2020-01
#!genome-build-accession NCBI:GCA_009858895.3
MN908947.3      ensembl gene    266     21555   .       +       .       gene_id "ENSSASG00005000002"; gene_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3      ensembl transcript      266     21555   .       +       .       gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl exon    266     21555   .       +       .       gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSSASE00005000002"; exon_version "1";
MN908947.3      ensembl CDS     266     21552   .       +       0       gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000002"; protein_version "1";
MN908947.3      ensembl start_codon     266     268     .       +       0       gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl stop_codon      21553   21555   .       +       0       gene_id "ENSSASG00005000002"; gene_version "1"; 
transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl gene    266     13483   .       +       .       gene_id "ENSSASG00005000003"; gene_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3      ensembl transcript      266     13483   .       +       .       gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl exon    266     13483   .       +       .       gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSSASE00005000003"; exon_version "1";
MN908947.3      ensembl CDS     266     13480   .       +       0       gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000003"; protein_version "1";
MN908947.3      ensembl start_codon     266     268     .       +       0       gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl stop_codon      13481   13483   .       +       0       gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl gene    21563   25384   .       +       .       gene_id "ENSSASG00005000004"; gene_version "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3      ensembl transcript      21563   25384   .       +       .       gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3      ensembl exon    21563   25384   .       +       .       gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; exon_number "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id 
"ENSSASE00005000004"; exon_version "1";
MN908947.3      ensembl CDS     21563   25381   .       +       0       gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; exon_number "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000004"; protein_version "1";
I have a reference genome :
sequence.fasta
and a bam file :
ILS_W_V_558_S2_R1_001_val.bam
that looks like this :
samtools view ILS_W_V_558_S2_R1_001_val.bam | head
NB551648:137:HN725BGXL:1:13307:9948:2995        97      NC_045512.2     37      42      36M     =       17969   17968   
CAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACG    AAAAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
NB551648:137:HN725BGXL:1:12301:12947:7621       97      NC_045512.2     39      42      36M     =       28343   28340   ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
NB551648:137:HN725BGXL:1:13104:10907:7606       161     NC_045512.2     39      42      36M     =       28306   28303   ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    AAAAAEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
NB551648:137:HN725BGXL:1:21308:15644:16840      161     NC_045512.2     39      42      36M     =       28401   28398   ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
NB551648:137:HN725BGXL:1:22104:26696:1445       99      NC_045512.2     39      42      36M     =       99      96      ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    AAAAAEEAEEEEEA/EEEEEEAEEEEEEAEEEEAEA    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:CP
NB551648:137:HN725BGXL:1:23201:19089:2498       161     NC_045512.2     39      42      36M     =       28293   28290   ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
NB551648:137:HN725BGXL:1:23206:4111:17048       161     NC_045512.2     39      42      36M     =       28334   28331   ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA    /AAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE    AS:i:0      XN:i:0  XM:i:0  XO:i:0  XG:i:0  NM:i:0  MD:Z:36 YS:i:0  YT:Z:DP
I want to see gene specific coverage from the bam file. The result should have the gene_name in 1st column and its Coverage in 2nd column
I tried this command :
import subprocess
# Parse the GTF file and extract gene information
def parse_gtf(gtf_file):
    gene_info = {}
    with open(gtf_file, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.strip().split('\t')
            if parts[2] == 'gene':
                gene_id = parts[8].split(';')[0].split('"')[1]
                gene_name = parts[8].split(';')[3].split('"')[1]
                gene_info[gene_id] = gene_name
    return gene_info
# Calculate gene-specific coverage from the BAM file
def calculate_gene_coverage(gene_info, bam_file):
    gene_coverage = {}
    for gene_id, gene_name in gene_info.items():
        region = f'{gene_id}'
        command = f'samtools depth -r {region} {bam_file}'
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        if result.returncode == 0:
            coverage = sum(int(line.split()[2]) for line in result.stdout.strip().split('\n'))
            gene_coverage[gene_name] = coverage
    return gene_coverage
#main_command
def main():
   gtf_file = 'Sars_cov_2.ASM985889v3.101.gtf'
   bam_file = 'ILS_W_V_558_S2_R1_001_val.bam'
    # Parse GTF file to get gene information
    gene_info = parse_gtf(gtf_file)
    # Print gene information
    print("Gene Information:")
    for gene_id, gene_name in gene_info.items():
        print(f"{gene_id}: {gene_name}")
    # Calculate gene-specific coverage
    gene_coverage = calculate_gene_coverage(gene_info, bam_file)
    # Output gene-specific coverage
    print("\nGene-specific Coverage:")
    for gene_name, coverage in gene_coverage.items():
        print(f'{gene_name}\t{coverage}')
if __name__ == "__main__":
    main()
I have got this output :
Gene Information:
ENSSASG00005000002: ensembl
ENSSASG00005000003: ensembl
ENSSASG00005000004: ensembl
ENSSASG00005000006: ensembl
ENSSASG00005000010: ensembl
ENSSASG00005000007: ensembl
Gene-specific Coverage:
I am unable to understand this.
Please help.
Also if is there any other quick way to find gene specific coverage from the bam file then please share. I dont know the gene location co-ordinates.
Thank you
This is not going to work no matter what program you use since the chromosome identifiers in your GTF (
MN908947.3) do not match your BAM file (NC_045512.2). So find a annotation file that matches your reference genome."Also if is there any other quick way to find gene specific coverage" can you give some example so that its bit more clear what do you mean by gene specific coverage ?
Thank you, here I am giving you some example , suppose I have bam files .
now I want to see Spike gene coverage in each sample.
I want output like