Question about ensembl script
1
1
Entering edit mode
8.1 years ago
Ameli ▴ 10

Hey all,

This script, connect to Ensembl API, gets the 500 bases upstream from the Transcription Start Site for Human transcript ID's.

How can i generalize this scripts to get the 500 bp for any organism?like: ENSCAFT00000017399, ENSPVAT00000009012, ENSECAT00000019626,ENSMPUT00000010176

#!/usr/bin/perl
use strict;
use warnings;
use Bio::EnsEMBL::Registry;

my $reg = 'Bio::EnsEMBL::Registry';

$reg->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
);


my $gene_adaptor = $reg->get_adaptor( "human", "core", "gene" );
# my $gene = $gene_adaptor->fetch_by_stable_id("ENST00000269844");
my $transcript_adaptor =  $reg->get_adaptor( 'Human', 'Core', 'Transcript' );
my $gene = $transcript_adaptor->fetch_by_stable_id("ENST00000269844");

#  get the slice adaptor
my $slice_adaptor = $reg->get_adaptor( "human", "core", "slice" );

# get the chromosome
my $chr = $gene->seq_region_name;

# one option if the gene is positive strand
if ($gene->seq_region_strand == 1){
    # In Ensembl, the "gene start" is always the smaller coordinate and the "gene end" is always the larger coordinate, regardless of the gene strand
    # this means that on the positive strand gene, the "gene start" will be the 5' end
    # to get the 500bp upstream, we want to end with the gene start, and start at gene start - 500
    my $slice_end = $gene->seq_region_start;
    my $slice_start = $slice_end - 500;


    my $slice = $slice_adaptor->fetch_by_region( "chromosome", $chr, $slice_start, $slice_end );
  my $sequence = $slice->seq();
  my $coord_sys  = $slice->coord_system()->name();
my $seq_region = $slice->seq_region_name();
my $start      = $slice->start();
my $end        = $slice->end();
my $strand     = $slice->strand();

print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
  print "$sequence\n";
    }

# another option if the gene is negative strand
else {

    # on the negative strand gene, the "gene end" will be the 5' end
    # to get the 500bp upstream, we want to start with the gene end and end at gene end + 500
    my $slice_start = $gene->seq_region_end ;
    my $slice_end = $slice_start + 500;

    my $slice = $slice_adaptor->fetch_by_region( "chromosome", $chr, $slice_start, $slice_end );
  my $sequence = $slice->seq();
  my $coord_sys  = $slice->coord_system()->name();
my $seq_region = $slice->seq_region_name();
my $start      = $slice->start();
my $end        = $slice->end();
my $strand     = $slice->strand();

print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
  print "$sequence\n";

    }
sequence genome ensembl • 2.0k views
ADD COMMENT
0
Entering edit mode

Try to write a subroutine.

ADD REPLY
3
Entering edit mode
8.1 years ago

you can declare three variables: $species, $transcript and $toplevel; you then modify these each time. I have pasted a working script in the case of ENSMPUT00000010176

use strict;
use Bio::EnsEMBL::Registry;

my $species = "ferret";
my $transcript = "ENSMPUT00000010176";
 my $toplevel = "scaffold";

my $reg = 'Bio::EnsEMBL::Registry';

$reg->load_registry_from_db(    -host => 'ensembldb.ensembl.org',    -user => 'anonymous');
my $gene_adaptor = $reg->get_adaptor( $species, "core", "gene" );
my $transcript_adaptor =  $reg->get_adaptor( $species, 'Core', 'Transcript' );
my $gene = $transcript_adaptor->fetch_by_stable_id($transcript);
my $slice_adaptor = $reg->get_adaptor( $species, "core", "slice" );
my $chr = $gene->seq_region_name;
if ($gene->seq_region_strand == 1){
    my $slice_end = $gene->seq_region_start;
    my $slice_start = $slice_end - 500;
    my $slice = $slice_adaptor->fetch_by_region( "$toplevel", $chr, $slice_start, $slice_end );
    my $sequence = $slice->seq();
    my $coord_sys  = $slice->coord_system()->name();
    my $seq_region = $slice->seq_region_name();
    my $start = $slice->start();
    my $end = $slice->end();
    my $strand = $slice->strand();
    print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
    print "$sequence\n";}else{
    my $slice_start = $gene->seq_region_end ;
    my $slice_end = $slice_start + 500;
    my $slice = $slice_adaptor->fetch_by_region( "$toplevel", $chr, $slice_start, $slice_end );
    my $sequence = $slice->seq();
    my $coord_sys  = $slice->coord_system()->name();
    my $seq_region = $slice->seq_region_name();
    my $start = $slice->start();
    my $end = $slice->end();
    my $strand = $slice->strand();
    print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
    print "$sequence\n";
}
ADD COMMENT

Login before adding your answer.

Traffic: 2070 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6