Question: Question about ensembl script
1
gravatar for Ameli
3.5 years ago by
Ameli 10
European Union
Ameli 10 wrote:

Hey all,

This script, connect to Ensembl API, gets the 500 bases upstream from the Transcription Start Site for Human transcript ID's.

How can i generalize this scripts to get the 500 bp for any organism?like: ENSCAFT00000017399, ENSPVAT00000009012, ENSECAT00000019626,ENSMPUT00000010176

#!/usr/bin/perl
use strict;
use warnings;
use Bio::EnsEMBL::Registry;

my $reg = 'Bio::EnsEMBL::Registry';

$reg->load_registry_from_db(
    -host => 'ensembldb.ensembl.org',
    -user => 'anonymous'
);


my $gene_adaptor = $reg->get_adaptor( "human", "core", "gene" );
# my $gene = $gene_adaptor->fetch_by_stable_id("ENST00000269844");
my $transcript_adaptor =  $reg->get_adaptor( 'Human', 'Core', 'Transcript' );
my $gene = $transcript_adaptor->fetch_by_stable_id("ENST00000269844");

#  get the slice adaptor
my $slice_adaptor = $reg->get_adaptor( "human", "core", "slice" );

# get the chromosome
my $chr = $gene->seq_region_name;

# one option if the gene is positive strand
if ($gene->seq_region_strand == 1){
    # In Ensembl, the "gene start" is always the smaller coordinate and the "gene end" is always the larger coordinate, regardless of the gene strand
    # this means that on the positive strand gene, the "gene start" will be the 5' end
    # to get the 500bp upstream, we want to end with the gene start, and start at gene start - 500
    my $slice_end = $gene->seq_region_start;
    my $slice_start = $slice_end - 500;


    my $slice = $slice_adaptor->fetch_by_region( "chromosome", $chr, $slice_start, $slice_end );
  my $sequence = $slice->seq();
  my $coord_sys  = $slice->coord_system()->name();
my $seq_region = $slice->seq_region_name();
my $start      = $slice->start();
my $end        = $slice->end();
my $strand     = $slice->strand();

print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
  print "$sequence\n";
    }

# another option if the gene is negative strand
else {

    # on the negative strand gene, the "gene end" will be the 5' end
    # to get the 500bp upstream, we want to start with the gene end and end at gene end + 500
    my $slice_start = $gene->seq_region_end ;
    my $slice_end = $slice_start + 500;

    my $slice = $slice_adaptor->fetch_by_region( "chromosome", $chr, $slice_start, $slice_end );
  my $sequence = $slice->seq();
  my $coord_sys  = $slice->coord_system()->name();
my $seq_region = $slice->seq_region_name();
my $start      = $slice->start();
my $end        = $slice->end();
my $strand     = $slice->strand();

print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
  print "$sequence\n";

    }
sequence ensembl genome • 1.1k views
ADD COMMENTlink modified 3.5 years ago by Emily_Ensembl19k • written 3.5 years ago by Ameli 10

Try to write a subroutine.

ADD REPLYlink written 3.5 years ago by geek_y9.8k
3
gravatar for alessandrotestori7
3.5 years ago by
alessandrotestori7330 wrote:

you can declare three variables: $species, $transcript and $toplevel; you then modify these each time. I have pasted a working script in the case of ENSMPUT00000010176

use strict;
use Bio::EnsEMBL::Registry;

my $species = "ferret";
my $transcript = "ENSMPUT00000010176";
 my $toplevel = "scaffold";

my $reg = 'Bio::EnsEMBL::Registry';

$reg->load_registry_from_db(    -host => 'ensembldb.ensembl.org',    -user => 'anonymous');
my $gene_adaptor = $reg->get_adaptor( $species, "core", "gene" );
my $transcript_adaptor =  $reg->get_adaptor( $species, 'Core', 'Transcript' );
my $gene = $transcript_adaptor->fetch_by_stable_id($transcript);
my $slice_adaptor = $reg->get_adaptor( $species, "core", "slice" );
my $chr = $gene->seq_region_name;
if ($gene->seq_region_strand == 1){
    my $slice_end = $gene->seq_region_start;
    my $slice_start = $slice_end - 500;
    my $slice = $slice_adaptor->fetch_by_region( "$toplevel", $chr, $slice_start, $slice_end );
    my $sequence = $slice->seq();
    my $coord_sys  = $slice->coord_system()->name();
    my $seq_region = $slice->seq_region_name();
    my $start = $slice->start();
    my $end = $slice->end();
    my $strand = $slice->strand();
    print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
    print "$sequence\n";}else{
    my $slice_start = $gene->seq_region_end ;
    my $slice_end = $slice_start + 500;
    my $slice = $slice_adaptor->fetch_by_region( "$toplevel", $chr, $slice_start, $slice_end );
    my $sequence = $slice->seq();
    my $coord_sys  = $slice->coord_system()->name();
    my $seq_region = $slice->seq_region_name();
    my $start = $slice->start();
    my $end = $slice->end();
    my $strand = $slice->strand();
    print "Slice: $coord_sys $seq_region $start-$end ($strand)\n";
    print "$sequence\n";
}
ADD COMMENTlink modified 3.5 years ago by geek_y9.8k • written 3.5 years ago by alessandrotestori7330
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 1993 users visited in the last hour