I have the following program that I am analyzing and trying to understand. Overall purpose of it is to put out, based on the option selected, the set of introns, exons, acceptor sites , or donor sites of the sequence file passed in. I understand what the code is doing for getting the exons and the introns but I am not sure how to get the acceptor and donor sites(see the questions marks). More specifically, getting the start and end point of these sites. The original author had some comments placed but those don't make sense either(to me).
The comments:
"# donor i starts at (exon i)'s -10 to -1"
"# donor i ends at (intron i+1)'s 1 to 20"
"# acceptor i starts at (intron i)'s -20 to -1"
"# acceptor i ends at (exon i+1)'s 1 to 10"
Can anyone help me understand ? Thank you so much !
#!/usr/bin/perl -w
use strict;
use lib '/data/biocs/b/bio425/bioperl-live';
use Bio::SeqIO;
use Data::Dumper;
use Getopt::Std;
# Part 1. Read 4 options & genbank file
my %opts;
getopts("eida", \%opts); # process options
die "Usage: $0 [-e(xon) -i(ntron) --d(onor) --a(cceptor)] <a GenBank file>\n" unless @ARGV == 1 && ($opts{e} ||$opts{i} || $opts{d} || $opts{a}); # print usage if no input file or no options given
my $gb = shift @ARGV;
my $in = Bio::SeqIO->new(-file=>$gb, -format=>'genbank');
my $seq = $in->next_seq();
my (@exons, @introns, @donors, @acceptors); # declare arrays as data contains
# Part 2. Extract exon information
foreach my $feat ( $seq->get_SeqFeatures() ) { # loop through each genbank feature
next unless $feat->primary_tag() eq "mRNA"; # skip unless the feature is tagged as "mRNA"
my $location_obj = $feat->location(); # splice coordinates saved as a Bio::Location::Split object
my $exon_id = 1;
foreach my $loc ($location_obj->each_Location) { # access each exon coords object
push @exons, {
'order' => $exon_id++,
'start' => $loc->start,
'end' => $loc->end
};
}
}
# print Dumper(\@exons); exit; # uncomment to see results of parsed exons
# Part 3. Calcuate intron, donor, acceptor objects
## Extract introns coords:
for (my $i=0; $i<$#exons; $i++) { # for each exon
push @introns, {
'order' => $exons[$i]->{order},
'start' => $exons[$i]->{end}+1, # intron i starts at (exon i)'s end + 1
'end' => $exons[$i+1]->{start}-1, # intron i ends at (exon i+1)'s start -1
};
}
# print Dumper(\@introns); exit; # uncomment to see introns
## Donor sites (-10 to -1 of an exon and 1-20 of intron)
for (my $i=0; $i<$#exons; $i++) {
push @donors, {
'order' => $exons[$i]->{order},
'start' => $exons[$i]->{end} - ?, # donor i starts at (exon i)'s -10 to -1
'end' => $introns[$i]->{start} + ?, # donor i ends at (intron i+1)'s 1 to 20
};
}
# print Dumper(\@donors); exit; # uncomment to see donor sequences
## Acceptor sites (-20 to -1 of an intron and 1-10 of exon)
for (my $i=0; $i<$#exons; $i++) {
push @acceptors, {
'order' => $exons[$i]->{order},
'start' => $introns[$i]->{end} - ?, # acceptor i starts at (intron i)'s -20 to -1
'end' => $exons[$i+1]->{start} + ?, # acceptor i ends at (exon i+1)'s 1 to 10
};
}
# print Dumper(\@acceptors); exit; # uncomment to see acceptor sequences
# Part 4. Output sequences
&print_seq("intron", \@introns) if $opts{i};
&print_seq("exon", \@exons) if $opts{e};
&print_seq("donor", \@donors) if $opts{d};
&print_seq("acceptor", \@acceptors) if $opts{a};
exit;
sub print_seq {
my ($tag, $ref) = @_;
my @bins = @$ref;
foreach my $bin (@bins) {
my $out = Bio::SeqIO->new(-format=>'fasta');
my $seq =$seq->trunc(?, ?);
$seq->id($tag . "_" . $bin->{order});
$out->write_seq($seq);
}
}