Hello everybody ,exuse my horrible english.
I m writing a perl script to know what is the effect of snp on a given gene, my work plan was to extract all the exon sequences , translate them to amino acid ,concatenate , then doing the some thing with changing only the sequence of the exon where snp is located , finally to align the two amino acid sequence to have something like that...R490W...where R is the wild amino acid , 490 the postion an W the new amino acid.... My problem is that my obtained sequence is totally differente from those given on protein db (i used ncbi to verify), i know i m missing something in the translation step but i don t know what .
first part of the script (gene eg :MYBPC3):
use strict;
use warnings ;
use Bio::Das;
use Bio::Perl;
use Bio::DB::GenBank;
my $protein="";
my $das = Bio::Das->new(-source => 'http://genome.cse.ucsc.edu/cgi-bin/das/',-dsn=>'hg18');
my $snp=47310331;
my $chr = "chr11";
my @exon_start=
(47309532,47309971,47310198,47310692,47310940,47311320,47311683,47312048,47313168,47314003,
47315517,47315816,47316646,47317450,47317777,47319129,47319264,47320117,47320704,47320956,4
7321147,47321618,47324333,47324753,47325548,47325777,47325983,47326550,47327900,47328140,47
328628,47329365,47330749);
my @exon_end=(47309843,47310008,47310385,47310829,47311100,47311460,47311879,47312137,47313336,47314138,
47315706,47315921,47316806,47317531,47317917,47319159,47319371,47320283,47320871,47321062,4
7321275,47321751,47324497,47324771,47325606,47325807,47326032,47326668,47328049,47328239,47
328742,47329632,47330829);
my $exon_count=33;
for (my $i=0;$i<$exon_count;$i++){
        my $segment = $das->segment("$chr\:$exon_start[$i]\,$exon_end[$i]");
        my $dna = $segment->dna;
        my $aa_seq = Bio::Perl::translate($dna);
        $protein.=$aa_seq->seq;
}
print $protein ;
My output:
VSSL*LHLSFIAQ*TLGRHSRPERPVPRHCFLRPPSFYPKDPGASFRSPVDQSVQHPLRTARQLPC*SPIAAQETHLSHIHPTVGRGFPNF
PPGSWHGAGIRLYPGHPQEPAWSLRPRTSRRHSHRASPCKLVALQT*MPPSKGQGFLISRVNTPCLLNMRKRASSPRSRPFLNQEILGPWG
YPGQHSRA*QCSPR*PSGSPGAG*SLGPRRSPGPYSWVAHR*CPGLGIKTGSLVVAALSLKPTIFWLKTRK**PLPMMSSGTTQWVRR*CS
KTVNHSPWSSCRLSALCTPRAPCCRHPGVASTPEPH*DPRRQSPGDPGEDLACPTTCSTSVALSSMFSMRTVTW*VPECTRRAARMNRMVS
VGLLRMLTSSPARGCPSLVQVT*GRGLPPGKG*EGSQAPRPSSEWSGAGAWAAEAVAVRRISCTVTGSVVVTGAPGPAILCARTRKSSRAP
VGRSFTSIDVCSVSPCRAATHSEQPSGQYSTL*PSRPPAPTRSGGRHLRETVVSETSSTARWVGSLGGPIGMKGWEAGLGLDMPMALTA*T
RISYTTPSIMRRAS*LSS*IRSKFSRIHR*LFFFLRSRM*PRMGCPPS*AGGSHCTVQESSPTLLILGAAGASGTPMTLTVRLTWSSPTGF
FTVTV*TPSSSFSAPSTVKMLRSLVVSTRTRPSVSHSPSCQTPTHRCHLCPLGHLGLAWLGPYSPA**PSARSQWEQGPQR*GRPDVAYFQ
LQPQWCLVCGLGSPGGSWEVPCLGTKSILTPMKWSLADRLQAKPSGTKL*SASSAGVTSSMVSLWTPPDVGHLYAAVGHQLLPILQPHTPN
ILI*DLTFEHRLVLCAHHQVCDALVHLQLLPCTMSSASAWPPLVHSA*CPASSSMASLMIRWCLCPSFLNRYLKVSSRVSSTPSFSPFDLR
PLLRYFTLKLHPLPHHHQLVLQGARDEHRGLPFTKSSVLHFSPPTTHW*AASSANEHWLMVRVRLAPMDSKMYLPAAHLDLLAILEPFDLS
VMVSQFHGQPDLVAFAHLVGRLQLLLKGPVLFFSSRLMPLSLFSMPRRSVTPYWKAMRSYSDGGACRRISHTSSSAGASSFESPRGPETLT
SFSAVS*SPESQCPHGYPDPPTTSSQARAAEG**EVQVSGAHGLPWTVRLKLEQSNLSLVDTSQR*LPVKAGWASVMCSSNR*TPCWRGRS
CRAAAGAGPPCCSGPPICP*TT*PQAASGGWRRPRGLRR*CCHPTVTSPSCGRITKRPMGSSGAPGVGPLRAAELDPLGLGALSPSSAAGA
GASPGAPVASAGAGAGASMGSAFPASMTLRSNLTLEEPAMTA*DPWSAGPTSRTVSVCRVPSVARPYLLLALMSLPPRCQRTFTPARSVSA
SNTAGLPAATSTDRGFLLKAEPGFFPGSGILRDVTPGTKQAQVTQRGT
the sequence provided by ncbi
translation="MPEPGKKPVSAFSKKPRSVEVAAGSPAVFEAETERAGVKVRWQR
                     GGSDISASNKYGLATEGTRHTLTVREVGPADQGSYAVIAGSSKVKFDLKVIEAEKAEP
                     MLAPAPAPAEATGAPGEAPAPAAELGESAPSPKGSSSAALNGPTPGAPDDPIGLFVMR
                     PQDGEVTVGGSITFSARVAGASLLKPPVVKWFKGKWVDLSSKVGQHLQLHDSYDRASK
                     VYLFELHITDAQPAFTGSYRCEVSTKDKFDCSNFNLTVHEAMGTGDLDLLSAFRRTSL
                     AGGGRRISDSHEDTGILDFSSLLKKRDSFRTPRDSKLEAPAEEDVWEILRQAPPSEYE
                     RIAFQYGVTDLRGMLKRLKGMRRDEKKSTAFQKKLEPAYQVSKGHKIRLTVELADHDA
                     EVKWLKNGQEIQMSGSKYIFESIGAKRTLTISQCSLADDAAYQCVVGGEKCSTELFVK
                     EPPVLITRPLEDQLVMVGQRVEFECEVSEEGAQVKWLKDGVELTREETFKYRFKKDGQ
                     RHHLIINEAMLEDAGHYALCTSGGQALAELIVQEKKLEVYQSIADLMVGAKDQAVFKC
                     EVSDENVRGVWLKNGKELVPDSRIKVSHIGRVHKLTIDDVTPADEADYSFVPEGFACN
                     LSAKLHFMEVKIDFVPRQEPPKIHLDCPGRIPDTIVVVAGNKLRLDVPISGDPAPTVI
                     WQKAITQGNKAPARPAPDAPEDTGDSDEWVFDKKLLCETEGRVRVETTKDRSIFTVEG
                     AEKEDEGVYTVTVKNPVGEDQVNLTVKVIDVPDAPAAPKISNVGEDSCTVQWEPPAYD
                     GGQPILGYILERKKKKSYRWMRLNFDLIQELSHEARRMIEGVVYEMRVYAVNAIGMSR
                     PSPASQPFMPIGPPSEPTHLAVEDVSDTTVSLKWRPPERVGAGGLDGYSVEYCPEGCS
                     EWVAALQGLTEHTSILVKDLPTGARLLFRVRAHNMAGPGAPVTTTEPVTVQEILQRPR
                     LQLPRHLRQTIQKKVGEPVNLLIPFQGKPRPQVTWTKEGQPLAGEEVSIRNSPTDTIL
                     FIRAARRVHSGTYQVTVRIENMEDKATLVLQVVDKPSPPQDLRVTDAWGLNVALEWKP
                     PQDVGNTELWGYTVQKADKKTMEWFTVLEHYRRTHCVVPELIIGNGYYFRVFSQNMVG
                     FSDRAATTKEPVFIPRPGITYEPPNYKALDFSEAPSFTQPLVNRSVIAGYTAMLCCAV
                     RGSPKPKISWFKNGLDLGEDARFRMFSKQGVLTLEIRKPCPFDGGIYVCRATNLQGEA
                     RCECRLEVRVPQ"
If you prefer not to reinvent the wheel, you can use snpEff snpeff.sourceforge.net/ (and also ENSEMBL has a web based tool). Answering your question, you are missing 5'UTR, 3'UTR and strand information in your analysis
Hi Abdel. There are formatting options that you should use so that your question, and mostly your code, can be readable. This will ensure that more people are interested in helping you. Cheers
Merci bcp Pierre :)
geneName name chrom strand txStart txEnd cdsStart cdsEnd exonCount exonStarts exonEnds
If you prefer not to reinvent the wheel, you can use snpEff http://snpeff.sourceforge.net/ ENSEMBL has a web based tool.
Answering your question, you are missing 5'UTR, 3'UTR and strand information in your analysis.