I have been stuck with some script!
Well i made this script in 2008 and now i am using with some modifications and i get error!
#!/usr/bin/perl -w use strict; use Data::Dumper; sub getSequences ($) { my $file = $_[0]; open (INFILE, "<$file") || die "Error1 in opening in file: $file. $!\n"; my @lines = <INFILE>; my $header; my %header2seq = (); foreach my $line (@lines) { chomp $line; if ($line =~ /^(>.+)$/o) { $header = $1; } else {$header2seq {$header} .= $line; } } #print %header2seq; close (INFILE); return (\%header2seq); } sub MakeSpList ($) { my $sp_list = $_[0]; my %sp_names = (); open (INFILE2, "<$sp_list") || die "Error2 in opening in file: $sp_list. $!\n"; my @sps = <INFILE2>; foreach my $line (@sps) { chomp $line; $sp_names {$line} = 1; } close (INFILE2); #print Dumper (%sp_names); return (\%sp_names); } sub CompareSpList2Sequences ($$$) { my $ref_header2seq = $_[0] ; my $ref_sp_names = $_[1]; my $file = $_[2]; open (OUTFILE, ">$file.subdata") || die ("Error3 in opening out file: $file.subdata. $!\n"); foreach my $key (keys %$ref_header2seq) { $key =~ m/^>([A-Z]+[0-9]+[A-Z+]).+$/o; #print "$1\n"; my $header_sub = $1; #print $header_sub, "\n"; #print $ref_sp_names, "\n"; if (exists $ref_sp_names -> {$header_sub}) { my $seq = $ref_header2seq -> {$key}; print OUTFILE ">$key\n$seq\n"; } } close (OUTFILE); return "42"; } my $fasta_seqs = $ARGV[0]; my $sp_list = $ARGV[1]; my $ref_header2seq = getSequences ($fasta_seqs); my $ref_sp_names = MakeSpList ($sp_list); CompareSpList2Sequences ($ref_header2seq , $ref_sp_names, $fasta_seqs); exit;
What i want to do is:
i have a fasta file with sequences:
>YAL004W YAL004W SGDID:S000002136, Chr I from 140760-141407, Genome Release 64-2-1, Dubious ORF, "Dubious open reading frame; unlikely to encode a functional protein, based on available experimental and comparative sequence data; completely overlaps verified gene SSA1/YAL005C" ATGGGTGTCACCAGCGGTGGCCTTAACTTCAAAGATACCGTCTTCAATGGACAACAAAGAGACATCGAAAGTACCACCACCCAAGTCGAAAATCAAGACGTGTTCTTCCTTACCCTTCTTGTCCAAACCGTAAGCAATGGCAGCGGCGGTAGGTTCGTTAATAATACGCAAGACATTCAAACCAGCAATGGTACCAGCATCCTTGGTAGCTTGTCTTTGAGAATCGTTGAA
>YAL005C SSA1 SGDID:S000000004, Chr I from 141431-139503, Genome Release 64-2-1, reverse complement, Verified ORF, "ATPase involved in protein folding and NLS-directed nuclear transport; member of HSP70 family; forms chaperone complex with Ydj1p; localized to nucleus, cytoplasm, and cell wall; 98% identical with paralog Ssa2p, but subtle differences between the two proteins provide functional specificity with respect to propagation of yeast [URE3] prions and vacuolar-mediated degradations of gluconeogenesis enzymes; general targeting factor of Hsp104p to prion fibrils" ATGTCAAAAGCTGTCGGTATTGATTTAGGTACAACATACTCGTGTGTTGCTCACTTTGCTAATGATCGTGTGGACATTATTGCCAACGATCAAGGTAACAGAACCACTCCATCTTTTGTCGCTTTCACTGACACTGAAAGATTGATTGGTGATGCTGCTAAGAATCAAGCTGCTATGAATCCTTCGAATACCGTTTTCGACGCTAAGCGTTTGATCGGTAGAAACTTCAAC
and i have another file with ID's:
>YAL005C
>YAL012W
I want to retrieve the sequences and the all header when match with ID's file.
but i get error: don't print nothing!
Please can you help me?
Thanks in advance.
- i already searched for other methods (and i can´t get the results either) but i really want to know about this error!
- no bioperl please!