Reduce execution time
4.0 years ago
Amy • 0

Hi guys,

I am currently writing a Perl script which compares two directories containing FASTA files. The first contains all FASTA sequences grouped by their location on the chromosomes. The second directory contains FASTA consensus sequences, one taken from each file of the first directory. What I want is to get, in a single file:

• Name of each consensus sequence (LOC126... for example)
• Length of each consensus sequence
• Name of each CDS used to make one consensus sequence (XM... for example)
• Length of each CDS.

I have written a script which works well. The bad side is that it takes too much time to execute. My question is: is there any way to accelerate its execution? I have several loops in my script, and since I am new to Perl, I suppose that maybe I am not using the most efficient solution.

Thank you !

My code is below:

#!/usr/local/perl-5.24.0/bin/perl
use strict;
use warnings;
use Getopt::Long;      # needed for GetOptions
use File::Basename;    # needed for fileparse

if (scalar @ARGV < 2) {
    print "Number of arguments not sufficient, please read the usage!\n\n",
          "Usage: perl FilterConsensus.pl -d LOCfilesDirectory -c ConsensusDirectory\n\n",
          "************************** Mandatory arguments: ****************************\n\n",
          "***************************************************************************************************\n\n";
    exit;
}

my ($locdir, $consdir);
GetOptions("d|repertoire=s" => \$locdir, "c|consensus=s" => \$consdir);

my $master_dir = "/home/andiaye/Data/EPIALTER/Données/RESUME";
mkdir $master_dir, 0755;

my @consfiles = GetFilesList($consdir);
my @locfiles  = GetFilesList($locdir);

my (%hashIDLength, $consfilename, $locfilename, $consID, $conslength);
my $resumefile = "$master_dir/resume.txt";
open my $fic, '>', $resumefile or die "Cannot open $resumefile\n";

foreach my $consfile (@consfiles) {
    $consfilename = Getfilename($consfile);
    ($consID, $conslength) = GetConsIDLength($consfile);
    foreach my $locfile (@locfiles) {
        $locfilename = Getfilename($locfile);
        %hashIDLength = GetLocIDLength($locfile);
        while (my ($key, $value) = each %hashIDLength) {
            if ($consfilename =~ $locfilename) {
                printf $fic "%20s %20s %30s %20s\n", "Locus: $consID",
                    "Taille Locus: $conslength", "CDS: $key", "Taille CDS: $value";
            }
        }
    }
}
close($fic);

#~~~~~~~~~~~~~ Functions ~~~~~~~~~~~~~#

sub GetFilesList {
    my $Path = $_[0];
    my @FilesList;
    opendir(my $FhRep, $Path) or die "Can't open directory $Path\n";
    my @Contenu = grep { !/^\.\.?$/ } readdir($FhRep);
    closedir($FhRep);
    foreach my $FileFound (@Contenu) {
        if    (-f "$Path/$FileFound") { push @FilesList, "$Path/$FileFound"; }
        elsif (-d "$Path/$FileFound") { push @FilesList, GetFilesList("$Path/$FileFound"); }
    }
    return @FilesList;
}

sub GetConsIDLength {
    my $file = shift @_;
    my $ID;
    my $length = 0;
    open my $fic, '<', $file or die "Cannot open $file\n";
    while (my $line = <$fic>) {
        chomp $line;
        if ($line =~ m/^>/) { $ID = (split(m/>/, $line))[1]; next; }
        else                { $length += length($line); }
    }
    close($fic);    # close before return: in the original it came after 'return' and was never reached
    return ($ID, $length);
}

sub GetLocIDLength {
    my $file = shift @_;
    my ($ID, $length);
    my %hashIDLength;
    open my $fic, '<', $file or die "Cannot open $file\n";
    while (my $line = <$fic>) {
        chomp $line;
        if ($line =~ m/^>/) {
            my @attributs = split(m/ /, $line);
            my $IDsign = $attributs[0];
            $ID = (split(m/>/, $IDsign))[1];
            $length = 0;    # reset per record, otherwise lengths accumulate across sequences
            next;
        }
        else { $length += length($line); }
        $hashIDLength{$ID} = $length;
    }
    close($fic);
    return (%hashIDLength);
}

sub Getfilename {
    my $path = shift @_;
    my ($base, $pathe, $ext) = fileparse($path, '\..*');
    return $base;
}
exit;

sequence RNA-Seq perl

4.0 years ago

foreach my $consfile (@consfiles) {
    $consfilename = Getfilename($consfile);
    ($consID, $conslength) = GetConsIDLength($consfile);
    foreach my $locfile (@locfiles) {   ### change this nested-foreach logic


This is the main problem; it consumes time for no reason (well, you do save memory this way). Your file I/O complexity is O(N*M), while it should be O(N+M). Reduce file I/O by making sure you read each file only once, and parse the information you need into a hash-based data structure.

There is absolutely no need to open any file more than once, therefore you can code this in a way that doesn't use nested foreach. Instead, parse the locfiles first.

## fragment, untested, please fix minor issues yourself

### read locfile IDs into a nested hash
my %hashIDLengths = ();
foreach my $locfile (@locfiles) {
    $locfilename = Getfilename($locfile);
    $hashIDLengths{$locfilename} = GetLocIDLength($locfile);
}

### parse consfiles:
foreach my $consfile (@consfiles) {
    $consfilename = Getfilename($consfile);
    ($consID, $conslength) = GetConsIDLength($consfile);
    ## not sure how the filename matching was supposed to work; now the filenames must be exactly the same
    while (my ($key, $value) = each %{ $hashIDLengths{$consfilename} }) {
        printf $fic "%20s %20s %30s %20s\n", "Locus: $consID",
            "Taille Locus: $conslength", "CDS: $key", "Taille CDS: $value";
    }
}

In function GetLocIDLength, change:

- return (%hashIDLength);
+ return \%hashIDLength;   # return a reference (a scalar) to store as a nested hash

Could you please illustrate with an example? Thanks.

I gave you an example; I am not totally sure about your data structures though, so please test this carefully.

It worked well with my data (with some modifications, like adding the "%" sign in front of $hashIDLengths{$locfilename}). Thank you so much.
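Since the "%" confusion in this thread comes down to hash references, here is a minimal self-contained sketch of the mechanics being discussed. All names and lengths are made up for illustration: a sub returns \%hash (a scalar reference), that reference is stored as one value in an outer hash, and %{ ... } dereferences it again for iteration.

```perl
#!/usr/bin/perl
use strict;
use warnings;

# Toy stand-in for GetLocIDLength: builds a hash of CDS id => length
# and returns a REFERENCE to it, so it fits in a single hash slot.
sub toy_loc_id_length {
    my %hashIDLength = ('XM_001' => 300, 'XM_002' => 450);  # made-up data
    return \%hashIDLength;
}

my %hashIDLengths;
$hashIDLengths{'LOC126'} = toy_loc_id_length();   # one nested hash per locfile

# Dereference with %{ ... } to walk the inner hash:
while (my ($key, $value) = each %{ $hashIDLengths{'LOC126'} }) {
    print "CDS: $key  Taille CDS: $value\n";
}
```

Nested access also works without explicit dereferencing, e.g. $hashIDLengths{'LOC126'}{'XM_001'}, because Perl follows the reference automatically between subscripts.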