Question: How to reduce a Python 2.7 dataframe from float 64 to float 8 and then plot the time series data?
gravatar for tfhahn
3.0 years ago by
tfhahn50 wrote:

I need help cutting down on computation time. I have been trying in vain for the past 10 hours to use only 7 instead of 63 decimals. I keep getting an error message saying that there is nothing to plot. From my limited understanding of Python 2.7, I believe that the plotting function is expecting float64 instead of float8. I have copy-pasted my Python 2.7 code below. Can somebody please change it so that it will plot float8 time series trajectories? Thanks a lot in advance.

from __future__ import print_function
import os
import timeit
import argparse

import pandas as pd
import matplotlib.pyplot as plt

import gpl
import conf.settings
from util import count_samples
from correlation import CorrelationMatrix

class ExpressionMatrix(object):
    """Expression values plus annotation columns loaded from CSV files.

    The data frame is stored in ``self.df``.  Its first
    ``self.sample_number`` columns (as reported by ``count_samples``) are
    treated as expression values; the remaining columns are treated as
    annotations.
    """

    def __init__(self, platform=None, series=None, invert=False, limit=0,
                 top=10, **kwargs):
        """Load a single series, or every locally available series of a platform.

        Args:
            platform: Platform identifier handed to ``gpl.Platform``.  Only
                used when ``series`` is not given.
            series: Base name of a CSV file inside ``conf.settings.DATA_PATH``.
            invert: Stored on the instance as ``self.invert``.
            limit: When non-zero, stop reading platform datasets once the
                number of loaded datasets exceeds this value.
            top: Stored on the instance as ``self.top``.
            **kwargs: Every extra keyword becomes an instance attribute.  A
                truthy ``unlog`` replaces the expression values by ``2**value``.

        Raises:
            ValueError: If neither ``series`` nor ``platform`` is given, or if
                no dataset file of the platform exists locally.
        """
        self.sample_number = 0
        # The original chained assignment (`self.invert = invert = top`)
        # overwrote the flag with `top`; keep the two settings separate.
        self.invert = invert
        self.top = top

        data_path = conf.settings.DATA_PATH
        if series:
            self.df = self._load_series(data_path, series)
        elif platform:
            self.df = self._load_platform(data_path, platform, limit)
        else:
            raise ValueError('Either a series or a platform must be given.')
        self.sample_number = count_samples(self.df)

        for key, value in kwargs.items():
            setattr(self, key, value)

        # `unlog` only exists when the caller passed it, so do not assume it.
        if getattr(self, 'unlog', False):
            # Use the sample count of the final frame, not of whichever file
            # happened to be read last.
            samples = self.sample_number
            self.df.iloc[:, :samples] = 2 ** self.df.iloc[:, :samples]

    @staticmethod
    def _load_series(data_path, series):
        """Read ``<series>.csv`` and store its expression columns as float32."""
        frame = pd.read_csv(os.path.join(data_path, series + '.csv'),
                            index_col=0)
        samples = count_samples(frame)
        # Assigning the converted values back through `.iloc` may write into
        # the existing float64 storage and silently keep the old dtype, so
        # the frame is rebuilt from the converted columns instead.
        values = frame.iloc[:, :samples].astype('float32')
        return pd.concat([values, frame.iloc[:, samples:]], axis=1)

    @staticmethod
    def _load_platform(data_path, platform, limit):
        """Concatenate the expression columns of a platform's local datasets.

        Annotation columns are taken from the last dataset that was read.
        """
        # The platform identifier comes from the argument; the original read
        # it from a module-level `args`, which only exists in script mode.
        platform_meta = gpl.Platform(platform, parse=False, meta_only=True)

        matrix = None
        frame = None
        samples = 0
        count = 0
        for dataset in platform_meta.get_series(download=False):
            file_path = os.path.join(data_path, dataset + '.csv')
            if not os.path.exists(file_path):
                file_path = os.path.join(data_path, dataset + '.tar.csv')
                if not os.path.exists(file_path):
                    # Neither file variant is available locally: skip it.
                    continue
            frame = pd.read_csv(file_path, index_col=0)
            count += 1

            samples = count_samples(frame)
            expression_matrix = frame.iloc[:, :samples]

            if matrix is None:
                matrix = expression_matrix
            else:
                matrix = pd.concat([matrix, expression_matrix], axis=1)
                print('Concated matrix: %s' % dataset, matrix.shape)

            # NOTE(review): `count > limit` lets limit + 1 datasets through;
            # confirm whether `>=` was intended.
            if limit and count > limit:
                break

        if frame is None:
            raise ValueError('No dataset files found for platform %s in %s'
                             % (platform, data_path))

        annotations = frame.iloc[:, samples:]
        return pd.concat([matrix, annotations], axis=1)

    def correlations(self):
        """Return a ``CorrelationMatrix`` built from this expression matrix."""
        return CorrelationMatrix(self)

def main(args):
    """Build the expression matrix and report the average trial duration.

    Args:
        args: Parsed command-line namespace.  All of its entries are passed
            to ``ExpressionMatrix`` as keyword arguments; ``load``,
            ``choices`` and ``trials`` are also read here.
    """
    expressions = ExpressionMatrix(**vars(args))
    # Either reuse stored correlations or compute them; without the `else`
    # the loaded matrix was always replaced by a freshly computed one.
    if args.load:
        correlations = CorrelationMatrix(expressions, calc=False)
    else:
        correlations = expressions.correlations()

    times = []
    if args.choices:
        for _ in range(args.trials):
            start_time = timeit.default_timer()
            # NOTE(review): the operation being timed is not present in the
            # posted code (presumably a query on `correlations`); restore it
            # here.
            stop_time = timeit.default_timer()
            times.append(stop_time - start_time)

    # No trials ran when --choices is empty; avoid dividing by zero.
    if times:
        print('Average duration: ', sum(times) / len(times))

if __name__ == '__main__':
    # Command-line entry point: every option below ends up as a keyword
    # argument of ExpressionMatrix via main().
    parser = argparse.ArgumentParser()
    parser.add_argument('--series', '-s', type=str)
    parser.add_argument('--platform', '-p', type=str)
    parser.add_argument('--invert', '-i', action='store_true')
    parser.add_argument('--choices', '-c', type=str, nargs='+', default='')
    parser.add_argument('--limit', '-l', type=int,  default=0)
    parser.add_argument('--top', '-t', type=int,  default=10)
    parser.add_argument('--similarity', '-sim', type=str, default='pearson',
                        help='''Method of similarity measure which can be either pearson, kendall, spearman (default: pearson).''')
    parser.add_argument('--trials', '-tr', type=int,  default=1)
    parser.add_argument('--plot', '-plt', action='store_true')
    parser.add_argument('--unlog', '-ul', action='store_true')
    parser.add_argument('--save', '-sa', action='store_true')
    parser.add_argument('--load', '-lo', action='store_true')
    args = parser.parse_args()
    # The parsed options were never handed to main(), so the script did
    # nothing after parsing.
    main(args)
float dataframe python • 1.2k views
ADD COMMENTlink modified 3.0 years ago by GenoMax96k • written 3.0 years ago by tfhahn50

First, I'm not aware of a float8 datatype in numpy. Second, in a minimal example, I wasn't able to recreate the issue with float16:

simple pandas + numpy example

Maybe it might be worth adding some example input data for others to test with.

ADD REPLYlink modified 3.0 years ago • written 3.0 years ago by mmfansler370

Why would you want to reduce the float size? As far as I'm aware there's no performance difference using 64 bit floating point numbers other than the amount of memory required to store them. All reducing your float size will do is increase your rounding errors.

If your computation is too slow, the problem is more likely the algorithm or the size of the dataset.

ADD REPLYlink modified 3.0 years ago • written 3.0 years ago by Joe19k
Please log in to add an answer.


Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 2161 users visited in the last hour