How to reduce a Python 2.7 dataframe from float 64 to float 8 and then plot the time series data?
0
0
Entering edit mode
6.2 years ago
tfhahn ▴ 50

I need help to cut down on computation time. I have been trying in vain for the past 10 hours to use only 7 instead of 63 decimals. I keep getting the error messages that there is nothing to plot. From my limited understanding of Python 2.7, I believe that the plotting function is expecting float64 instead of float8. I have copy-pasted my Python 2.7 code below. Can somebody please change it so that it will plot float8 time series trajectories? Thanks a lot in advance.

from __future__ import print_function
import os
import timeit
import argparse

import pandas as pd
import matplotlib.pyplot as plt

import gpl
import conf.settings
from util import count_samples
from correlation import CorrelationMatrix


class ExpressionMatrix(object):
    """Gene-expression table loaded from one series or a whole platform.

    After construction, ``self.df`` holds the data: the first
    ``self.sample_number`` columns are expression values, the remaining
    columns are annotations.
    """

    def __init__(self, platform=None, series=None, invert=False, limit=0,
                 top=10, **kwargs):
        """Load the expression matrix from CSV files under DATA_PATH.

        Args:
            platform: GPL platform accession; every available series of the
                platform is loaded and concatenated column-wise. Used only
                when *series* is falsy.
            series: single series accession; loads ``<series>.csv``.
            invert: stored on the instance for downstream consumers.
            limit: when loading a platform, stop after this many series
                (0 means no limit).
            top: stored on the instance for downstream consumers.
            **kwargs: extra CLI options; each key is set as an instance
                attribute (e.g. ``unlog``, ``plot``, ``similarity``).

        Raises:
            ValueError: if neither *series* nor *platform* is given.
        """
        data_path = conf.settings.DATA_PATH
        self.sample_number = 0
        self.invert = invert
        self.top = top
        # Default for flags that may be absent from **kwargs; the original
        # raised AttributeError on self.unlog when it was not passed.
        self.unlog = False

        if series:
            file_path = os.path.join(data_path, series + '.csv')
            self.df = pd.read_csv(file_path, index_col=0)
            sample_number = count_samples(self.df)

            print(self.df.dtypes)
            print(self.df.shape)

            # Down-cast the expression columns. float32 is the smallest
            # float dtype numpy/pandas/matplotlib support -- there is no
            # "float8" dtype.
            self.df.iloc[:, :sample_number] = \
                self.df.iloc[:, :sample_number].astype('float32')
            # BUG FIX: original line was ``printself.df.info)`` -- a
            # SyntaxError. Also, DataFrame.info is a method, so call it.
            print(self.df.info())

        elif platform:
            count = 0
            # BUG FIX: the original passed the module-global ``args.platform``
            # instead of the ``platform`` parameter, breaking any use of this
            # class outside the command-line script.
            platform = gpl.Platform(platform, parse=False, meta_only=True)
            series = platform.get_series(download=False)

            for dataset in series:
                file_path = os.path.join(data_path, dataset + '.csv')
                if not os.path.exists(file_path):
                    # Fall back to the alternative naming scheme.
                    file_path = os.path.join(data_path, dataset + '.tar.csv')
                    if not os.path.exists(file_path):
                        continue
                df = pd.read_csv(file_path, index_col=0)
                count += 1

                sample_number = count_samples(df)
                expression_matrix = df.iloc[:, :sample_number]

                if count == 1:
                    matrix = expression_matrix
                else:
                    matrix = pd.concat([matrix, expression_matrix], axis=1)
                    print('Concated matrix: %s' % dataset, matrix.shape)
                if limit and count > limit:
                    break
            # NOTE(review): annotations are taken from the last series read;
            # presumably they are identical across all series of a platform
            # -- verify against the data.
            annotations = df.iloc[:, sample_number:]
            self.df = pd.concat([matrix, annotations], axis=1)
        else:
            # Previously this fell through and failed later with an opaque
            # AttributeError on the missing self.df.
            raise ValueError('Either a series or a platform is required.')

        self.sample_number = count_samples(self.df)

        for key, value in kwargs.items():
            setattr(self, key, value)

        if self.unlog:
            # Undo an assumed log2 transform of the expression values.
            # BUG FIX: use self.sample_number (recomputed on the final
            # frame), not the stale local from the last series read.
            self.df.iloc[:, :self.sample_number] = \
                2 ** self.df.iloc[:, :self.sample_number]

    def correlations(self):
        """Return a CorrelationMatrix computed from this expression matrix."""
        return CorrelationMatrix(self)


def main(args):
    """Run the command-line workflow: load expressions, build (or load)
    correlations, optionally save them, then time ``correlate`` calls.

    Args:
        args: argparse.Namespace with the flags declared in ``__main__``
            (series/platform/choices/trials/load/save/similarity, ...).
    """
    expressions = ExpressionMatrix(**vars(args))
    if args.load:
        # Load a previously saved matrix instead of recomputing it.
        correlations = CorrelationMatrix(expressions, calc=False)
        correlations.load()
    else:
        correlations = expressions.correlations()
    if args.save:
        correlations.save()

    print(correlations.df.shape)
    print(args.similarity)

    times = []
    if args.choices:
        for _ in range(args.trials):
            start_time = timeit.default_timer()
            correlations.correlate(args.choices)
            times.append(timeit.default_timer() - start_time)
    # BUG FIX: the original divided by len(times) unconditionally, raising
    # ZeroDivisionError whenever --choices was not supplied.
    if times:
        print('Average duration: ', sum(times) / len(times))


if __name__ == '__main__':
    # Command-line interface; flags mirror ExpressionMatrix/main options.
    cli = argparse.ArgumentParser()
    add = cli.add_argument
    add('--series', '-s', type=str)
    add('--platform', '-p', type=str)
    add('--invert', '-i', action='store_true')
    add('--choices', '-c', type=str, nargs='+', default='')
    add('--limit', '-l', type=int, default=0)
    add('--top', '-t', type=int, default=10)
    add('--similarity', '-sim', type=str, default='pearson',
        help='''Method of similarity measure which can be either pearson, kendall, spearman (default: pearson).''')
    add('--trials', '-tr', type=int, default=1)
    add('--plot', '-plt', action='store_true')
    add('--unlog', '-ul', action='store_true')
    add('--save', '-sa', action='store_true')
    add('--load', '-lo', action='store_true')
    # Kept as a module-level name: other code in this file reads ``args``.
    args = cli.parse_args()
    main(args)
Python float dataframe • 2.3k views
ADD COMMENT
2
Entering edit mode

First, I'm not aware of a float8 datatype in numpy. Second, in a minimal example, I wasn't able to recreate the issue with float16:

simple pandas + numpy example

Maybe it might be worth adding some example input data for others to test with.

ADD REPLY
2
Entering edit mode

Why would you want to reduce the float size? As far as I'm aware there's no performance difference using 64 bit floating point numbers other than the amount of memory required to store them. All reducing your float size will do is increase your rounding errors.

If your computation is too slow, the problem is more likely the algorithm or the size of the dataset.

ADD REPLY

Login before adding your answer.

Traffic: 1982 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by version 2.3.6