Question: How to reduce a Python 2.7 dataframe from float 64 to float 8 and then plot the time series data?
0
gravatar for tfhahn
3.0 years ago by
tfhahn50
tfhahn50 wrote:

I need help to cut down on computation time. I have been trying in vain for the past 10 hours to use only 7 instead of 63 decimals. I keep getting the error messages that there is nothing to plot. From my limited understanding of Python 2.7, I believe that the plotting function is expecting float64 instead of float8. I have copy-pasted my Python 2.7 code below. Can somebody please change it so that it will plot float8 time series trajectories? Thanks a lot in advance.

from __future__ import print_function
import os
import timeit
import argparse

import pandas as pd
import matplotlib.pyplot as plt

import gpl
import conf.settings
from util import count_samples
from correlation import CorrelationMatrix


class ExpressionMatrix(object):
    """Expression data loaded from one series CSV or merged across a platform.

    The loaded frame is stored in ``self.df``. Its leading
    ``self.sample_number`` columns (as reported by ``count_samples``) are
    handled as sample columns; the remaining columns are handled as
    annotations.
    """

    def __init__(self, platform=None, series=None, invert=False, limit=0,
                 top=10, **kwargs):
        """Load the expression data and store the run options.

        Args:
            platform: Identifier passed to ``gpl.Platform``; used only when
                ``series`` is not given.
            series: Base name (without ``.csv``) of a single file under
                ``conf.settings.DATA_PATH``. Takes precedence over
                ``platform``.
            invert: Stored as ``self.invert``.
            limit: When non-zero, stop merging platform datasets once the
                number of loaded datasets exceeds this value.
            top: Stored as ``self.top``.
            **kwargs: Every extra keyword is stored as an attribute of the
                same name. A truthy ``unlog`` additionally replaces the
                sample columns with ``2 ** value``.

        Raises:
            ValueError: If neither ``series`` nor ``platform`` is given, or
                if no dataset file of the platform exists on disk.
        """
        # Fail early with a clear message instead of an AttributeError on
        # ``self.df`` further down.
        if not series and not platform:
            raise ValueError("either 'series' or 'platform' must be given")

        data_path = conf.settings.DATA_PATH
        self.sample_number = 0
        self.invert = invert
        self.top = top

        if series:
            self.df = self._load_series(data_path, series)
        else:
            self.df = self._load_platform(data_path, platform, limit)
        self.sample_number = count_samples(self.df)

        for key, value in kwargs.items():
            setattr(self, key, value)

        # ``unlog`` only exists when the caller passed it as a keyword.
        if getattr(self, 'unlog', False):
            # Use the count of the final frame: in the platform case the
            # merged matrix is wider than any single dataset.
            samples = self.sample_number
            self.df.iloc[:, :samples] = 2 ** self.df.iloc[:, :samples]

    @staticmethod
    def _downcast_samples(df, sample_number):
        """Return ``df`` with its first ``sample_number`` columns as float32.

        Assigning the converted values back through ``.iloc`` writes them
        into the existing float64 columns, so the dtype would stay float64.
        Rebuilding the frame from the converted slice and the untouched
        remainder makes the conversion stick.
        """
        samples = df.iloc[:, :sample_number].astype('float32')
        annotations = df.iloc[:, sample_number:]
        return pd.concat([samples, annotations], axis=1)

    @staticmethod
    def _load_series(data_path, series):
        """Read ``<series>.csv`` and downcast its sample columns to float32."""
        file_path = os.path.join(data_path, series + '.csv')
        df = pd.read_csv(file_path, index_col=0)
        sample_number = count_samples(df)

        print(df.dtypes)
        print(df.shape)

        df = ExpressionMatrix._downcast_samples(df, sample_number)
        # DataFrame.info() writes its report to stdout itself.
        df.info()
        return df

    @staticmethod
    def _load_platform(data_path, platform_id, limit):
        """Merge the sample columns of every available dataset of a platform.

        Datasets without a ``.csv`` or ``.tar.csv`` file under ``data_path``
        are skipped. The annotation columns of the last dataset read are
        appended to the merged sample columns.
        """
        platform = gpl.Platform(platform_id, parse=False, meta_only=True)
        matrix = None
        annotations = None
        count = 0

        for dataset in platform.get_series(download=False):
            file_path = os.path.join(data_path, dataset + '.csv')
            if not os.path.exists(file_path):
                file_path = os.path.join(data_path, dataset + '.tar.csv')
                if not os.path.exists(file_path):
                    continue
            df = pd.read_csv(file_path, index_col=0)
            count += 1

            sample_number = count_samples(df)
            expression_matrix = df.iloc[:, :sample_number]
            annotations = df.iloc[:, sample_number:]

            if matrix is None:
                matrix = expression_matrix
            else:
                matrix = pd.concat([matrix, expression_matrix], axis=1)
                print('Concated matrix: %s' % dataset, matrix.shape)
            # NOTE(review): this stops after limit + 1 datasets have been
            # merged; confirm whether ``count >= limit`` was intended.
            if limit and count > limit:
                break

        if matrix is None:
            raise ValueError(
                'no dataset files found for platform %r in %s'
                % (platform_id, data_path))
        return pd.concat([matrix, annotations], axis=1)

    def correlations(self):
        """Return a ``CorrelationMatrix`` built from this expression matrix."""
        return CorrelationMatrix(self)


def main(args):
    """Build the expression matrix, obtain its correlations and time queries.

    Args:
        args: Parsed command-line namespace. All of its entries are
            forwarded to ``ExpressionMatrix`` as keyword arguments; this
            function itself reads ``load``, ``save``, ``similarity``,
            ``choices`` and ``trials``.
    """
    expressions = ExpressionMatrix(**vars(args))
    if args.load:
        # Reuse previously saved correlations instead of recomputing them.
        correlations = CorrelationMatrix(expressions, calc=False)
        correlations.load()
    else:
        correlations = expressions.correlations()
    if args.save:
        correlations.save()

    print(correlations.df.shape)
    print(args.similarity)

    times = []
    if args.choices:
        for _ in range(args.trials):
            start_time = timeit.default_timer()
            correlations.correlate(args.choices)
            times.append(timeit.default_timer() - start_time)

    # Without choices (the default) or with zero trials nothing was timed;
    # averaging an empty list would raise ZeroDivisionError.
    if times:
        print('Average duration: ', sum(times) / len(times))


if __name__ == '__main__':
    # Command-line options, registered in the order they appear in --help.
    option_table = [
        (('--series', '-s'), {'type': str}),
        (('--platform', '-p'), {'type': str}),
        (('--invert', '-i'), {'action': 'store_true'}),
        (('--choices', '-c'), {'type': str, 'nargs': '+', 'default': ''}),
        (('--limit', '-l'), {'type': int, 'default': 0}),
        (('--top', '-t'), {'type': int, 'default': 10}),
        (('--similarity', '-sim'), {
            'type': str,
            'default': 'pearson',
            'help': '''Method of similarity measure which can be either pearson, kendall, spearman (default: pearson).''',
        }),
        (('--trials', '-tr'), {'type': int, 'default': 1}),
        (('--plot', '-plt'), {'action': 'store_true'}),
        (('--unlog', '-ul'), {'action': 'store_true'}),
        (('--save', '-sa'), {'action': 'store_true'}),
        (('--load', '-lo'), {'action': 'store_true'}),
    ]

    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    # Kept as a module-level name: other code in this script may read it.
    args = parser.parse_args()
    main(args)
float dataframe python • 1.2k views
ADD COMMENTlink modified 3.0 years ago by GenoMax96k • written 3.0 years ago by tfhahn50
2

First, I'm not aware of a float8 datatype in numpy. Second, in a minimal example, I wasn't able to recreate the issue with float16:

simple pandas + numpy example

Maybe it might be worth adding some example input data for others to test with.

ADD REPLYlink modified 3.0 years ago • written 3.0 years ago by mmfansler370
2

Why would you want to reduce the float size? As far as I'm aware there's no performance difference using 64 bit floating point numbers other than the amount of memory required to store them. All reducing your float size will do is increase your rounding errors.

If your computation is too slow, the problem is more likely the algorithm or the size of the dataset.

ADD REPLYlink modified 3.0 years ago • written 3.0 years ago by Joe19k
Please log in to add an answer.

Help
Access

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.
Powered by Biostar version 2.3.0
Traffic: 2161 users visited in the last hour
_