# Source code for dammit.fileio.base

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

from itertools import count
from sys import stderr
import pandas as pd


class EmptyFile(Exception):
    '''Exception signalling that a file to be parsed is empty.'''


def warn_empty(msg):
    '''Warn that a file is empty.

    Prints ``WARNING: Empty file: <msg>`` to standard error, with a blank
    line before and after the message.

    Args:
        msg: Value identifying the empty file; it is formatted into the
            warning text.
    '''
    print('\nWARNING: Empty file: {0}\n'.format(msg), file=stderr)


def next_or_raise(fp):
    '''Build a reader that returns the next line of `fp` and raises an
    exception if it's empty.

    Args:
        fp (file): Open file-like object providing `readline()`.

    Returns:
        function: A callable `func(raise_exc=True)` returning the next line
        read from `fp`. When `raise_exc` is exactly `True` and the line is
        the empty string (what `readline()` gives at end of file), it
        raises `RuntimeError` instead.
    '''
    # Counts calls made through this closure, starting at zero; the value
    # is only used to label the error message.
    calls = count()

    def func(raise_exc=True):
        line = fp.readline()
        n = next(calls)
        # Identity check on purpose: only the literal True enables raising.
        if raise_exc is True and line == '':
            raise RuntimeError('Malformed file (line {0})'.format(n))
        return line

    return func


def convert_dtypes(df, dtypes):
    '''Convert the columns of a DataFrame to the types specified
    in the given dictionary, inplace.

    Columns with no entry in `dtypes` are left untouched, and entries in
    `dtypes` that name no column of `df` are ignored.

    Args:
        df (DataFrame): The DataFrame to convert.
        dtypes (dict): Dictionary mapping columns to types.
    '''

    for col in df.columns:
        # Guard only the lookup: a KeyError raised by the conversion itself
        # should surface instead of being mistaken for "no dtype given".
        try:
            target = dtypes[col]
        except KeyError:
            continue
        df[col] = df[col].astype(target)


class BaseParser(object):
    '''Base class for parsers; records the file a parser works on.'''

    def __init__(self, filename):
        '''
        Args:
            filename (str): Path to the file to parse.
        '''
        self.filename = filename

    def raise_empty(self):
        '''Report this parser's file as empty.

        Raises:
            EmptyFile: Always, with a message naming `self.filename`.
        '''
        raise EmptyFile('Empty file: {0}'.format(self.filename))


class ChunkParser(BaseParser):
    '''Parser base class that yields a file's contents in DataFrame chunks.

    `__iter__` is left for subclasses to implement. `empty()` reads
    `self.columns`, which this class never sets; it is iterated as
    `(name, dtype)` pairs, so a subclass has to provide it in that form.
    '''

    def __init__(self, filename, chunksize=10000):
        '''
        Args:
            filename (str): Path to the file to parse.
            chunksize (int): Number of entries to yield per call.
        '''

        self.chunksize = chunksize
        super(ChunkParser, self).__init__(filename)

    def __iter__(self):
        '''Iterate over the parsed chunks; subclasses must override this.

        Raises:
            NotImplementedError: When iteration starts on this base class.
        '''
        raise NotImplementedError()
        # Unreachable, but its presence makes this a generator function, so
        # the error is raised on the first next() and not by iter() itself.
        yield

    def read(self):
        '''Read the entire file at once and return as a single DataFrame.

        Returns:
            DataFrame: All chunks concatenated with a fresh index, or the
            result of `empty()` if `EmptyFile` or `ValueError` is raised
            while iterating or concatenating.
        '''
        try:
            return pd.concat(self, ignore_index=True)
        except (EmptyFile, ValueError):
            # no objects, return an empty dataframe
            return self.empty()

    def empty(self):
        '''Get an empty DataFrame with the appropriate columns.

        Returns:
            DataFrame: Zero-row frame whose columns are the names in
            `self.columns`, converted to the paired dtypes.
        '''

        df = pd.DataFrame(columns=[name for name, _ in self.columns])
        convert_dtypes(df, dict(self.columns))
        return df