Source code for dammit.fileio.base

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

from itertools import count
from sys import stderr
import pandas as pd


class EmptyFile(Exception):
    '''Exception type used to signal that a file is empty.'''
def warn_empty(msg):
    '''Print an empty-file warning to standard error.

    Args:
        msg: Value formatted into the warning after the
            "WARNING: Empty file: " prefix.
    '''
    warning = '\nWARNING: Empty file: {0}\n'.format(msg)
    print(warning, file=stderr)
def next_or_raise(fp):
    '''Build a line reader for `fp` that can fail loudly at end of input.

    Args:
        fp: Object exposing a `readline()` method.

    Returns:
        function: A callable taking `raise_exc` (default True). Each call
        reads one line from `fp` and returns it. When the line is the empty
        string and `raise_exc` is exactly True, a RuntimeError carrying the
        zero-based count of previous calls is raised instead.
    '''
    call_index = count()

    def reader(raise_exc=True):
        text = fp.readline()
        # Advance the counter on every call, including calls that go on to
        # raise, so the reported number tracks how many reads were attempted.
        position = next(call_index)
        if text != '' or raise_exc is not True:
            return text
        raise RuntimeError('Malformed file (line {0})'.format(position))

    return reader
def convert_dtypes(df, dtypes):
    '''Convert the columns of a DataFrame to the types specified in the
    given dictionary, inplace.

    Columns with no entry in `dtypes` are left untouched.

    Args:
        df (DataFrame): The DataFrame to convert.
        dtypes (dict): Dictionary mapping columns to types.

    Raises:
        Whatever `astype` raises for a column that does have an entry in
        `dtypes` (including KeyError); such errors are not suppressed.
    '''
    for column in df.columns:
        # Guard only the lookup: a KeyError here means "no type requested".
        # A KeyError raised by astype itself is a real conversion problem
        # and must not be silently swallowed.
        try:
            target = dtypes[column]
        except KeyError:
            continue
        df[column] = df[column].astype(target)
class BaseParser(object):
    '''Base parser that stores the name of the file it was given.'''

    def __init__(self, filename):
        '''
        Args:
            filename: Stored as `self.filename` and used in the message
                produced by `raise_empty`.
        '''
        self.filename = filename

    def raise_empty(self):
        '''Raise EmptyFile with a message naming `self.filename`.

        Raises:
            EmptyFile: Always.
        '''
        message = 'Empty file: {0}'.format(self.filename)
        raise EmptyFile(message)
class ChunkParser(BaseParser):
    '''Parser base class whose iteration is meant to yield DataFrames.

    `__iter__` is not implemented here, and `empty` reads a `self.columns`
    attribute that this class does not define. Presumably subclasses
    provide both (`columns` as a sequence of `(name, dtype)` pairs) --
    confirm against the concrete parsers.
    '''

    def __init__(self, filename, chunksize=10000):
        '''
        Args:
            filename (str): Path to the file to parse.
            chunksize (int): Number of entries to yield per call.
        '''
        self.chunksize = chunksize
        super(ChunkParser, self).__init__(filename)

    def __iter__(self):
        '''Placeholder iterator; raises NotImplementedError when advanced.'''
        raise NotImplementedError()
        # The unreachable yield keeps this a generator function, so the
        # error surfaces on the first `next()` rather than on `iter()`.
        yield

    def read(self):
        '''Read the entire file at once and return as a single DataFrame.

        Returns:
            DataFrame: Every chunk produced by iterating over this parser,
            concatenated with a fresh index. If iteration raises EmptyFile
            or the concatenation raises ValueError, the result of
            `self.empty()` is returned instead.
        '''
        try:
            combined = pd.concat(self, ignore_index=True)
        except (EmptyFile, ValueError):
            # no objects, return an empty dataframe
            return self.empty()
        return combined

    def empty(self):
        '''Get an empty DataFrame with the appropriate columns.

        Returns:
            DataFrame: A frame with no rows, whose column names are taken
            from `self.columns` and whose columns are converted with
            `convert_dtypes` using `dict(self.columns)`.
        '''
        names = [name for name, _ in self.columns]
        frame = pd.DataFrame(columns=names)
        convert_dtypes(frame, dict(self.columns))
        return frame