# Source code for dammit.fileio.infernal

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import pandas as pd
from dammit.fileio.base import convert_dtypes, ChunkParser


class InfernalParser(ChunkParser):
    '''Chunked parser for the tabular hit output of Infernal's cmscan.

    Iterating over an instance yields pandas DataFrames holding up to
    `chunksize` hits each, with the coordinates already converted to
    0-based, half-open, ordered intervals.

    `filename`, `chunksize` and `raise_empty()` are not defined here;
    they are expected to be provided by the ChunkParser base class.
    '''

    # (column name, dtype) pairs, in the order the whitespace-delimited
    # fields appear on a hit line. Only the final column, `description`,
    # may span several tokens; the pairs are also handed to
    # convert_dtypes as a name -> dtype mapping.
    columns = [('target_name', str),
               ('target_accession', str),
               ('query_name', str),
               ('query_accession', str),
               ('mdl', str),
               ('mdl_from', int),
               ('mdl_to', int),
               ('seq_from', int),
               ('seq_to', int),
               ('strand', str),
               ('trunc', str),
               ('pass', str),
               ('gc', float),
               ('bias', float),
               ('score', float),
               ('e_value', float),
               ('inc', str),
               ('description', str)]

    def __init__(self, filename, **kwargs):
        '''Forward `filename` and any keyword arguments to the base class.'''
        super(InfernalParser, self).__init__(filename, **kwargs)

    def __iter__(self):
        '''Yields DataFrames of length chunksize from a given
        cmscan result file.

        The format uses 1-based, fully closed intervals; when the strand
        is negative, the start coordinate is larger than the end.
        Truly Infernal.

        We convert to proper 0-based, half-open, ordered intervals.

        Blank lines and lines starting with '#' are skipped. If the file
        holds no hit lines at all, `self.raise_empty()` is called once
        the whole file has been read.

        Yields:
            DataFrame: Pandas DataFrame with the cmscan hits.
        '''

        # Every column except the last is exactly one token; whatever
        # tokens remain are re-joined into the free-text description.
        n_fixed = len(self.columns) - 1

        data = []
        n_entries = 0
        with open(self.filename) as fp:
            for ln in fp:
                ln = ln.strip()
                if not ln or ln.startswith('#'):
                    continue
                tokens = ln.split()
                data.append(tokens[:n_fixed] + [' '.join(tokens[n_fixed:])])
                n_entries += 1
                if len(data) >= self.chunksize:
                    yield self._build_df(data)
                    data = []

        if n_entries == 0:
            self.raise_empty()
        if data:
            # Flush the final, partially filled chunk.
            yield self._build_df(data)

    def _build_df(self, data):
        '''Build a DataFrame from parsed rows and normalize coordinates.

        Args:
            data (list): Rows of string fields, one list per hit, in
                the order given by `columns`.

        Returns:
            DataFrame: The hits, with `seq_from`/`seq_to` ordered so
            that the start is never larger than the end, and with
            `mdl_from` and `seq_from` shifted down by one.
        '''
        if not data:
            self.raise_empty()
        df = pd.DataFrame(data, columns=[k for k, _ in self.columns])
        # NOTE(review): the return value is ignored, so convert_dtypes is
        # presumably expected to cast the columns in place -- confirm
        # against dammit.fileio.base.
        convert_dtypes(df, dict(self.columns))
        # fix the evil coordinate system: hits whose start is larger than
        # their end get the two coordinates swapped. The right-hand side
        # is evaluated in full before either assignment, so the swap
        # does not clobber itself.
        sidx = df.seq_from > df.seq_to
        df.loc[sidx, 'seq_from'], df.loc[sidx, 'seq_to'] = \
            df.loc[sidx, 'seq_to'], df.loc[sidx, 'seq_from']
        # 1-based closed -> 0-based half-open: only the start moves.
        df.mdl_from = df.mdl_from - 1
        df.seq_from = df.seq_from - 1
        return df