# Source code for dammit.fileio.hmmer

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import re

import pandas as pd
from dammit.fileio.base import next_or_raise, convert_dtypes, ChunkParser

class HMMerParser(ChunkParser):
    '''Chunked parser for HMMER per-domain tabular output.

    The column layout below matches the whitespace-delimited per-domain
    table written by hmmscan/hmmsearch (``--domtblout``): 22 fixed fields
    followed by a free-text description.

    Iterating over an instance yields pandas DataFrames holding at most
    ``self.chunksize`` hits each. ``self.filename`` and ``self.chunksize``
    are expected to be set up by ``ChunkParser.__init__``.
    '''

    # (column name, dtype) for every field of a result row, in file order.
    # The final column, `description`, is free text and absorbs whatever
    # tokens remain after the fixed fields.
    columns = [('target_name', str),
                ('target_accession', str),
                ('tlen', int),
                ('query_name', str),
                ('query_accession', str),
                ('query_len', int),
                ('full_evalue', float),
                ('full_score', float),
                ('full_bias', float),
                ('domain_num', int),
                ('domain_total', int),
                ('domain_c_evalue', float),
                ('domain_i_evalue', float),
                ('domain_score', float),
                ('domain_bias', float),
                ('hmm_coord_from', int),
                ('hmm_coord_to', int),
                ('ali_coord_from', int),
                ('ali_coord_to', int),
                ('env_coord_from', int),
                ('env_coord_to', int),
                ('accuracy', float),
                ('description', str)]

    def __init__(self, filename, query_regex=None, query_basename='Transcript', **kwargs):
        '''Set up the parser.

        Args:
            filename (str): Path to the HMMER result file.
            query_regex: Regular expression (compiled pattern or pattern
                string) with a named group ``name``, used to pull the short
                query name out of the full query name. When None, a pattern
                matching ``<query_basename>_<digits>`` is used.
            query_basename (str): Prefix used to build the default pattern;
                it is interpolated into the pattern as-is (not escaped).
            **kwargs: Passed through to ``ChunkParser.__init__``.
        '''
        if query_regex is None:
            query_regex = re.compile(r'(?P<name>{basename}_[0-9]*)'.format(basename=query_basename))
        elif isinstance(query_regex, str):
            # Accept a plain pattern string as well as a compiled pattern.
            query_regex = re.compile(query_regex)
        self.query_regex = query_regex

        super(HMMerParser, self).__init__(filename, **kwargs)

    def __iter__(self):
        '''Yields DataFrames of at most ``self.chunksize`` hits from the
        HMMER result file at ``self.filename``.

        HMMER reports 1-based, fully closed intervals. We convert to
        0-based, half-open intervals (see `_build_df`).

        Blank lines and lines starting with ``#`` are skipped.

        Yields:
            DataFrame: Pandas DataFrame with the hits of one chunk.
        '''

        # Number of fixed (single-token) fields; everything after them is
        # joined back together as the free-text description.
        n_fixed = len(self.columns) - 1

        data = []
        n_entries = 0
        with open(self.filename) as fp:
            for ln in fp:
                # Lines read from a file keep their newline, so test the
                # stripped text to actually catch blank lines.
                if not ln.strip() or ln.startswith('#'):
                    continue

                tokens = ln.split()
                data.append(tokens[:n_fixed] + [' '.join(tokens[n_fixed:])])
                n_entries += 1
                if len(data) >= self.chunksize:
                    yield self._build_df(data)
                    data = []

        # No data rows at all: defer to the base class' empty-file handling.
        if n_entries == 0:
            self.raise_empty()
        # Flush the final, possibly short, chunk.
        if data:
            yield self._build_df(data)

    def _build_df(self, data):
        '''Build a DataFrame from a list of tokenized result rows.

        Adds a ``full_query_name`` column with the untouched query name,
        replaces ``query_name`` with the ``name`` group extracted by
        ``self.query_regex``, and shifts the ``*_coord_from`` columns down
        by one to turn 1-based closed intervals into 0-based half-open ones.

        Args:
            data (list): Rows, each a list with one entry per column.
        Returns:
            DataFrame: The converted hits.
        Raises:
            KeyError: If the query regex has no ``name`` group.
            ValueError: If the query regex does not match a query name.
        '''
        if not data:
            self.raise_empty()

        def split_query(item):
            # Extract the short query name from the full query name.
            match = self.query_regex.search(item)
            if match is None:
                raise ValueError('No results from regex split; did something go '
                                 'wrong with a custom --name? '
                                 '(query: {0!r})'.format(item))
            try:
                return match.groupdict()['name']
            except KeyError:
                raise KeyError('Header regex should have a "name" field.')

        df = pd.DataFrame(data, columns=[k for k, _ in self.columns])
        # Return value is not used here; presumably the helper casts the
        # columns of `df` in place -- confirm against dammit.fileio.base.
        convert_dtypes(df, dict(self.columns))
        df['full_query_name'] = df.query_name
        df['query_name'] = df.query_name.apply(split_query)
        # fix the evil coordinate system: only the start moves, since a
        # closed 1-based end equals the half-open 0-based end.
        for col in ('hmm_coord_from', 'ali_coord_from', 'env_coord_from'):
            df[col] = df[col] - 1
        return df