# Source code for dammit.fileio.infernal

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import pandas as pd
from dammit.fileio.base import convert_dtypes, ChunkParser


class InfernalParser(ChunkParser):
    '''Chunked parser for Infernal ``cmscan`` tabular result files.

    Iterating over an instance yields pandas DataFrames holding at most
    ``chunksize`` hits each, with coordinates converted to 0-based,
    half-open, ordered intervals.

    ``filename``, ``chunksize`` and ``raise_empty()`` are used here but are
    not defined in this class; they are expected to be provided by
    ``ChunkParser``.
    '''

    # (column name, dtype) pairs, in the order the fields appear on each
    # result line. The final column, `description`, absorbs every trailing
    # whitespace-separated token on the line.
    columns = [('target_name', str),
               ('target_accession', str),
               ('query_name', str),
               ('query_accession', str),
               ('mdl', str),
               ('mdl_from', int),
               ('mdl_to', int),
               ('seq_from', int),
               ('seq_to', int),
               ('strand', str),
               ('trunc', str),
               ('pass', str),
               ('gc', float),
               ('bias', float),
               ('score', float),
               ('e_value', float),
               ('inc', str),
               ('description', str)]

    def __init__(self, filename, **kwargs):
        '''Forward `filename` and any keyword arguments to ChunkParser.'''
        super(InfernalParser, self).__init__(filename, **kwargs)

    def __iter__(self):
        '''Yields DataFrames of length chunksize from a given cmscan
        result file.

        The format uses 1-based, closed (inclusive) intervals; when the
        strand is negative, the start coordinate is larger than the end.
        Truly Infernal. We convert to proper 0-based, half-open, ordered
        intervals.

        Blank lines and lines starting with ``#`` are skipped. If the file
        contains no hit lines at all, ``self.raise_empty()`` is called.

        Yields:
            DataFrame: Pandas DataFrame with the cmscan hits.
        '''
        # Number of leading single-token fields; everything after them is
        # re-joined into the free-text `description` column.
        n_fixed = len(self.columns) - 1

        data = []
        n_entries = 0
        with open(self.filename) as fp:
            for ln in fp:
                ln = ln.strip()
                if not ln or ln.startswith('#'):
                    continue
                tokens = ln.split()
                data.append(tokens[:n_fixed] + [' '.join(tokens[n_fixed:])])
                n_entries += 1
                if len(data) >= self.chunksize:
                    yield self._build_df(data)
                    data = []

        if n_entries == 0:
            self.raise_empty()
        # Flush the final, possibly short, chunk.
        if data:
            yield self._build_df(data)

    def _build_df(self, data):
        '''Build a typed DataFrame from parsed rows and fix the coordinates.

        Args:
            data (list): Rows, each a list of string fields in the order
                given by `columns`.

        Returns:
            DataFrame: One row per hit, with `seq_from` <= `seq_to` and
            with `mdl_from` and `seq_from` shifted down by one.
        '''
        if not data:
            self.raise_empty()
        df = pd.DataFrame(data, columns=[k for k, _ in self.columns])
        # NOTE(review): relies on convert_dtypes casting the columns of `df`
        # in place (its return value is unused) -- confirm in fileio.base.
        convert_dtypes(df, dict(self.columns))

        # fix the evil coordinate system: put reversed hits in ascending
        # order. The right-hand side is evaluated in full before either
        # assignment, so this is a genuine swap.
        sidx = df.seq_from > df.seq_to
        df.loc[sidx, 'seq_from'], df.loc[sidx, 'seq_to'] = \
            df.loc[sidx, 'seq_to'], df.loc[sidx, 'seq_from']

        # 1-based closed -> 0-based half-open: only the start moves.
        df['mdl_from'] = df['mdl_from'] - 1
        df['seq_from'] = df['seq_from'] - 1

        return df