Source code for dammit.fileio.gff3

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import csv
from itertools import count
import pandas as pd
import sys
import warnings

from dammit.fileio.base import convert_dtypes, ChunkParser, EmptyFile, warn_empty
from dammit.utils import touch

gff_version = '3.2.1'

def id_gen_wrapper():
    ID_GEN = count()
    def get_next():
        return next(ID_GEN)
    return get_next

next_ID = id_gen_wrapper()
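
# Illustrative sketch (not part of the original module): next_ID() is a
# closure over a shared itertools.count(), so each call hands out the next
# integer. The converters below use it to build unique 'ID=homology:N'
# attribute values. The helper name below is hypothetical.
def _example_next_id():
    # Each call advances the module-wide counter by one, continuing from
    # wherever previous callers left it.
    return [next_ID() for _ in range(3)]
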
class GFF3Parser(ChunkParser):

    columns = [('seqid', str),
               ('source', str),
               ('type', str),
               ('start', int),
               ('end', int),
               ('score', float),
               ('strand', str),
               ('phase', float),
               ('attributes', str)]

    def __init__(self, filename, **kwargs):
        super(GFF3Parser, self).__init__(filename, **kwargs)

    @staticmethod
    def decompose_attr_column(col):
        d = {}
        for item in col.strip(';').split(';'):
            key, _, val = item.strip().partition('=')
            d[key] = val.strip('')
        return d

    def empty(self):
        df = super(GFF3Parser, self).empty()
        df['ID'] = None
        df['Name'] = None
        df['Target'] = None
        return df

    def __iter__(self):
        '''Yields DataFrames of length chunksize from a given GTF/GFF file.

        GFF3 uses a 1-based, fully closed interval. Truly the devil's format.
        We convert to proper 0-based, half-open intervals.

        Yields:
            DataFrame: Pandas DataFrame with the results.
        '''

        # Read everything into a DataFrame
        n_chunks = 0
        for group in pd.read_table(self.filename, delimiter='\t',
                                   comment='#',
                                   names=[k for k, _ in self.columns],
                                   na_values='.',
                                   converters={'attributes': self.decompose_attr_column},
                                   chunksize=self.chunksize,
                                   header=None,
                                   dtype=dict(self.columns)):

            # Generate a new DataFrame from the attributes dicts, and merge it in
            group.reset_index(drop=True, inplace=True)
            df = pd.merge(group,
                          pd.DataFrame(list(group.attributes)),
                          left_index=True,
                          right_index=True)
            del df['attributes']

            # Repent, repent!
            df.start = df.start - 1

            n_chunks += 1

            yield df

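# Illustrative sketch (not part of the original module): reading an annotation
# file in chunks with GFF3Parser and concatenating the results. The filename
# is a placeholder, and passing chunksize through to the ChunkParser base
# class is an assumption based on the use of self.chunksize above.
def _example_parse_gff3(filename='annotations.gff3'):
    parser = GFF3Parser(filename, chunksize=10000)
    chunks = list(parser)
    # Fall back to the typed empty frame if the file yielded no records.
    return pd.concat(chunks, ignore_index=True) if chunks else parser.empty()
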
def maf_to_gff3(maf_df, tag='', database='',
                ftype='translated_nucleotide_match'):
    '''Convert a MAF DataFrame to a GFF3 DataFrame ready to be written
    to disk.

    Args:
        maf_df (pandas.DataFrame): The MAF DataFrame. See
            dammit.fileio.maf.MafParser for column specs.
        tag (str): Extra tag to add to the source column.
        database (str): For the database entry in the attributes column.
        ftype (str): The feature type; GMOD compliant if possible.

    Returns:
        pandas.DataFrame: The GFF3 compliant DataFrame.
    '''

    gff3_df = pd.DataFrame()
    gff3_df['seqid'] = maf_df['q_name']
    source = '{0}.LAST'.format(tag) if tag else 'LAST'
    gff3_df['source'] = [source] * len(maf_df)
    gff3_df['type'] = [ftype] * len(maf_df)
    gff3_df['start'] = maf_df['q_start']
    gff3_df['end'] = maf_df['q_start'] + maf_df['q_aln_len']
    gff3_df['score'] = maf_df['E']
    gff3_df['strand'] = maf_df['q_strand']
    gff3_df['phase'] = ['.'] * len(maf_df)

    def build_attr(row):
        data = []
        data.append('ID=homology:{0}'.format(next_ID()))
        data.append('Name={0}'.format(row.s_name))
        data.append('Target={0} {1} {2} {3}'.format(row.s_name,
                                                    row.s_start,
                                                    row.s_start + row.s_aln_len,
                                                    row.s_strand))
        if database:
            data.append('database={0}'.format(database))

        return ';'.join(data)

    gff3_df['attributes'] = maf_df.apply(build_attr, axis=1)
    GFF3Writer.mangle_coordinates(gff3_df)

    return gff3_df

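# Illustrative sketch (not part of the original module): converting a minimal,
# hand-built MAF-like DataFrame. Only the columns actually referenced by
# maf_to_gff3 are filled in, and all values (names, coordinates, E-value,
# database tag) are made up for the example; real input would come from
# dammit.fileio.maf.MafParser.
def _example_maf_to_gff3():
    maf_df = pd.DataFrame({'q_name': ['transcript1'],
                           'q_start': [0],
                           'q_aln_len': [100],
                           'q_strand': ['+'],
                           'E': [1e-30],
                           's_name': ['sprot_hit1'],
                           's_start': [10],
                           's_aln_len': [100],
                           's_strand': ['+']})
    # Returns a GFF3-shaped DataFrame with 1-based coordinates restored.
    return maf_to_gff3(maf_df, tag='example', database='swissprot')
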
def shmlast_to_gff3(df, database=''):
    return maf_to_gff3(df, tag='shmlast', database=database,
                       ftype='conditional_reciprocal_best_LAST')

def hmmscan_to_gff3(hmmscan_df, tag='', database=''):
    gff3_df = pd.DataFrame()
    gff3_df['seqid'] = hmmscan_df['query_name']
    source = '{0}.HMMER'.format(tag) if tag else 'HMMER'
    gff3_df['source'] = [source] * len(hmmscan_df)
    gff3_df['type'] = ['protein_hmm_match'] * len(hmmscan_df)
    gff3_df['start'] = hmmscan_df['ali_coord_from']
    gff3_df['end'] = hmmscan_df['ali_coord_to']
    # Confirm whether this is the appropriate value to use
    gff3_df['score'] = hmmscan_df['domain_i_evalue']
    gff3_df['strand'] = ['.'] * len(hmmscan_df)
    gff3_df['phase'] = ['.'] * len(hmmscan_df)

    def build_attr(row):
        data = []
        data.append('ID=homology:{0}'.format(next_ID()))
        data.append('Name={0}'.format(row.target_name))
        data.append('Target={0} {1} {2} +'.format(row.target_name,
                                                  row.hmm_coord_from + 1,
                                                  row.hmm_coord_to))
        data.append('Note={0}'.format(row.description))
        data.append('accuracy={0}'.format(row.accuracy))
        data.append('env_coords={0} {1}'.format(row.env_coord_from + 1,
                                                row.env_coord_to))
        if database:
            data.append('Dbxref="{0}:{1}"'.format(database,
                                                  row.target_accession))

        return ';'.join(data)

    gff3_df['attributes'] = hmmscan_df.apply(build_attr, axis=1)
    GFF3Writer.mangle_coordinates(gff3_df)

    return gff3_df

def cmscan_to_gff3(cmscan_df, tag='', database=''):
    gff3_df = pd.DataFrame()
    gff3_df['seqid'] = cmscan_df['query_name']
    source = '{0}.Infernal'.format(tag) if tag else 'Infernal'
    gff3_df['source'] = [source] * len(cmscan_df)
    # For now, using:
    # http://www.sequenceontology.org/browser/current_svn/term/SO:0000122
    # There are more specific features for secondary structure which should
    # be extracted from the Rfam results eventually
    gff3_df['type'] = ['RNA_sequence_secondary_structure'] * len(cmscan_df)
    gff3_df['start'] = cmscan_df['seq_from']
    gff3_df['end'] = cmscan_df['seq_to']
    gff3_df['score'] = cmscan_df['e_value']
    gff3_df['strand'] = cmscan_df['strand']
    gff3_df['phase'] = ['.'] * len(cmscan_df)

    def build_attr(row):
        data = []
        data.append('ID=homology:{0}'.format(next_ID()))
        data.append('Name={0}'.format(row.target_name))
        data.append('Target={0} {1} {2} +'.format(row.target_name,
                                                  row.mdl_from + 1,
                                                  row.mdl_to))
        data.append('Note={0}'.format(row.description))
        if database:
            data.append('Dbxref="{0}:{1}"'.format(database,
                                                  row.target_accession))
        data.append('trunc={0}'.format(row.trunc))
        data.append('bitscore={0}'.format(row.score))

        return ';'.join(data)

    gff3_df['attributes'] = cmscan_df.apply(build_attr, axis=1)
    GFF3Writer.mangle_coordinates(gff3_df)

    return gff3_df

class GFF3Writer(object):

    version_line = '##gff-version 3.2.1'

    def __init__(self, filename=None, converter=None, **converter_kwds):
        self.filename = filename
        self.converter = converter
        self.converter_kwds = converter_kwds
        self.created = False

    def convert(self, data_df):
        return self.converter(data_df, **self.converter_kwds)

    def write(self, data_df, version_line=True):
        '''Write the given data to a GFF3 file, using the converter if given.

        Generates an empty file if given an empty DataFrame.

        Args:
            version_line (bool): If True, write the GFF3 version line at the
                beginning of the file. Note that this will cause an existing
                file to be overwritten, but the line will only be added in the
                first call to `write`.
        '''

        if self.filename is None:
            raise ValueError('Trying to write to filename None! Give GFF3Writer'
                             ' a filename.')

        if len(data_df) == 0:
            warn_empty('Writing out an empty GFF3 file to {0}'.format(self.filename))
            touch(self.filename)
            return

        if not self.created and version_line is True:
            with open(self.filename, 'w') as fp:
                fp.write(self.version_line + '\n')

        with open(self.filename, 'a') as fp:
            self.created = True
            if self.converter is not None:
                data_df = self.convert(data_df)
            else:
                self.mangle_coordinates(data_df)

            data_df.to_csv(fp, sep='\t', na_rep='.',
                           columns=[k for k, v in GFF3Parser.columns],
                           index=False, header=False,
                           quoting=csv.QUOTE_NONE,
                           float_format='%.6e')

    @staticmethod
    def mangle_coordinates(gff3_df):
        '''Although 1-based fully closed intervals are of the Beast,
        we will respect the convention in the interests of peace between
        worlds and compatibility.

        Args:
            gff3_df (DataFrame): The DataFrame to "fix".
        '''
        gff3_df.start += 1

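# Illustrative sketch (not part of the original module): writing shmlast
# results to GFF3 by handing GFF3Writer a converter. The output filename and
# the shape of shmlast_df are assumptions for the example; extra keyword
# arguments given to GFF3Writer are forwarded to the converter.
def _example_write_shmlast(shmlast_df, filename='shmlast.gff3'):
    writer = GFF3Writer(filename, converter=shmlast_to_gff3,
                        database='swissprot')
    # The converter runs inside write(); the '##gff-version' pragma is only
    # written on the first call, and later calls append to the same file.
    writer.write(shmlast_df)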