Source code for dammit.tasks.gff

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import os

from doit.tools import run_once, create_folder, LongRunning
from doit.task import clean_targets, dict_to_task
import pandas as pd
from shmlast import hits

from dammit.utils import which, doit_task, touch
from dammit.fileio import EmptyFile
from dammit.fileio.maf import MafParser
from dammit.fileio.infernal import InfernalParser
from dammit.fileio.hmmer import HMMerParser
from dammit.fileio.gff3 import (GFF3Writer, maf_to_gff3, shmlast_to_gff3,
                           hmmscan_to_gff3, cmscan_to_gff3)


[docs]@doit_task def get_maf_best_hits_task(maf_fn, output_fn): '''Doit task to get the best hits from a lastal MAF file. Args: maf_fn (str): Path to the MAF file. output_fn (str): Path to store resulting CSV file. Returns: dict: A doit task. ''' hits_mgr = hits.BestHits() def cmd(): # can write out an empty file df = MafParser(maf_fn).read() df = hits_mgr.best_hits(df) df.to_csv(output_fn, index=False) name = 'maf_best_hits:{0}-{1}'.format(maf_fn, output_fn) return {'name': name, 'actions': [cmd], 'targets': [output_fn], 'file_dep': [maf_fn], 'clean': [clean_targets]}
[docs]@doit_task def get_maf_gff3_task(input_filename, output_filename, database): '''Given either a raw MAF file or a CSV file with the proper MAF colums, convert it to GFF3 and save the results. Args: input_filename (str): The input MAF or CSV. output_filename (str): Destination for GFF3 output. database (str): Tag to use in the GFF3 `Dbxref` field. Returns: dict: A doit task. ''' name = 'maf-gff3:' + os.path.basename(output_filename) def cmd(): if input_filename.endswith('.csv') or input_filename.endswith('.tsv'): it = pd.read_csv(input_filename, chunksize=10000) else: it = MafParser(input_filename) writer = GFF3Writer(output_filename, converter=maf_to_gff3, database=database) try: for group in it: writer.write(group) except EmptyFile: touch(output_filename) return {'name': name, 'actions': ['rm -f {0}'.format(output_filename), cmd], 'file_dep': [input_filename], 'targets': [output_filename], 'clean': [clean_targets]}
[docs]@doit_task def get_shmlast_gff3_task(input_filename, output_filename, database): '''Given the CSV output from shmlast, convert it to GFF3 and save the results. Args: input_filename (str): The input CSV. output_filename (str): Destination for GFF3 output. database (str): Tag to use in the GFF3 `Dbxref` field. Returns: dict: A doit task. ''' name = 'shmlast-gff3:' + os.path.basename(output_filename) def cmd(): it = pd.read_csv(input_filename, chunksize=10000) writer = GFF3Writer(output_filename, converter=shmlast_to_gff3, database=database) try: for group in it: writer.write(group) except EmptyFile: touch(output_filename) return {'name': name, 'actions': ['rm -f {0}'.format(output_filename), cmd], 'file_dep': [input_filename], 'targets': [output_filename], 'clean': [clean_targets]}
[docs]@doit_task def get_hmmscan_gff3_task(input_filename, output_filename, database): '''Given HMMER output converted to CSV, convert it to GFF3 and save the results. CSV generated from the DataFrame(s) returned by the HMMerParser. Args: input_filename (str): The input CSV. output_filename (str): Destination for GFF3 output. database (str): Tag to use in the GFF3 `Dbxref` field. Returns: dict: A doit task. ''' name = 'hmmscan-gff3:' + os.path.basename(output_filename) def cmd(): writer = GFF3Writer(output_filename, converter=hmmscan_to_gff3, database=database) try: for group in pd.read_csv(input_filename, chunksize=10000): writer.write(group) except EmptyFile as e: touch(output_filename) return {'name': name, 'actions': ['rm -f {0}'.format(output_filename), cmd], 'file_dep': [input_filename], 'targets': [output_filename], 'clean': [clean_targets]}
[docs]@doit_task def get_cmscan_gff3_task(input_filename, output_filename, database): '''Given raw input from Infernal's cmscan, convert it to GFF3 and save the results. Args: input_filename (str): The input CSV. output_filename (str): Destination for GFF3 output. database (str): Tag to use in the GFF3 `Dbxref` field. Returns: dict: A doit task. ''' name = 'cmscan-gff3:' + os.path.basename(output_filename) def cmd(): writer = GFF3Writer(output_filename, converter=cmscan_to_gff3, database=database) try: for group in InfernalParser(input_filename): writer.write(group) except EmptyFile as e: touch(output_filename) return {'name': name, 'actions': ['rm -f {0}'.format(output_filename), cmd], 'file_dep': [input_filename], 'targets': [output_filename], 'clean': [clean_targets]}
[docs]@doit_task def get_gff3_merge_task(gff3_filenames, output_filename): '''Given a list of GFF3 files, merge them all together. Args: gff3_filenames (list): Paths to the GFF3 files. output_filename (str): Path to pipe the results. Returns: dict: A doit task. ''' name = 'gff3-merge:{0}'.format(os.path.basename(output_filename)) merge_cmd = 'echo "{v}" > {out}; cat {f} | sed \'/^#/ d\''\ ' | sort | sed \'/^$/d\' >> {out}'.format(v=GFF3Writer.version_line, f=' '.join(gff3_filenames), out=output_filename) return {'name': name, 'actions': [merge_cmd], 'file_dep': gff3_filenames, 'targets': [output_filename], 'clean': [clean_targets]}