# Source code for dammit.annotate

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

from collections import OrderedDict
import logging
import os
from os import path
import sys

from shmlast.app import CRBL

from dammit.handler import TaskHandler
from dammit.profile import add_profile_actions

from dammit.tasks.fastx import (get_transcriptome_stats_task,
                          get_rename_transcriptome_task)
from dammit.tasks.report import get_annotate_fasta_task
from dammit.tasks.busco import BuscoTask
from dammit.tasks.utils import get_group_task
from dammit.tasks.gff import (get_maf_gff3_task,
                        get_shmlast_gff3_task,
                        get_hmmscan_gff3_task,
                        get_cmscan_gff3_task,
                        get_gff3_merge_task,
                        get_maf_best_hits_task)
from dammit.tasks.last import LastalTask
from dammit.tasks.hmmer import HMMScanTask, get_remap_hmmer_task
from dammit.tasks.infernal import CMScanTask
from dammit.tasks.transdecoder import (TransDecoderPredictTask,
                                 TransDecoderLongOrfsTask)
from dammit import ui
from dammit import log

logger = logging.getLogger(__name__)


def get_handler(config, databases):
    '''Build the TaskHandler for the annotation pipelines.

    The handler will not have registered tasks when returned.

    Args:
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.

    Returns:
        handler.TaskHandler: A constructed TaskHandler.
    '''
    annotate_logger = logging.getLogger('AnnotateHandler')

    # Default output directory is derived from the transcriptome filename.
    out_dir = (config['output_dir']
               if config['output_dir'] is not None
               else path.basename(config['transcriptome'] + '.dammit'))
    directory = path.abspath(out_dir)

    handler = TaskHandler(directory,
                          annotate_logger,
                          db='annotate',
                          backend=config['doit_backend'],
                          verbosity=config['verbosity'],
                          profile=config['profile'])
    log.start_logging(path.join(directory, 'dammit.log'))

    # The input is first copied/renamed into the output directory; all
    # downstream tasks consume the renamed copy.
    input_fn = path.join(directory, path.basename(config['transcriptome']))
    name_map_fn = input_fn + '.dammit.namemap.csv'
    rename_task = get_rename_transcriptome_task(
        path.abspath(config['transcriptome']),
        input_fn,
        name_map_fn,
        config['name']
    )
    handler.register_task('rename-transcriptome',
                          rename_task,
                          files={'transcriptome': input_fn,
                                 'name_map': name_map_fn})

    return handler
def run_annotation(handler):
    '''Run the annotation pipeline from the given handler.

    Prints the appropriate output and exits if the pipeline is already
    completed.

    Args:
        handler (handler.TaskHandler): Handler with tasks for the pipeline.
    '''
    print(ui.header('Annotation', level=3))
    print(ui.header('Info', level=4))

    info = {'Doit Database': handler.dep_file,
            'Input Transcriptome': handler.files['transcriptome']}
    print(ui.listing(info))

    msg = '*All annotation tasks up-to-date.*'
    uptodate, statuses = handler.print_statuses(uptodate_msg=msg)

    # Nothing left to do: report completion and stop the process.
    if uptodate:
        print('**Pipeline is already completed!**')
        sys.exit(0)

    return handler.run()
def build_default_pipeline(handler, config, databases):
    '''Register tasks for the default dammit pipeline.

    This is all the main tasks, without the lastal uniref90 task.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.

    Returns:
        handler.TaskHandler: The handler passed in.
    '''
    register_stats_task(handler)
    # Uniform steps that share the (handler, config, databases) signature.
    for register_step in (register_busco_task,
                          register_transdecoder_tasks,
                          register_rfam_tasks):
        register_step(handler, config, databases)
    register_lastal_tasks(handler, config, databases, include_uniref=False)
    register_user_db_tasks(handler, config, databases)
    register_annotate_tasks(handler, config, databases)

    return handler
def build_full_pipeline(handler, config, databases):
    '''Register tasks for the full dammit pipeline (with uniref90).

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.

    Returns:
        handler.TaskHandler: The handler passed in.
    '''
    register_stats_task(handler)
    # Uniform steps that share the (handler, config, databases) signature.
    for register_step in (register_busco_task,
                          register_transdecoder_tasks,
                          register_rfam_tasks):
        register_step(handler, config, databases)
    # The full pipeline adds the (slow) uniref90 lastal search.
    register_lastal_tasks(handler, config, databases, include_uniref=True)
    register_user_db_tasks(handler, config, databases)
    register_annotate_tasks(handler, config, databases)

    return handler
def build_quick_pipeline(handler, config, databases):
    '''Register tasks for the quick annotation pipeline.

    Leaves out the Pfam search (and so does not pass these hits to
    `TransDecoder.Predict`), the Rfam search, and the lastal searches
    against OrthoDB and uniref90. Best suited for users who have built
    their own protein databases and would just like to annotate off them.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.

    Returns:
        handler.TaskHandler: The handler passed in.
    '''
    register_stats_task(handler)
    register_busco_task(handler, config, databases)
    # No Pfam search in the quick pipeline, so skip the hmmer step of
    # the TransDecoder tasks.
    register_transdecoder_tasks(handler, config, databases,
                                include_hmmer=False)
    register_user_db_tasks(handler, config, databases)
    register_annotate_tasks(handler, config, databases)

    return handler
def register_stats_task(handler):
    '''Register the tasks for basic transcriptome metrics.'''
    transcriptome_fn = handler.files['transcriptome']
    # Summary metrics are emitted as JSON alongside the transcriptome.
    stats_output_fn = transcriptome_fn + '.dammit.stats.json'
    handler.register_task('transcriptome-stats',
                          get_transcriptome_stats_task(transcriptome_fn,
                                                       stats_output_fn),
                          files={'stats': stats_output_fn})
def register_busco_task(handler, config, databases):
    '''Register tasks for BUSCO. Note that this expects a proper dammit
    config dictionary.'''
    transcriptome_fn = handler.files['transcriptome']
    group = config['busco_group']
    group_db = databases['BUSCO-{0}'.format(group)]

    results_basename = '{0}.{1}.busco.results'.format(transcriptome_fn, group)
    # BUSCO writes its output under a "run_<basename>" directory.
    results_dir = 'run_{0}'.format(results_basename)

    handler.register_task('BUSCO-{0}'.format(group),
                          BuscoTask().task(transcriptome_fn,
                                           results_basename,
                                           group_db,
                                           n_threads=config['n_threads'],
                                           params=config['busco']['params']),
                          files={'BUSCO': results_dir})
def register_transdecoder_tasks(handler, config, databases,
                                include_hmmer=True):
    '''Register tasks for TransDecoder.

    TransDecoder first finds long ORFs with `TransDecoder.LongOrfs`,
    which are output as a FASTA file of protein sequences. These
    sequences are then used to search against Pfam-A for conserved
    domains, and the coordinates from the resulting matches are mapped
    back relative to the original transcripts. `TransDecoder.Predict`
    then builds the final gene models based on the training data
    provided by `TransDecoder.LongOrfs`, optionally using the Pfam-A
    results to keep ORFs which otherwise don't fit the model closely
    enough.

    Once again, note that a proper dammit config dictionary is required.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.
        include_hmmer (bool): If True, search the long ORFs against
            Pfam-A and feed the hits to `TransDecoder.Predict`.
    '''
    input_fn = handler.files['transcriptome']
    transdecoder_dir = '{0}.transdecoder_dir'.format(input_fn)

    handler.register_task('TransDecoder.LongOrfs',
                          TransDecoderLongOrfsTask().task(
                              input_fn,
                              params=config['transdecoder']['longorfs']),
                          files={'longest_orfs':
                                 path.join(transdecoder_dir,
                                           'longest_orfs.pep')})

    # pfam_fn stays None when hmmer is skipped; TransDecoder.Predict
    # then runs without Pfam guidance.
    pfam_fn = None
    if include_hmmer:  # was `include_hmmer is True`; truthiness is the idiom
        pfam_fn = path.join(transdecoder_dir, 'longest_orfs.pep.x.pfam.tbl')
        task = HMMScanTask().task(handler.files['longest_orfs'],
                                  pfam_fn,
                                  databases['Pfam-A'],
                                  cutoff=config['evalue'],
                                  n_threads=config['n_threads'],
                                  sshloginfile=config['sshloginfile'],
                                  params=config['hmmer']['hmmscan'])
        handler.register_task('hmmscan:Pfam-A',
                              task,
                              files={'longest_orfs_pfam': pfam_fn})

        # Remap the ORF-relative hits back onto transcript coordinates.
        pfam_csv_fn = '{0}.x.pfam-A.csv'.format(input_fn)
        handler.register_task('hmmscan:Pfam-A:remap',
                              get_remap_hmmer_task(
                                  handler.files['longest_orfs_pfam'],
                                  path.join(transdecoder_dir,
                                            'longest_orfs.gff3'),
                                  pfam_csv_fn,
                                  transcript_basename=config['name']),
                              files={'Pfam-A-csv': pfam_csv_fn})

        pfam_gff3 = '{0}.x.pfam-A.gff3'.format(input_fn)
        handler.register_task('gff3:Pfam-A',
                              get_hmmscan_gff3_task(pfam_csv_fn,
                                                    pfam_gff3,
                                                    'Pfam'),
                              files={'Pfam-A-gff3': pfam_gff3})

    predict_params = config['transdecoder']['predict']
    transdecoder_pep = '{0}.transdecoder.pep'.format(input_fn)
    transdecoder_gff3 = '{0}.transdecoder.gff3'.format(input_fn)
    handler.register_task('TransDecoder.Predict',
                          TransDecoderPredictTask().task(
                              input_fn,
                              pfam_filename=pfam_fn,
                              params=predict_params),
                          files={'transdecoder-pep': transdecoder_pep,
                                 'transdecoder-gff3': transdecoder_gff3})
def register_rfam_tasks(handler, config, databases):
    '''Registers tasks for Infernal's `cmscan` against Rfam.

    Rfam is an RNA secondary structure database comprising covariance
    models for many known RNAs. This is a relatively slow step. A
    proper dammit config dictionary is required.'''
    transcriptome_fn = handler.files['transcriptome']

    cmscan_output_fn = '{0}.x.rfam.tbl'.format(transcriptome_fn)
    cmscan_task = CMScanTask().task(transcriptome_fn,
                                    cmscan_output_fn,
                                    databases['Rfam'],
                                    cutoff=config['evalue'],
                                    n_threads=config['n_threads'],
                                    sshloginfile=config['sshloginfile'],
                                    params=config['infernal']['cmscan'])
    handler.register_task('cmscan:Rfam',
                          cmscan_task,
                          files={'Rfam': cmscan_output_fn})

    # Convert the tabular cmscan results to GFF3 for the final merge.
    rfam_gff3_fn = '{0}.x.rfam.gff3'.format(transcriptome_fn)
    handler.register_task('gff3:Rfam',
                          get_cmscan_gff3_task(cmscan_output_fn,
                                               rfam_gff3_fn,
                                               'Rfam'),
                          files={'Rfam-gff3': rfam_gff3_fn})
def register_lastal_tasks(handler, config, databases,
                          include_uniref=False):
    '''Register tasks for `lastal` searches. By default, this will just
    align the transcriptome against OrthoDB; if requested, it will align
    against uniref90 as well, which takes considerably longer.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.
        include_uniref (bool): If True, add tasks for searching uniref90.
    '''
    input_fn = handler.files['transcriptome']
    lastal_cfg = config['last']['lastal']

    # OrderedDict so OrthoDB is always searched before uniref90.
    dbs = OrderedDict()
    dbs['OrthoDB'] = databases['OrthoDB']
    if include_uniref:  # was `include_uniref is True`; truthiness is the idiom
        dbs['uniref90'] = databases['uniref90']

    for name, db in dbs.items():
        output_fn = '{0}.x.{1}.maf'.format(input_fn, name)
        handler.register_task('lastal:{0}'.format(name),
                              add_profile_actions(
                                  LastalTask().task(
                                      input_fn,
                                      db,
                                      output_fn,
                                      translate=True,
                                      cutoff=config['evalue'],
                                      n_threads=config['n_threads'],
                                      frameshift=lastal_cfg['frameshift'],
                                      pbs=config['sshloginfile'],
                                      params=lastal_cfg['params'])),
                              files={name: output_fn})

        # Reduce the raw MAF alignments to best hits, then to GFF3.
        best_fn = '{0}.x.{1}.best.csv'.format(input_fn, name)
        gff3_fn = '{0}.x.{1}.best.gff3'.format(input_fn, name)
        handler.register_task('lastal:best-hits:{0}'.format(name),
                              get_maf_best_hits_task(output_fn, best_fn),
                              files={'{0}-best-hits'.format(name): best_fn})
        handler.register_task('gff3:{0}'.format(name),
                              get_maf_gff3_task(best_fn, gff3_fn, name),
                              files={'{0}-gff3'.format(name): gff3_fn})
def register_annotate_tasks(handler, config, databases):
    '''Register tasks for aggregating the annotations into one GFF3 file
    and writing out summary descriptions in a new FASTA file.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.
    '''
    transcriptome_fn = handler.files['transcriptome']

    # Every upstream converter registers its output under a '*-gff3'
    # key; collect them all for the merge.
    gff3_files = [fn for key, fn in handler.files.items()
                  if key.endswith('-gff3')]
    merged_gff3_fn = '{0}.dammit.gff3'.format(transcriptome_fn)
    handler.register_task('gff3:merge-all',
                          get_gff3_merge_task(gff3_files, merged_gff3_fn),
                          files={'merged-gff3': merged_gff3_fn})

    annotated_fasta_fn = '{0}.dammit.fasta'.format(transcriptome_fn)
    handler.register_task('annotate:fasta',
                          get_annotate_fasta_task(transcriptome_fn,
                                                  merged_gff3_fn,
                                                  annotated_fasta_fn),
                          files={'annotated-fasta': annotated_fasta_fn})
def register_user_db_tasks(handler, config, databases):
    '''Run conditional reciprocal best hits LAST (CRBL) against the
    user-supplied databases.

    Args:
        handler (handler.TaskHandler): The task handler to register on.
        config (dict): Config dictionary, which contains the command
            line arguments and the entries from the config file.
        databases (dict): The dictionary of files from a database
            TaskHandler.
    '''
    # Nothing to do unless the user supplied databases on the command
    # line. (`not in` per PEP 8, rather than `not ... in`.)
    if 'user_databases' not in config:
        return
    input_fn = handler.files['transcriptome']

    for db_path in config['user_databases']:
        db_path = path.abspath(db_path)
        db_basename = path.basename(db_path)

        results_fn = '{0}.x.{1}.crbl.csv'.format(input_fn, db_basename)
        gff3_fn = '{0}.x.{1}.crbl.gff3'.format(input_fn, db_basename)

        # shmlast's CRBL app supplies its own doit tasks; rename and
        # register each one so it is namespaced per user database.
        crbl = CRBL(input_fn,
                    db_path,
                    results_fn,
                    n_threads=config['n_threads'],
                    cutoff=config['evalue'])
        for task in crbl.tasks():
            task.name = 'user-database:{0}-shmlast-{1}'.format(db_basename,
                                                               task.name)
            handler.register_task(task.name, add_profile_actions(task))

        handler.register_task('gff3:{0}'.format(results_fn),
                              get_shmlast_gff3_task(results_fn,
                                                    gff3_fn,
                                                    db_basename),
                              files={'{0}-crbl-gff3'.format(db_basename):
                                     gff3_fn})
        handler.files['{0}-crbl'.format(db_basename)] = results_fn