Source code for dammit.tasks.busco

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import os

from doit.action import CmdAction
from doit.tools import run_once
from doit.task import clean_targets
import pandas as pd

from dammit import meta
from dammit.profile import profile_task
from dammit.utils import doit_task, which
from dammit.tasks.utils import clean_folder, DependentTask, InstallationError


class BuscoTask(DependentTask):
    '''Builds doit tasks that run BUSCO on a FASTA file.'''

    def deps(self):
        '''Check that BUSCO and the BLAST+ tools it needs are available.

        Looks up `BUSCO.py` (v2), `run_BUSCO.py` (v3), `tblastn`, and
        `makeblastdb` with `which`.

        Returns:
            str: The BUSCO executable as resolved by `which`; the v3
            script is preferred when both are found.

        Raises:
            InstallationError: If neither BUSCO script is found, or if
            `tblastn` or `makeblastdb` is missing.
        '''
        buscov2 = which('BUSCO.py')
        buscov3 = which('run_BUSCO.py')
        tblastn = which('tblastn')
        makeblastdb = which('makeblastdb')

        if buscov2 is None and buscov3 is None:
            raise InstallationError('BUSCO not found. NOTE: '
                                    'dammit 1.0 requires BUSCO v2 or greater')
        if tblastn is None:
            raise InstallationError('tblastn not found, required for BUSCO.')
        if makeblastdb is None:
            raise InstallationError('makeblastdb not found, required for BUSCO.')

        busco = buscov3 if buscov3 is not None else buscov2

        # Log through the instance's logger; the previous code referenced
        # undefined `logger` and `busco` names and raised NameError here.
        logger = getattr(self, 'logger', None)
        if logger:
            logger.debug('BUSCO:' + busco)

        return busco

    @doit_task
    @profile_task
    def task(self, input_filename, output_name, busco_db_dir,
             input_type='tran', n_threads=1, config_file=None, params=None):
        '''Get a task to run BUSCO on the given FASTA file.

        Args:
            input_filename (str): The FASTA file to run BUSCO on.
            output_name (str): Base name for the BUSCO output directory;
                only its basename is used.
            busco_db_dir (str): Directory with the BUSCO databases.
            input_type (str): Value passed to BUSCO's `-m` option. By
                default, `tran` for transcriptome.
            n_threads (int): Number of threads to use.
            config_file (str): If given, the command is prefixed with
                `BUSCO_CONFIG_FILE="<config_file>"`.
            params (list): Extra parameters to pass to the executable.

        Returns:
            dict: A doit task.
        '''
        name = 'busco:{0}-{1}'.format(os.path.basename(input_filename),
                                      os.path.basename(busco_db_dir))

        exc = self.deps()

        # BUSCO chokes on file paths as output names
        output_name = os.path.basename(output_name)

        cmd = []
        if config_file is not None:
            cmd.append('BUSCO_CONFIG_FILE="{0}"'.format(config_file))
        cmd.extend(['python3', exc,
                    '-i', input_filename,
                    '-f',
                    '-o', output_name,
                    '-l', busco_db_dir,
                    '-m', input_type,
                    '-c', str(n_threads)])
        if params is not None:
            cmd.extend(params)
        cmd = ' '.join(cmd)

        # The target and clean-up paths below are built from this
        # `run_<output_name>` folder name.
        output_folder = 'run_' + output_name
        target_fn = os.path.join(output_folder,
                                 'full_table_{0}.tsv'.format(output_name))

        return {'name': name,
                'actions': [cmd],
                'file_dep': [input_filename],
                'targets': [target_fn],
                'clean': [(clean_folder, [output_folder])]}
def parse_busco_full(fn):
    '''Load a BUSCO full result table into a Pandas DataFrame.

    The file is read with `pandas.read_table` using its defaults, and a
    `#BUSCO_group` column, if present, is renamed to `BUSCO_group`.

    Args:
        fn (str): The results file.

    Returns:
        DataFrame: The results DataFrame.
    '''
    column_renames = {'#BUSCO_group': 'BUSCO_group'}
    table = pd.read_table(fn)
    table = table.rename(columns=column_renames)
    return table
def parse_busco_summary(fn):
    '''Parses a BUSCO summary file into a JSON compatible dictionary.

    Only lines that start (after stripping whitespace) with `C:` are
    parsed. Both the `C:..%[D:..%],F:..%,M:..%,n:..` notation and the
    `C:..%[S:..%,D:..%],F:..%,M:..%,n:..` notation are handled. Every
    key except `n` gets a `(%)` suffix, and values are kept as strings
    with the percent sign removed. If several matching lines exist,
    later ones overwrite earlier keys.

    Args:
        fn (str): The summary results file.

    Returns:
        dict: The BUSCO results; empty if no matching line was found.
    '''
    res = {}
    with open(fn) as fp:
        for ln in fp:
            ln = ln.strip()
            if not ln.startswith('C:'):
                continue
            # Treat the brackets as plain separators so the nested
            # `[S:..,D:..]` fields become ordinary `key:value` tokens.
            flat = ln.replace('[', ',').replace(']', ',')
            for token in flat.split(','):
                key, sep, val = token.partition(':')
                key = key.strip()
                if not sep or not key:
                    # Empty piece left over from a bracket or a
                    # doubled separator.
                    continue
                if key != 'n':
                    key += '(%)'
                res[key] = val.strip().strip('%').strip()
    return res
def parse_busco_multiple(fn_list, dbs=('metazoa', 'vertebrata')):
    '''Parses multiple BUSCO results summaries into a single DataFrame.

    Each file is parsed with `parse_busco_summary` and becomes one row.
    Two columns are added: `fn`, a sample name derived from the file's
    basename, and `db`, the first-matching-last-wins database name from
    `dbs` found in that name (None when no database name matches).

    Args:
        fn_list (list): List of paths to results files.
        dbs (list): List of BUSCO database names.

    Returns:
        DataFrame: The formatted DataFrame.
    '''
    # Materialize once: the paths are walked twice below, which would
    # silently exhaust a generator.
    fn_list = list(fn_list)

    df = pd.DataFrame([parse_busco_summary(fn) for fn in fn_list])

    # Drops the first and last 14 characters of the basename --
    # presumably the `short_summary_` prefix and a fixed-length suffix
    # of the summary file name; TODO confirm against the task's naming.
    df['fn'] = [os.path.basename(fn)[14:-14].strip('.') for fn in fn_list]
    df['db'] = None

    if df.empty:
        # Nothing to label; also avoids using the `.str` accessor on an
        # empty, possibly non-string column.
        return df

    for db in dbs:
        # Match the database name literally rather than as a regex.
        idx = df['fn'].str.contains(db, regex=False)
        df.loc[idx, 'db'] = db
        # Keep only the part of the name preceding the database name.
        df.loc[idx, 'fn'] = df.loc[idx, 'fn'].apply(
            lambda name, db=db: name[:name.find(db)].strip('. '))

    return df
def busco_to_df(fn_list, dbs=('metazoa', 'vertebrata')):
    '''Given a list of BUSCO results from different databases, produce
    a DataFrame of the results.

    This delegates to `parse_busco_multiple`. The previous body was a
    copy of that function which called an undefined `parse_busco` name
    and so raised NameError. The returned frame carries plain `fn` and
    `db` columns; no MultiIndex is built.

    Args:
        fn_list (list): The BUSCO summary files.
        dbs (list): The BUSCO databases used for these runs.

    Returns:
        DataFrame: The BUSCO results.
    '''
    return parse_busco_multiple(fn_list, dbs=dbs)