# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license. See the LICENSE file for details.
import os
from doit.action import CmdAction
from doit.tools import run_once
from doit.task import clean_targets
import pandas as pd
from dammit import meta
from dammit.profile import profile_task
from dammit.utils import doit_task, which
from dammit.tasks.utils import clean_folder, DependentTask, InstallationError
class BuscoTask(DependentTask):
    '''Task wrapper that locates the BUSCO executable and builds a doit
    task which runs it on a FASTA file.
    '''

    def deps(self):
        '''Check that BUSCO and the BLAST tools it needs are on the PATH.

        Looks up `BUSCO.py` (v2 launcher), `run_BUSCO.py` (v3 launcher),
        `tblastn` and `makeblastdb` with `which`.

        Returns:
            str: Path of the BUSCO launcher to use; `run_BUSCO.py` is
                preferred over `BUSCO.py` when both are found.

        Raises:
            InstallationError: If neither BUSCO launcher is found, or if
                `tblastn` or `makeblastdb` is missing.
        '''
        buscov2 = which('BUSCO.py')
        buscov3 = which('run_BUSCO.py')
        tblastn = which('tblastn')
        makeblastdb = which('makeblastdb')

        if buscov2 is None and buscov3 is None:
            raise InstallationError('BUSCO not found. NOTE: '
                                    'dammit 1.0 requires BUSCO v2 or greater')
        if tblastn is None:
            raise InstallationError('tblastn not found, required for BUSCO.')
        if makeblastdb is None:
            raise InstallationError('makeblastdb not found, required for BUSCO.')

        busco = buscov3 if buscov3 is not None else buscov2
        # Log through the instance's logger; the bare names `logger` and
        # `busco` used previously were never defined and raised NameError.
        if self.logger:
            self.logger.debug('BUSCO:' + busco)
        return busco

    @doit_task
    @profile_task
    def task(self, input_filename, output_name, busco_db_dir,
             input_type='tran', n_threads=1, config_file=None,
             params=None):
        '''Get a task to run BUSCO on the given FASTA file.

        Args:
            input_filename (str): The FASTA file to run BUSCO on.
            output_name (str): Base name for the BUSCO output directory;
                only its basename is used.
            busco_db_dir (str): Directory with the BUSCO databases.
            input_type (str): Value passed to BUSCO's `-m` option;
                `tran` by default.
            n_threads (int): Number of threads to use.
            config_file (str): If given, the command is prefixed with
                `BUSCO_CONFIG_FILE="<config_file>"`.
            params (list): Extra parameters to pass to the executable.

        Returns:
            dict: A doit task.
        '''
        name = 'busco:{0}-{1}'.format(os.path.basename(input_filename),
                                      os.path.basename(busco_db_dir))
        exc = self.deps()

        # BUSCO chokes on file paths as output names
        output_name = os.path.basename(output_name)

        cmd = []
        if config_file is not None:
            # Environment assignment placed in front of the command.
            cmd.append('BUSCO_CONFIG_FILE="{0}"'.format(config_file))
        cmd.extend(['python3', exc, '-i', input_filename, '-f', '-o', output_name,
                    '-l', busco_db_dir, '-m', input_type, '-c', str(n_threads)])
        if params is not None:
            cmd.extend(params)
        cmd = ' '.join(cmd)

        # The task's target and clean action both live under `run_<output_name>`.
        output_folder = 'run_' + output_name
        target_fn = os.path.join(output_folder,
                                 'full_table_{0}.tsv'.format(output_name))

        return {'name': name,
                'actions': [cmd],
                'file_dep': [input_filename],
                'targets': [target_fn],
                'clean': [(clean_folder, [output_folder])]}
def parse_busco_full(fn):
    '''Load a BUSCO full result table into a Pandas DataFrame.

    The file is read with `pandas.read_table` using its defaults, and the
    `#BUSCO_group` column, if present, is renamed to `BUSCO_group`.

    Args:
        fn (str): The results file.

    Returns:
        DataFrame: The results DataFrame.
    '''
    # Drop the leading comment marker from the header of the first column.
    column_fixes = {'#BUSCO_group': 'BUSCO_group'}
    table = pd.read_table(fn)
    table = table.rename(columns=column_fixes)
    return table
def parse_busco_summary(fn):
    '''Parses a BUSCO summary file into a JSON compatible
    dictionary.

    Only lines whose stripped text starts with `C:` are parsed. Both
    bracket layouts are handled: `C:88%[D:22%],F:6.1%,M:5.3%,n:3023` and
    `C:93.9%[S:92.4%,D:1.5%],F:2.3%,M:3.8%,n:978`.

    Args:
        fn (str): The summary results file.

    Returns:
        dict: The BUSCO results. Values are strings with the percent sign
            removed; every key except `n` gets a `(%)` suffix (for example
            `C(%)`, `D(%)`). Empty if no line starts with `C:`.
    '''
    res = {}
    with open(fn) as fp:
        for ln in fp:
            if not ln.strip().startswith('C:'):
                continue
            # Treat the brackets as separators so the bracketed fields
            # become ordinary `key:value` tokens instead of being glued
            # to the `C` value or left with a trailing `]`.
            flattened = ln.replace('[', ',').replace(']', ',')
            for token in flattened.split(','):
                key, sep, val = token.partition(':')
                if not sep:
                    # Empty fragment left behind by a bracket; nothing to parse.
                    continue
                key = key.strip()
                val = val.strip().strip('%').strip()
                if key != 'n':
                    key += '(%)'
                res[key] = val
    return res
def parse_busco_multiple(fn_list, dbs=('metazoa', 'vertebrata')):
    '''Parses multiple BUSCO results summaries into a single DataFrame
    with `fn` and `db` columns identifying each run.

    Args:
        fn_list (list): List of paths to results files.
        dbs (list): List of BUSCO database names.

    Returns:
        DataFrame: One row per file, holding the fields from
            `parse_busco_summary` plus `fn` (trimmed file name) and `db`
            (the first-matched database name, or None if none matched).
    '''
    data = [parse_busco_summary(fn) for fn in fn_list]
    df = pd.DataFrame(data)

    # Drop the first and last 14 characters of each basename.
    # NOTE(review): presumably the `short_summary_` prefix and a
    # fixed-length suffix -- confirm against the files the caller passes.
    df['fn'] = [os.path.basename(fn)[14:-14].strip('.') for fn in fn_list]
    df['db'] = None

    for db in dbs:
        # Literal match, consistent with the literal `find` below; a regex
        # match that `find` cannot locate would give -1 and silently chop
        # the last character of the name.
        idx = df.fn.str.contains(db, regex=False)
        df.loc[idx, 'db'] = db
        # Keep only the part of the name that precedes the database name.
        df.loc[idx, 'fn'] = df.loc[idx, 'fn'].apply(
            lambda fn: fn[:fn.find(db)].strip('. '))
    return df
def busco_to_df(fn_list, dbs=('metazoa', 'vertebrata')):
    '''Given a list of BUSCO results from different databases, produce
    a DataFrame of the results labelled by file name and database.

    This is a thin alias for `parse_busco_multiple`; it previously called
    an undefined `parse_busco` helper and failed with a NameError.

    Args:
        fn_list (list): The BUSCO summary files.
        dbs (list): The BUSCO databases used for these runs.

    Returns:
        DataFrame: The BUSCO results, one row per summary file, with `fn`
            and `db` columns. No index is set on the frame.
    '''
    return parse_busco_multiple(fn_list, dbs=dbs)