Source code for dammit.tasks.fastx

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

from itertools import count
import json
import os
import re

from doit.tools import run_once, create_folder, LongRunning
from doit.task import clean_targets, dict_to_task
from khmer import HLLCounter, ReadParser
import pandas as pd

from dammit.fileio.gff3 import GFF3Parser
from dammit.profile import profile_task
from dammit.utils import which, doit_task


seq_ext = re.compile(r'(.fasta)|(.fa)|(.fastq)|(.fq)')
[docs]def strip_seq_extension(fn): return seq_ext.split(fn)[0]
[docs]@doit_task def get_rename_transcriptome_task(transcriptome_fn, output_fn, names_fn, transcript_basename, split_regex=None): '''Create a doit task to copy a FASTA file and rename the headers. Args: transcriptome_fn (str): The FASTA file. output_fn (str): Destination to copy to. names_fn (str): Destination to the store mapping from old to new names. transcript_basename (str): String to contruct new names from. split_regex (regex): Regex to split the input names with; must contain a `name` field. Returns: dict: A doit task. ''' import re name = os.path.basename(transcriptome_fn) if split_regex is None: counter = count() header_func = lambda name: '{0}_{1}'.format(transcript_basename, next(counter)) else: def header_func(header): results = re.search(split_regex, header).groupdict() try: header = results['name'] except KeyError as err: err.message = 'Header regex should have a name field!' raise return header def fix(): names = [] with open(output_fn, 'w') as fp: for record in ReadParser(transcriptome_fn): header = header_func(record.name) fp.write('>{0}\n{1}\n'.format(header, record.sequence)) names.append((record.name, header)) pd.DataFrame(names, columns=['original', 'renamed']).to_csv(names_fn, index=False) return {'name': name, 'actions': [fix], 'targets': [output_fn, names_fn], 'file_dep': [transcriptome_fn], 'clean': [clean_targets]}
[docs]@doit_task @profile_task def get_transcriptome_stats_task(transcriptome, output_fn): '''Create a doit task to run basic metrics on a transcriptome. Args: transcriptome (str): The input FASTA file. output_fn (str): File to store the results. Returns: dict: A doit task. ''' import re name = 'transcriptome_stats:' + os.path.basename(transcriptome) K = 25 DNA = re.compile(r'[ACGTN]*$', flags=re.IGNORECASE) def parse(fn): hll = HLLCounter(.01, K) lens = [] names = [] gc_len = 0 n_ambiguous = 0 for contig in ReadParser(fn): sequence = contig.sequence lens.append(len(sequence)) names.append(contig.name) if DNA.match(sequence) is None: raise RuntimeError('non-ACGTN characters not supported. '\ 'Offending transcript: \n>{0}\n{1}\nbad'\ .format(contig.name, contig.sequence)) if 'N' in sequence: sequence = sequence.replace('N', 'A') n_ambiguous += 1 hll.consume_string(sequence) gc_len += contig.sequence.count('C') gc_len += contig.sequence.count('G') S = pd.Series(lens, index=names) try: S.sort_values() except AttributeError: S.sort() gc_perc = float(gc_len) / S.sum() return S, hll.estimate_cardinality(), gc_perc, n_ambiguous def calc_NX(lens, X): N = lens.sum() threshold = (float(X) / 100.0) * N NXlen = 0 NXpos = 0 cms = 0 for n, l in enumerate(lens): cms += l if cms >= threshold: NXlen = l NXpos = n break return NXlen, NXpos def cmd(): lens, uniq_kmers, gc_perc, n_amb = parse(transcriptome) exp_kmers = (lens - (K+1)).sum() redundancy = float(exp_kmers - uniq_kmers) / exp_kmers if redundancy < 0: redundancy = 0.0 N50len, N50pos = calc_NX(lens, 50) stats = {'N': len(lens), 'sum': int(lens.sum()), 'min': int(lens.min()), 'max': int(lens.max()), 'med': int(lens.median()), 'mean': float(lens.mean()), 'N50len': int(N50len), 'N50pos': int(N50pos), '25_mers': int(exp_kmers), '25_mers_unique': uniq_kmers, 'n_ambiguous': n_amb, 'redundancy': redundancy, 'GCperc': float(gc_perc)} with open(output_fn, 'w') as fp: json.dump(stats, fp, indent=4) with open(output_fn, 'r') as fp: print(fp.read()) return {'name': name, 'actions': [(cmd, [])], 'file_dep': [transcriptome], 'targets': [output_fn], 'clean': [clean_targets]}