# Source code for dammit.tasks.report

# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license.  See the LICENSE file for details.

import os
import sys

from doit.tools import run_once
from doit.task import clean_targets
from khmer import ReadParser

from dammit.fileio.gff3 import GFF3Parser
from dammit.profile import profile_task
from dammit.utils import doit_task


def generate_sequence_name(original_name, sequence, annotation_df):
    '''Placeholder for generating a new name for an annotated sequence.

    This function is not implemented: it ignores all of its arguments
    and implicitly returns ``None``.

    Args:
        original_name (str): Original name of the sequence (unused).
        sequence (str): The sequence itself (unused).
        annotation_df (DataFrame): GFF3 format annotations (unused).

    Returns:
        None: Always, until an implementation is provided.
    '''
    # TODO: not yet implemented; kept so the public name stays importable.
    return None


def generate_sequence_summary(original_name, sequence, annotation_df):
    '''Given a FASTA sequence's original name, the sequence itself,
    and a DataFrame with its corresponding GFF3 annotations, generate
    a summary line of the annotations in key=value format.

    The summary always starts with ``len=<sequence length>``. Annotations
    are then grouped by their ``type`` column (in the sorted order that
    ``DataFrame.groupby`` yields) and each recognised type contributes one
    ``key=value`` entry:

    * Match features (``translated_nucleotide_match``,
      ``protein_hmm_match``, ``RNA_sequence_secondary_structure``) are
      reported as ``homologies``, ``hmm_matches`` and ``RNA_matches``
      respectively, with a comma-separated ``Name:start-end`` value per
      row. Anything from ``:dammit`` onwards is cut from the name.
    * Structural features (``exon``, ``CDS``, ``gene``,
      ``five_prime_UTR``, ``three_prime_UTR``, ``mRNA``) are reported
      under their own type name as comma-separated ``start-end`` values.

    Any other feature type is ignored.

    Args:
        original_name (str): Original name of the sequence.
        sequence (str): The sequence itself.
        annotation_df (DataFrame): DataFrame with GFF3 format annotations.
            Must have ``type``, ``start`` and ``end`` columns, plus a
            ``Name`` column when match features are present.

    Returns:
        str: The new summary header.
    '''

    # Feature types that carry a hit name, mapped to their summary key.
    match_keys = {'translated_nucleotide_match': 'homologies',
                  'protein_hmm_match': 'hmm_matches',
                  'RNA_sequence_secondary_structure': 'RNA_matches'}
    # Feature types that are summarised by coordinates only.
    structural_types = ('exon', 'CDS', 'gene',
                        'five_prime_UTR', 'three_prime_UTR',
                        'mRNA')

    annots = ['len={0}'.format(len(sequence))]
    for feature_type, fgroup in annotation_df.groupby('type'):

        if feature_type in match_keys:
            # Coordinates go through int() so float-typed columns do not
            # render as "3.0-40.0".
            collapsed = ','.join(
                '{}:{}-{}'.format(name.split(':dammit')[0],
                                  int(start),
                                  int(end))
                for name, start, end in zip(fgroup['Name'],
                                            fgroup['start'],
                                            fgroup['end']))
            annots.append('{0}={1}'.format(match_keys[feature_type],
                                           collapsed))

        elif feature_type in structural_types:
            collapsed = ','.join(
                '{}-{}'.format(int(start), int(end))
                for start, end in zip(fgroup['start'], fgroup['end']))
            annots.append('{0}={1}'.format(feature_type, collapsed))

    return '{0} {1}'.format(original_name, ' '.join(annots))


@doit_task
@profile_task
def get_annotate_fasta_task(transcriptome_fn, gff3_fn, output_fn):
    '''Annotate the headers in a FASTA file with its corresponding GFF3 file.

    The returned task's action reads the GFF3 annotations, then walks the
    transcriptome and writes each sequence to `output_fn` with a header
    produced by `generate_sequence_summary` from the annotations whose
    ``seqid`` equals the record's name.

    Args:
        transcriptome_fn (str): Path to the FASTA file.
        gff3_fn (str): Path to the GFF3 annotations.
        output_fn (str): Path to store the resulting annotated FASTA.

    Returns:
        dict: A doit task.
    '''

    name = 'fasta-annotate:{0}'.format(output_fn)

    def annotate_fasta():
        annotations = GFF3Parser(gff3_fn).read()
        with open(output_fn, 'w') as fp:
            for record in ReadParser(transcriptome_fn):
                # Bind the name as a query variable instead of pasting it
                # into the expression: a name containing a double quote
                # would otherwise produce a malformed query.
                seq_name = record.name
                df = annotations.query('seqid == @seq_name')
                desc = generate_sequence_summary(seq_name, record.sequence,
                                                 df)
                fp.write('>{0}\n{1}\n'.format(desc.strip(), record.sequence))

    return {'name': name,
            'actions': [annotate_fasta],
            'file_dep': [transcriptome_fn, gff3_fn],
            'targets': [output_fn],
            'clean': [clean_targets]}