Source code for dammit.tasks.report
# Copyright (C) 2015-2018 Camille Scott
# All rights reserved.
#
# This software may be modified and distributed under the terms
# of the BSD license. See the LICENSE file for details.
import os
import sys
from doit.tools import run_once
from doit.task import clean_targets
from khmer import ReadParser
from dammit.fileio.gff3 import GFF3Parser
from dammit.profile import profile_task
from dammit.utils import doit_task
def generate_sequence_name(original_name, sequence, annotation_df):
    '''Stub with no implementation.

    Takes the same arguments as generate_sequence_summary, ignores all
    of them, and performs no work.

    Args:
        original_name: Unused.
        sequence: Unused.
        annotation_df: Unused.

    Returns:
        None: Always.
    '''
    return None
def generate_sequence_summary(original_name, sequence, annotation_df):
    '''Build a one-line key=value summary header for an annotated sequence.

    The header starts with the original name followed by ``len=N``, the
    length of the sequence. The annotations are then grouped on their
    ``type`` column and each recognised type adds one ``key=value`` field:

    * match features (``translated_nucleotide_match``,
      ``protein_hmm_match``, ``RNA_sequence_secondary_structure``) are
      listed as comma-separated ``Name:start-end`` entries under the keys
      ``homologies``, ``hmm_matches`` and ``RNA_matches`` respectively;
    * structural features (``exon``, ``CDS``, ``gene``,
      ``five_prime_UTR``, ``three_prime_UTR``, ``mRNA``) are listed as
      comma-separated ``start-end`` entries under their own type name.

    Any other feature type is left out of the summary.

    Args:
        original_name (str): Original name of the sequence.
        sequence (str): The sequence itself.
        annotation_df (DataFrame): DataFrame with GFF3 format annotations.

    Returns:
        str: The new summary header.
    '''
    # Match-style feature types and the summary key each is reported under.
    named_keys = {
        'translated_nucleotide_match': 'homologies',
        'protein_hmm_match': 'hmm_matches',
        'RNA_sequence_secondary_structure': 'RNA_matches',
    }
    # Feature types reported by coordinates only, keyed by their type name.
    span_types = ('exon', 'CDS', 'gene',
                  'five_prime_UTR', 'three_prime_UTR',
                  'mRNA')

    fields = ['len={0}'.format(len(sequence))]
    for ftype, rows in annotation_df.groupby('type'):
        if ftype in named_keys:
            label = named_keys[ftype]
            # Only the part of Name before ':dammit' is kept in the header.
            entries = ('{}:{}-{}'.format(r.Name.split(':dammit')[0],
                                         int(r.start),
                                         int(r.end))
                       for _, r in rows.iterrows())
        elif ftype in span_types:
            label = ftype
            entries = ('{}-{}'.format(int(r.start), int(r.end))
                       for _, r in rows.iterrows())
        else:
            # Unrecognised feature types contribute nothing.
            continue
        fields.append('{0}={1}'.format(label, ','.join(entries)))

    return '{0} {1}'.format(original_name, ' '.join(fields))
@doit_task
@profile_task
def get_annotate_fasta_task(transcriptome_fn, gff3_fn, output_fn):
    '''Annotate the headers in a FASTA file with its corresponding GFF3 file.

    Each sequence in the FASTA file is written to the output with its
    header replaced by the summary line that generate_sequence_summary
    builds from the GFF3 rows whose ``seqid`` equals the sequence name.

    Args:
        transcriptome_fn (str): Path to the FASTA file.
        gff3_fn (str): Path to the GFF3 annotations.
        output_fn (str): Path to store the resulting annotated FASTA.

    Returns:
        dict: A doit task.
    '''
    name = 'fasta-annotate:{0}'.format(output_fn)

    def annotate_fasta():
        annotations = GFF3Parser(gff3_fn).read()
        # Split the annotation table by seqid once, instead of scanning the
        # whole table for every sequence. A plain key lookup also avoids
        # building a query expression from the sequence name, which would
        # be misparsed when the name contains quotes or backslashes.
        by_seqid = {seqid: rows
                    for seqid, rows in annotations.groupby('seqid',
                                                           sort=False)}
        # Zero-row frame with the same columns, used for sequences that
        # have no annotations at all.
        unannotated = annotations.iloc[0:0]

        with open(output_fn, 'w') as fp:
            for record in ReadParser(transcriptome_fn):
                df = by_seqid.get(record.name, unannotated)
                desc = generate_sequence_summary(record.name,
                                                 record.sequence,
                                                 df)
                fp.write('>{0}\n{1}\n'.format(desc.strip(),
                                              record.sequence))

    return {'name': name,
            'actions': [annotate_fasta],
            'file_dep': [transcriptome_fn, gff3_fn],
            'targets': [output_fn],
            'clean': [clean_targets]}