Source code for tral.sequence.sequence

# (C) 2015 Elke Schaper

"""
    :synopsis: The Sequence Class.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb-sib.ch>
"""

import logging
import pickle
import re

from tral import configuration
from tral.repeat import repeat, repeat_align
from tral.repeat_list import repeat_list
from tral.hmm import hmm, hmm_viterbi
from tral.sequence import repeat_detection_run, sequence_io

CONFIG = configuration.Configuration.instance().config
REPEAT_CONFIG = CONFIG["repeat"]
LOG = logging.getLogger(__name__)


[docs]class Sequence: """ A ``Sequence`` describes either a protein or a DNA sequence. ``Sequence`` contains methods that act on single sequences, for example: * tandem repeats Attributes: seq (str): The sequence. seq_standard_aa (str): The sequence with standard amino acids only """ def __init__(self, seq, name=None, sequence_type="AA"): if not isinstance(seq, str): raise Exception('The seq value is not a String') self.seq = seq.upper() self.sequence_type = sequence_type for i in self.seq: if i not in CONFIG[self.sequence_type]['all_chars']: raise Exception("{} is not in CONFIG[{}]['all_chars']: {}" .format(i, self.sequence_type, CONFIG['all_chars'])) self.seq_standard_aa = repeat.standardize(self.seq, self.sequence_type) if name: self.name = name self.d_annotations = {} self.d_repeatlist = {}
[docs] def create(file, input_format): """ Create sequence(s) from file. Create sequence(s) from file. Args: file (str): Path to input file format (str): Either "fasta" or "pickle" .. todo:: Write checks for ``format`` and ``file``. """ if input_format == 'fasta': l_seq = sequence_io.read_fasta(file) return [Sequence(iSeq, iID) for iSeq, iID in l_seq] if input_format == 'pickle': with open(file, 'rb') as fh: return pickle.load(fh) else: raise Exception("Input format {} is not implemented for" "sequence.create()".format(input_format))
[docs] def write(self, file, file_format): """ Write sequence to file. Write sequence to file using one of two formats. Args: file (str): Path to output file format (str): Either "fasta" or "pickle" .. todo:: Write checks for ``format`` and ``file``. """ if file_format == 'fasta': sequence_io.write(self.seq, file) elif file_format == 'pickle': with open(file, 'wb') as fh: pickle.dump(self, fh) else: raise Exception("Output format {} is not implemented for sequence.write()".format(file_format))
[docs] def detect(self, lHMM=None, denovo=None, realignment='mafft', sequence_type='AA', rate_distribution=REPEAT_CONFIG['castor_parameter']['rate_distribution'], user_path=None, **kwargs): """ Detects tandem repeats on ``self.seq`` from 2 possible sources. A list of ``Repeat`` instances is created for tandem repeat detections on the sequence from two possible sources: * Sequence profile hidden Markov models ``HMM`` * de novo detection algorithms. Args: hmm (hmm.HMM): A list of ``HMM`` instances. denovo (bool): boolean realignment (str): either "mafft", "proPIP" or None *kwargs: Parameters fed to denovo TR prediction and/or Repeat instantiation. E.g. ``repeat = {"calc_score": True}`` Returns: A ``RepeatList`` instance """ realignment_types = ['mafft', 'proPIP', None] if realignment not in realignment_types: raise ValueError("Invalid realignment type. Expected one of: {}".format(realignment_types)) if lHMM: if not isinstance(lHMM, list): raise Exception('The lHMM value is not a list.') for iHMM in lHMM: if not isinstance(iHMM, hmm.HMM): raise Exception('At least one list element in the lHMM' 'value is not a valid instance of the HMM' 'class.') repeats = [] for iHMM in lHMM: # Detect TRs on self.seq with hmm using the Viterbi algorithm. most_likely_path = iHMM.viterbi(self.seq) LOG.debug("most_likely_path: {}".format(most_likely_path)) if not most_likely_path: continue unaligned_msa = hmm_viterbi.hmm_path_to_non_aligned_tandem_repeat_units( self.seq, most_likely_path, iHMM.l_effective) if len(unaligned_msa) > 1: # Align the msa initial_alignment = 'mafft' aligned_msa = repeat_align.realign_repeat(unaligned_msa, initial_alignment, sequence_type, user_path=user_path) if realignment == 'proPIP': if len(aligned_msa) > 2 or len(aligned_msa[0]) > 10: # TODO: change from 2 to 3 # only TRs with at least 3 units with a length > 10 characters will be realigned aligned_msa = repeat_align.realign_repeat(aligned_msa, realignment, sequence_type, rate_distribution=rate_distribution, user_path=user_path) if len(aligned_msa) > 1: # Create a Repeat() class with the new msa if 'repeat' in kwargs: repeats.append(repeat.Repeat(aligned_msa, **kwargs['repeat'])) else: repeats.append(repeat.Repeat(aligned_msa)) # Set begin coordinate for all repeats for i_repeat in repeats: self.repeat_in_sequence(i_repeat) return repeat_list.RepeatList(repeats) elif lHMM == []: LOG.debug("lHMM == []") return None elif denovo: if 'detection' in kwargs: predicted_repeats = repeat_detection_run.run_detector( [self], **kwargs['detection'])[0] else: predicted_repeats = repeat_detection_run.run_detector([self])[0] LOG.debug("predicted_repeats: {}".format(predicted_repeats)) repeats = [] for jTRD, jlTR in predicted_repeats.items(): for iTR in jlTR: if 'repeat' in kwargs: iTR = repeat.Repeat(iTR.msa, begin=iTR.begin, **kwargs['repeat']) else: iTR = repeat.Repeat(iTR.msa, begin=iTR.begin) # Consider only tandem repeats that have a repeat unit # predicted to be at least one character long. if iTR.l_effective > 0: # Save l, n, MSA, TRD, scores, sequence_type, position # in sequence of given type iTR.TRD = jTRD # Sanity check repeat and set begin coordinate for # all repeats if not self.repeat_in_sequence(iTR): LOG.debug("The tandem repeat is not part of" "the sequence. Detector: %s", iTR.TRD) continue repeats.append(iTR) # Realignment for iTR in repeats: if len(iTR.msa) > 2 or len(iTR.msa[0]) > 10: # TODO: change from 2 to 3 # only TRs with at least 3 units with a length > 10 characters will be realigned iTR.msa = repeat_align.realign_repeat(iTR.msa, realignment, sequence_type, rate_distribution=rate_distribution, user_path=user_path) return repeat_list.RepeatList(repeats) else: raise Exception("Either require denovo detection, or provide an", "HMM")
[docs] def get_repeatlist(self, tag): """ Retrieve `repeatlist` from this `sequence` instance. Retrieve `repeatlist` from this `sequence` instance. Access `repeatlist` as self.d_repeatlist[tag] Args: tag (str): A identifier for the repeat_list Returns: RepeatList: A repeat_list instance. """ try: return self.d_repeatlist[tag] except ValueError: logging.error("RepeatList %s not in Sequence", tag)
[docs] def set_repeatlist(self, repeatlist, tag): """ Add `repeatlist` as attribute to this `sequence` instance. Add `repeatlist` as attribute to this `sequence` instance. Access `repeatlist` as self.d_repeatlist[tag] Args: repeatlist (RepeatList): A repeat_list instance. tag (str): A identifier for the repeat_list """ self.d_repeatlist[tag] = repeatlist
def annotate(self, data, tag): self.d_annotations[tag] = data def get_annotation(self, tag): if tag in self.d_annotations: return self.d_annotations[tag] else: return []
[docs] def repeat_in_sequence(self, my_repeat): """ Sanity check whether the `repeat` is part of this `sequence`. In case, calculate the position of the `repeat` within the `sequence`. If yes: Return True, set repeat.begin to corrected value if necessary. If no: Return False. Perform sanity check on sequences where all amino acids are or are converted to standard amino acids. Args: sequence (sequence): A sequence instance. Returns: bool: True if repeat is part of sequence, else false .. todo:: Decide whether save_original_msa is needed here. """ repeat_sequence = repeat.get_repeat_sequence(my_repeat.msa_standard_aa) # The first letter in the sequence is counted as 1 # (not 0, as in Python): starts = [m.start() + 1 for m in re.finditer(repeat_sequence, self.seq_standard_aa)] if len(starts) != 0: # Is the tandem repeat predicted correctly? if not hasattr(my_repeat, "begin") or my_repeat.begin not in starts: my_repeat.begin = starts[0] my_repeat.save_original_msa(self.seq) return True else: return False