Source code for tral.sequence.sequence

# (C) 2015 Elke Schaper

"""
    :synopsis: The Sequence Class.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb-sib.ch>
"""

import logging
import pickle
import re

from tral import configuration
from tral.repeat import repeat, repeat_align
from tral.repeat_list import repeat_list
from tral.hmm import hmm, hmm_viterbi
from tral.sequence import repeat_detection_run, sequence_io

CONFIG = configuration.Configuration.instance().config
REPEAT_CONFIG = CONFIG["repeat"]
LOG = logging.getLogger(__name__)


[docs]class Sequence:

    """ A ``Sequence`` describes either a protein or a DNA sequence.

    ``Sequence`` contains methods that act on single sequences, for example:

    *   tandem repeats

    Attributes:
        seq (str): The sequence.
        seq_standard_aa (str): The sequence with standard amino acids only
    """

    def __init__(self, seq, name=None, sequence_type="AA"):

        if not isinstance(seq, str):
            raise Exception('The seq value is not a String')
        self.seq = seq.upper()
        self.sequence_type = sequence_type
        for i in self.seq:
            if i not in CONFIG[self.sequence_type]['all_chars']:
                raise Exception("{} is not in CONFIG[{}]['all_chars']: {}"
                                .format(i, self.sequence_type, CONFIG['all_chars']))

        self.seq_standard_aa = repeat.standardize(self.seq, self.sequence_type)

        if name:
            self.name = name

        self.d_annotations = {}
        self.d_repeatlist = {}

[docs]    def create(file, input_format):
        """ Create sequence(s) from file.

        Create sequence(s) from file.

        Args:
            file (str): Path to input file
            format (str):  Either "fasta" or "pickle"

        .. todo:: Write checks for ``format`` and ``file``.
        """

        if input_format == 'fasta':
            l_seq = sequence_io.read_fasta(file)
            return [Sequence(iSeq, iID) for iSeq, iID in l_seq]
        if input_format == 'pickle':
            with open(file, 'rb') as fh:
                return pickle.load(fh)
        else:
            raise Exception("Input format {} is not implemented for"
                            "sequence.create()".format(input_format))

[docs]    def write(self, file, file_format):
        """ Write sequence to file.

        Write sequence to file using one of two formats.

        Args:
            file (str): Path to output file
            format (str):  Either "fasta" or "pickle"

        .. todo:: Write checks for ``format`` and ``file``.

        """

        if file_format == 'fasta':
            sequence_io.write(self.seq, file)
        elif file_format == 'pickle':
            with open(file, 'wb') as fh:
                pickle.dump(self, fh)
        else:
            raise Exception("Output format {} is not implemented for sequence.write()".format(file_format))

[docs]    def detect(self, lHMM=None, denovo=None, realignment='mafft', sequence_type='AA', rate_distribution=REPEAT_CONFIG['castor_parameter']['rate_distribution'], user_path=None, **kwargs):
        """ Detects tandem repeats on ``self.seq`` from 2 possible sources.

        A list of ``Repeat`` instances is created for tandem repeat detections
        on the sequence from two possible sources:

        * Sequence profile hidden Markov models ``HMM``
        * de novo detection algorithms.

        Args:
            hmm (hmm.HMM): A list of ``HMM`` instances.
            denovo (bool): boolean
            realignment (str): either "mafft", "proPIP" or None
            *kwargs: Parameters fed to denovo TR prediction and/or Repeat
                instantiation. E.g. ``repeat = {"calc_score": True}``

        Returns:
            A ``RepeatList`` instance
        """
        realignment_types = ['mafft', 'proPIP', None]
        if realignment not in realignment_types:
            raise ValueError("Invalid realignment type. Expected one of: {}".format(realignment_types))

        if lHMM:
            if not isinstance(lHMM, list):
                raise Exception('The lHMM value is not a list.')
            for iHMM in lHMM:
                if not isinstance(iHMM, hmm.HMM):
                    raise Exception('At least one list element in the lHMM'
                                    'value is not a valid instance of the HMM'
                                    'class.')

            repeats = []
            for iHMM in lHMM:
                # Detect TRs on self.seq with hmm using the Viterbi algorithm.
                most_likely_path = iHMM.viterbi(self.seq)
                LOG.debug("most_likely_path: {}".format(most_likely_path))
                if not most_likely_path:
                    continue
                unaligned_msa = hmm_viterbi.hmm_path_to_non_aligned_tandem_repeat_units(
                    self.seq,
                    most_likely_path,
                    iHMM.l_effective)
                if len(unaligned_msa) > 1:
                    # Align the msa
                    initial_alignment = 'mafft'
                    aligned_msa = repeat_align.realign_repeat(unaligned_msa, initial_alignment, sequence_type, user_path=user_path)

                    if realignment == 'proPIP':
                        if len(aligned_msa) > 2 or len(aligned_msa[0]) > 10:  # TODO: change from 2 to 3
                            # only TRs with at least 3 units with a length > 10 characters will be realigned
                            aligned_msa = repeat_align.realign_repeat(aligned_msa,
                                                                      realignment,
                                                                      sequence_type,
                                                                      rate_distribution=rate_distribution,
                                                                      user_path=user_path)

                    if len(aligned_msa) > 1:
                        # Create a Repeat() class with the new msa
                        if 'repeat' in kwargs:
                            repeats.append(repeat.Repeat(aligned_msa,
                                                         **kwargs['repeat']))
                        else:
                            repeats.append(repeat.Repeat(aligned_msa))

            # Set begin coordinate for all repeats
            for i_repeat in repeats:
                self.repeat_in_sequence(i_repeat)

            return repeat_list.RepeatList(repeats)

        elif lHMM == []:
            LOG.debug("lHMM == []")
            return None

        elif denovo:
            if 'detection' in kwargs:
                predicted_repeats = repeat_detection_run.run_detector(
                    [self],
                    **kwargs['detection'])[0]
            else:
                predicted_repeats = repeat_detection_run.run_detector([self])[0]

            LOG.debug("predicted_repeats: {}".format(predicted_repeats))
            repeats = []

            for jTRD, jlTR in predicted_repeats.items():
                for iTR in jlTR:
                    if 'repeat' in kwargs:
                        iTR = repeat.Repeat(iTR.msa, begin=iTR.begin,
                                            **kwargs['repeat'])
                    else:
                        iTR = repeat.Repeat(iTR.msa, begin=iTR.begin)

                    # Consider only tandem repeats that have a repeat unit
                    # predicted to be at least one character long.
                    if iTR.l_effective > 0:

                        # Save l, n, MSA, TRD, scores, sequence_type, position
                        # in sequence of given type
                        iTR.TRD = jTRD

                        # Sanity check repeat and set begin coordinate for
                        # all repeats
                        if not self.repeat_in_sequence(iTR):
                            LOG.debug("The tandem repeat is not part of"
                                      "the sequence. Detector: %s", iTR.TRD)
                            continue

                        repeats.append(iTR)

            # Realignment
            for iTR in repeats:
                if len(iTR.msa) > 2 or len(iTR.msa[0]) > 10:  # TODO: change from 2 to 3
                    # only TRs with at least 3 units with a length > 10 characters will be realigned
                    iTR.msa = repeat_align.realign_repeat(iTR.msa,
                                                          realignment,
                                                          sequence_type,
                                                          rate_distribution=rate_distribution,
                                                          user_path=user_path)
            return repeat_list.RepeatList(repeats)

        else:
            raise Exception("Either require denovo detection, or provide an",
                            "HMM")

[docs]    def get_repeatlist(self, tag):
        """ Retrieve `repeatlist` from this `sequence` instance.

        Retrieve `repeatlist` from this `sequence` instance. Access
        `repeatlist` as self.d_repeatlist[tag]

        Args:
            tag (str): A identifier for the repeat_list

        Returns:
            RepeatList: A repeat_list instance.
        """

        try:
            return self.d_repeatlist[tag]
        except ValueError:
            logging.error("RepeatList %s not in Sequence", tag)

[docs]    def set_repeatlist(self, repeatlist, tag):
        """ Add `repeatlist` as attribute to this `sequence` instance.

        Add `repeatlist` as attribute to this `sequence` instance. Access
        `repeatlist` as self.d_repeatlist[tag]

        Args:
            repeatlist (RepeatList): A repeat_list instance.
            tag (str): A identifier for the repeat_list
        """

        self.d_repeatlist[tag] = repeatlist

    def annotate(self, data, tag):

        self.d_annotations[tag] = data

    def get_annotation(self, tag):

        if tag in self.d_annotations:
            return self.d_annotations[tag]
        else:
            return []

[docs]    def repeat_in_sequence(self, my_repeat):
        """ Sanity check whether the `repeat` is part of this `sequence`. In
        case, calculate the position of the `repeat` within the `sequence`.

        If yes: Return True, set repeat.begin to corrected value if necessary.
        If no: Return False.
        Perform sanity check on sequences where all amino acids are or are
        converted to standard amino acids.

        Args:
            sequence (sequence): A sequence instance.

        Returns:
            bool: True if repeat is part of sequence, else false

        .. todo:: Decide whether save_original_msa is needed here.
        """

        repeat_sequence = repeat.get_repeat_sequence(my_repeat.msa_standard_aa)
        # The first letter in the sequence is counted as 1
        # (not 0, as in Python):
        starts = [m.start() + 1 for m in re.finditer(repeat_sequence,
                                                     self.seq_standard_aa)]

        if len(starts) != 0:  # Is the tandem repeat predicted correctly?
            if not hasattr(my_repeat, "begin") or my_repeat.begin not in starts:
                my_repeat.begin = starts[0]
            my_repeat.save_original_msa(self.seq)
            return True
        else:
            return False