Source code for tral.hmm.hmm

# (C) 2012-2015 Elke Schaper
# (C) 2014 Julia Pecerska

"""
    :synopsis: A cyclic hidden Markov model that describes sequence tandem repeats.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb.sib.ch>
"""

import datetime
import logging
import numpy as np
import os
import pickle
import shutil
import subprocess
import tempfile

from tral.hmm import hmm_io, hmm_viterbi
from tral.repeat.repeat import Repeat
from tral import configuration

LOG = logging.getLogger(__name__)

CONFIG_GENERAL = configuration.Configuration.instance().config
CONFIG = CONFIG_GENERAL["hmm"]


################################### HMM class ############################
[docs]class HMM: """ Sequence profile hidden Markov models (HMMs) are used to model genomic sequence containing a tandem repeat. Note: The HMM implemented here is described in detail in Schaper, E., Gascuel, O. & Anisimova, M. Deep conservation of human protein tandem repeats within the eukaryotes. Molecular Biology and Evolution (2014). All notations used here are based on the notations introduced in Eddy, S. R. Profile hidden Markov models. Bioinformatics 14, 755–763 (1998). The HMM contains flanking states ("N", "C"), match states ("M1", "M2", ...) and insertion states ("I1", "I2", ..). As deletion states are non-emitting, they are merged with the other states. The HMM state "N" emits the sequence before the tandem repeat. The HMM state "C" emits the residual sequence after the tandem repeat. The HMM states "M1", "M2", ... can emit sequence within the tandem repeat. "M1" describes the first amino acid/nucleotide in the repeat unit, "M2" the second, and so on]. The HMM states "I1", "I2", can emit sequence representing insertions within the tandem repeat. The transition probabilities between states are stored in ``p_t``. The emission probabilities of every state are stored in ``p_e``. The probabilities for any state to be the null state are stored in ``p_0``. The structure and probabilities defining standard sequence profile HMMs can be * taken from HMM databases such as PFAM, * calculated from sequence alignments (here: an alignment of TR units) using for example the HMMER suite, * defined in house. All three ways are implemented here. The HMM is used to detect the maximum likelihood occurrence of a TR modeled by the HMM on a sequence. The Viterbi algorithm is implemented to do this detection. Attributes: id (str): The id describes the origin of the HMM. E.g., PFAMID "PF00069". states (list of str): The names of all states that the HMM contains. E.g. ["N", "M1", "I1", "M2", "I2", "C"] insertion_states (list of str): The names of all insertion states that the HMM contains. E.g. ["I1", "I2"] match_states (list of str): The names of all match states that the HMM contains. E.g. ["M1", "M2"] l_effective (int): The number of states in the HMM. E.g. l_effective = 4 alphabet (list of str): The letters that the HMM states can emit. E.g. all amino acids in the LG matrix: ["A", "C", ...] p_t (dict of dict of float): A dictionary {state0: {state1: transition probability, ...}, ...} between all `states` as log10. p_0 (dict of dict of float): A dictionary of the null probabilities to be in any of the `states`: {`states[0]`: 0.1, `states[1]`: 0.2, ...} p_e (dict of dict of float): A dictionary of emission probabilities for every state and every letter in `alphabet`: {`states[0]`: {`alphabet[0]`: emission probability, ...}, ...} hmmer (dict of dict of float): A dictionary of all information extracted from a Hmmer model. """ def __str__(self): """ Create string for HMM instance. """ try: tmp = [self.p_e[i] for i in self.match_states] tmp = "".join([max(i.keys(), key=(lambda key: i[key])) for i in tmp]) hmm = "cpHMM ID: {}\ncpHMM length: {}\n".format( self.id, self.l_effective) hmm += "Most likely motif: {}".format(tmp) except (AttributeError, TypeError): hmm = "<HMM instance>" LOG.warning("Could not create string of HMM instance.") return hmm # Use functions defined in other methods as class functions. # Static function
[docs] @staticmethod def read(hmm_filename, id=None): """Read HMM file in HMMER3 format. HMMER3 file format is described in detail in ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf, section 8. The general file format is:: HMMER3/b [3.0b2 | June 2009] NAME fn3 ACC PF00041.12 DESC Fibronectin type III domain LENG 86 ALPH amino (...) HMM A C D E F G H I (...) Y m->m m->i m->d i->m i->i d->m d->d COMPO 2.70271 4.89246 3.02314 2.64362 3.59817 2.82566 3.74147 3.08574 (...) 3.22607 2.68618 4.42225 2.77519 2.73123 3.46354 2.40513 3.72494 3.29354 (...) 3.61503 0.00338 6.08833 6.81068 0.61958 0.77255 0.00000 * 1 3.16986 5.21447 4.52134 3.29953 4.34285 4.18764 4.30886 3.35801 (...) 3.93889 1 - - 2.68629 4.42236 2.77530 2.73088 3.46365 2.40512 3.72505 3.29365 (...) 3.61514 0.09796 2.38361 6.81068 0.10064 2.34607 0.48576 0.95510 2 2.70230 5.97353 2.24744 2.62947 5.31433 2.60356 4.43584 4.79731 (...) 4.25623 3 - - 2.68618 4.42225 2.77519 2.73123 3.46354 2.40513 3.72494 3.29354 (...) 3.61503 0.00338 6.08833 6.81068 0.61958 0.77255 0.48576 0.95510 (...) 86 3.03720 5.94099 3.75455 2.96917 5.26587 2.91682 3.66571 4.11840 (...) 4.99111 121 - E 2.68618 4.42225 2.77519 2.73123 3.46354 2.40513 3.72494 3.29354 (...) 3.61503 0.00227 6.08723 * 0.61958 0.77255 0.00000 * // .. important:: The probabilities in this format are stored as negative natural log probabilities, e.g. -ln(0.25) = 1.38629. The special case of 0 probability is stored as ``*`` which in fact means -∞ (minus infinity). Args: hmm_filename (str): Path to the file with model data in the HMMER3 file format. id (str, optional): The identifier for the model to be returned. E.g. for Pfam the ``id`` can look like this: 'PF00560'. If defined, the function returns only the HMM with an identifier that matches the provided id. If a model in the file does not have an identifier to check against, it is skipped. Returns: dict: A dictionary of parameters required to initialise the HMM including the id, alphabet, emission probabilities and transition probabilities. Output format:: { 'id': 'PF08261.7', 'letters': ['A', 'C', 'D', ..., 'Y'], 'COMPO': {'insertion_emissions': [2.68618, 4.42225, ..., 3.61503], 'emissions': [2.28205, 5.14899, ..., 1.92022], 'transitions': [0.01467, 4.62483, ..., -inf]}, '1': {'insertion_emissions': [2.68618, 4.42225, ..., 3.61503], 'emissions': [1.00089, 4.54999, ..., 5.23581], 'transitions': [0.01467, 4.62483, ..., 0.95510]}, ... '8': {'insertion_emissions': [2.68618, 4.42225, ..., 3.61503], 'emissions': [4.12723, 5.39816, ..., 4.58094], 'transitions': [0.00990, 4.62006, ..., -inf]} } See Also: hmm_io.read() """ return hmm_io.read(hmm_filename, id)
# Function that take the HMM instance as argument: def viterbi(self, *args): return hmm_viterbi.viterbi(self, *args) def __init__(self, hmmer_probabilities, sequence_type="AA"): """ HMM class __init_ module. __init__ takes HMM parameters (including the alphabet, emission probabilities and transition probabilities) as input, and assigns them to class attributes. Args: hmmer_probabilities (dict): A dictionary with HMM parameters. """ self.hmmer = hmmer_probabilities self.id = hmmer_probabilities['id'] self.alphabet = self.hmmer['letters'] self.sequence_type = sequence_type self.l_effective = max([int(key) for key in hmmer_probabilities.keys() if key.isdigit()]) # Initialise all HMM states to default value (e.g. transition to or # from terminal state all have the same cost 0). self.initialise_HMM_structure(self.l_effective) d_translate_states = { i: i[ 1:] for i in self.match_states + self.insertion_states} for i in self.terminal_states: d_translate_states[i] = "COMPO" # Store null model emission probabilities for ["N","C"] # from hmmer_probabilities["COMPO"] # Store match_state and insertion_state emission probabilities for # ["M0",...] from hmmer_probabilities["0"],... for i_state in self.match_states: i_emission_probabilities = self.hmmer[ d_translate_states[i_state]]['emissions'][:len(self.alphabet)] self.set_emission_probability_hmmer3( i_state, i_emission_probabilities) for i_state in (self.insertion_states + self.terminal_states): i_emission_probabilities = \ self.hmmer[d_translate_states[i_state]]['insertion_emissions'][:len(self.alphabet)] self.set_emission_probability_hmmer3( i_state, i_emission_probabilities) # Store transition probabilities # (m->m m->i m->d i->m i->i d->m d->d): # First: All state defined in HMMer model for i, i_state in enumerate(self.match_states): i_transition_probabilities = self.hmmer[ d_translate_states[i_state]]['transition'] # Completing the circle of the HMM. if i == self.l_effective - 1: i_transition_probabilities = self.set_circle_transition_probability_hmmer3( i_transition_probabilities, d_translate_states[i_state]) LOG.debug("i_state: %s", i_state) LOG.debug( "set_circle_transition_probability_hmmer3: %s", i_transition_probabilities) self.set_transition_probability_hmmer3( i, i_transition_probabilities) LOG.debug("p_t before deletion: %s", self.p_t) # Translate deletion states into direct transitions from match to match # states. if self.l_effective > 1: p_t_d = {} for i, i_match_state in enumerate(self.match_states): p_t_d[i_match_state] = self.get_direct_transition_probabilities_for_deletions( i, i_match_state) for i_match_state in self.match_states: for i_goal_state, iP in p_t_d[i_match_state].items(): self.p_t[i_match_state][i_goal_state] = iP LOG.debug("p_t after deletion: %s", self.p_t) # As it is never likely to got from a terminal state to a deletion # tate or vice versa, this transition probabilities can be ignored. # Thus, you may advance and: # Delete everything you ever knew about deletion states. self.states = [self.terminal_states[ 0]] + self.match_states + self.insertion_states + [self.terminal_states[1]] self.p_t = {i_state: {i: j for i, j in self.p_t[ i_state].items() if i not in self.deletion_states} for i_state in self.states} self.p_0 = { i: j for i, j in self.p_0.items() if i not in self.deletion_states} del self.deletion_states
[docs] @staticmethod def create(input_format, file=None, repeat=None, **kwargs): """ Creates a HMM instance from 2 possible input formats. A `HMM` instance is created from one of the two possible inputs: * HMMER3 model * ``Repeat`` instance Args: input_format (str): The file format of the input file file (str): Path to the file containing the HMM parameters encoded in the HMMER3 format. repeat (Repeat): A Repeat object with an MSA that can be transformed into an HMM. Returns: `HMM`: An initialized instance of the `HMM` class. Raises: Exception: if the HMMER3 file provided does not exist. Exception: if the repeat value provided is not an instance of the Repeat class. Exception: if no parameters are provided to create the HMM from. .. warning:: Will initialize all HMM's in a file but will only return the first one. .. todo:: Fix the previous warning (agree on the way how it should be done first) """ if input_format == 'hmmer': # Read only first element from HMMER3 file. if not os.path.exists(file): raise Exception('HMMER3 file does not exist.') hmmer_probabilities = next(HMM.read(file)) sequence_type = "AA" elif input_format == 'pickle': with open(file, 'rb') as fh: return pickle.load(fh) elif input_format == 'repeat': if not isinstance(repeat, Repeat): raise Exception('The repeat value is not a valid instance of ' 'the Repeat class.') if 'hmmbuild' in kwargs: hmmer_probabilities = HMM.create_from_repeat(repeat, **kwargs['hmmbuild']) else: hmmer_probabilities = HMM.create_from_repeat(repeat) sequence_type = repeat.sequence_type else: raise Exception("Unknown input format: {}.".format(input_format)) LOG.debug(hmmer_probabilities) return HMM(hmmer_probabilities, sequence_type)
[docs] @staticmethod def create_from_repeat(tandem_repeat, hmm_copy_path=None, hmm_copy_id=None): """ Get HMM parameters (including the alphabet, emission probabilities and transition probabilities) from a tandem repeat. An HMM is created from ``tandem_repeat`` using :command:`hmmbuild` and is saven in the HMMER3 file format. Next, HMM parameters are retrieved from the HMMER3 file and returned. Args: tandem_repeat (TR): A Repeat class instance of the TR to be transformed into an HMM. hmm_copy_path (str): Path to where a copy of the created HMM will be stored. If None, no copies are saved. hmm_copy_id (str): HMM id which will serve as the filename in case a copy should be saved. If None and ``hmm_copy_path`` is specified the name is generated from current date and time combination. Returns: HMM parameters (dict): For a sample structure please refer to :py:hmm_io.read:. .. todo:: Make sure the id is correctly read in. """ timestamp = datetime.datetime.now() tmp_id = timestamp.strftime('%Y:%m:%d:%H:%M:%S:%f') # Create a temporary directory tmp_dir = tempfile.mkdtemp() # Save TR as Stockholm file stockholm_file = os.path.join(tmp_dir, tmp_id + ".sto") # O repeat_io.save_repeat_stockholm(tandem_repeat.msaD, stockholm_file) tandem_repeat.write(file=stockholm_file, file_format="stockholm") # Define hmmbuild sequence type flag. if tandem_repeat.sequence_type == "AA": sequence_type_flag = "--amino" elif tandem_repeat.sequence_type == "RNA": sequence_type_flag = "--rna" else: sequence_type_flag = "--dna" # Run HMMbuild to build a HMM model, and read model p = subprocess.Popen([CONFIG["hmmbuild"], sequence_type_flag, tmp_id + ".hmm", tmp_id + ".sto"], stdout=subprocess.PIPE, stderr=None, cwd=tmp_dir) p.wait() hmm_file = os.path.join(tmp_dir, tmp_id + ".hmm") if hmm_copy_path: if not os.path.exists(hmm_copy_path): LOG.critical('Specified path for the file copy does not ' 'exist, not saving copy: %s.', hmm_copy_path) else: if hmm_copy_id: shutil.copy(hmm_file, os.path.join(hmm_copy_path, hmm_copy_id + ".hmm")) else: shutil.copy(hmm_file, hmm_copy_path) hmmer_probabilities = next( HMM.read( hmm_filename=hmm_file, id=hmm_copy_id)) shutil.rmtree(tmp_dir) return hmmer_probabilities
[docs] def write(self, file, output_format, *args): """ Write ``HMM`` to file. Write ``HMM`` to file. Currently, only pickle is implemented. Args: file (str): Path to input file output_format (str): Either "fasta", "pickle" or "stockholm" .. todo:: Write checks for ``format`` and ``file``. """ if output_format == 'pickle': with open(file, 'wb') as fh: pickle.dump(self, fh) else: raise Exception('output_format is unknown: {}'.format(output_format))
[docs] def hmm_example(self): """ .. todo:: Is this method needed? """ # states = ["N", "B", "M1", "M2", "M3", "E", "C"] self.states = ["N", "M1", "M2", "C"] # Initialisation self.p_t = {iS: {iS2: 0 for iS2 in self.states} for iS in self.states[:-1]} self.p_0 = {iS: 1 / 3 for iS in self.states} # Transition probabilities # Feed Values to p_t self.p_t["N"] = {"N": 0.5, "M1": 0.5} self.p_t["M1"] = {"M2": 0.5, "C": 0.5} self.p_t["M2"] = {"M1": 0.5, "C": 0.5} self.p_t["C"] = {"C": 1} # emissions self.emissions = ["A", "C", "G", "T"] # emission probabilities self.p_e = {iS: {iE: 0.25 for iE in self.emissions} for iS in self.states} self.p_e['M1'] = {"A": 0.9, "C": 0.025, "G": 0.025, "T": 0.025} self.p_e['M2'] = {"A": 0.025, "C": 0.9, "G": 0.025, "T": 0.025}
[docs] def initialise_HMM_structure(self, l_effective): """ Initialise the HMM with all states and standard values for transition and emission probabilities. Initialise all states Set transition probabilities to None/0 Set initial probability for all states equal. Args: l_effective (int): The number of match states in the HMM. """ # Build a HMM from these likelihoods. self.match_states = ["M{0}".format(str(i + 1)) for i in range(l_effective)] self.insertion_states = ["I{0}".format(str(i + 1)) for i in range(l_effective)] self.deletion_states = ["D{0}".format(str(i + 1)) for i in range(l_effective)] self.terminal_states = ["N", "C"] self.states = [self.terminal_states[0]] + self.match_states + \ self.insertion_states + self.deletion_states + [self.terminal_states[1]] LOG.debug("HMM states: %s", self.states) # Initialisation # The transition probability is initially set to None for all states. self.p_t = {iS: {iS2: None for iS2 in self.states} for iS in self.states} # The initial probability is equal in all states. (np.log10(1)=0) self.p_0 = {iS: 0 for iS in self.states} # Transition probabilities # First, mark which transitions are not penalised, # that is bear a 'probability of np.log10(1)=0': # - Transitions from "N" to any "M_k" # - Transitions from any "M_k" to "C" # - Transitions from "M_k" to "M_k+1" (for k`n) and "M_k" to "M_0" (for k==n) # - Transitions from "N" to "N" # - Transitions from "C" to "C" for i in range(l_effective): self.p_t["N"]["M{0}".format(i % l_effective + 1)] = 0 self.p_t["M{0}".format(i % l_effective + 1)]["C"] = 0 self.p_t["M{0}".format(i % l_effective + 1)]["M{0}".format((i + 1) % l_effective + 1)] = 0 self.p_t["N"]["N"] = 0 self.p_t["C"]["C"] = 0 self.p_e = {}
[docs] def set_emission_probability_hmmer3(self, state, l_emission_probabilities): """ Convert and set emission probabilities from HMMER3 file to class attribute. Set p_e of states to `l_emission_probabilities` for `state` given `self.alphabet`. In HMMER3 data, emission probabilities are -ln(p). It is e.g.:: self.alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'] Return log10(p), that is convert between the two. Conversion: p_Local = - p_HMM * log10(e) Args: state (str): The state of the HMM. E.g. the fourth match state 'M4'. l_emission_probabilities (list of float): E.g. ['3.27687', '2.31397', '3.32252', '3.12746', '2.89175', '3.34719', '2.28730', '3.54139', '2.53154', '2.64774', '3.75733', '3.23860', '3.57894', '3.26290', '2.66343', '2.61544', '2.91770', '3.26739', '5.09378', '3.16816'] """ l_emission_probabilities = \ [-(i) * np.log10(np.exp(1)) for i in l_emission_probabilities] self.p_e[state] = \ {iL: iEP for iL, iEP in zip(self.alphabet, l_emission_probabilities)}
[docs] def set_circle_transition_probability_hmmer3( self, l_transition_probabilities, final_state): """ Calculate transition probabilites between states that exist due to the circularity of the tandem repeat HMM. In HMMER3 models, no transition in circle from the last match state to the first match state are included. Values need to be found for the following transitions: m->m m->i m->d i->m i->i d->m d->d Compare ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf pp.84-85. Input: -ln(p) Output: Also -ln(p') When the HMM only has a single match state, i.e. l_effective == 1, the final match state equals the only match state, and no transition probabilities can be calculated via averaging over all other states. Therefore, the its transition probabilities remain unchanged. Args: l_transition_probabilities (list of float): The first parameter. final_state (str): The second parameter. Returns: list of float: transition probabilities """ if self.l_effective == 1: return l_transition_probabilities # In self.hmmer, besides the probabilities for states 1, 2, 3, ... data # for "COMPO", 'letters', and 'id' are saved and should be ignored in # the following. # Also, we ignore the transition probabilities for final_state, as our # goal here is to recalculate these based on the averages of the # transition probabilities from all other states. skip_states = ['COMPO', 'letters', final_state, 'id'] # M->M, M->I, M->D: Use an average from all other transitions from # match states. LOG.debug("self.hmmer.keys(): %s", self.hmmer.items()) LOG.debug("self.hmmer.items(): %s", self.hmmer.items()) mean_M_M = np.mean([np.exp(-iP["transition"][0]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) mean_M_I = np.mean([np.exp(-iP["transition"][1]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) mean_M_D = np.mean([np.exp(-iP["transition"][2]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) sum_p = np.sum([mean_M_M, mean_M_I, mean_M_D]) l_transition_probabilities[ :3] = [-np.log(i / sum_p) for i in [mean_M_M, mean_M_I, mean_M_D]] # I->M and I->I: Use an average from all other transitions from # insertion states. mean_I_M = np.mean([np.exp(-iP["transition"][3]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) mean_I_I = np.mean([np.exp(-iP["transition"][4]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) sum_p = np.sum([mean_I_M, mean_I_I]) l_transition_probabilities[ 3:5] = [-np.log(i / sum_p) for i in [mean_I_M, mean_I_I]] # D->M and D->D: Use an average from all other transitions from # deletion states. mean_D_M = np.mean([np.exp(-iP["transition"][5]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) mean_D_D = np.mean([np.exp(-iP["transition"][6]) for i_state, iP in self.hmmer.items() if i_state not in skip_states]) sum_p = np.sum([mean_D_M, mean_D_D]) l_transition_probabilities[ 5:] = [-np.log(i / sum_p) for i in [mean_D_M, mean_D_D]] return l_transition_probabilities
[docs] def set_transition_probability_hmmer3( self, state_index, l_transition_probabilities): """ Convert and assign transition probabilities from a HMMER3 file to HMM attribute `p_t`. Convert and assign transition probabilities from a HMMER3 file to HMM attribute `p_t`. Set ``p_t`` of states to ``l_transition_probabilities`` for the ``state_index`` th state. In HMMER3 data, transition probabilities are -ln(p). Return log10(p), i.d. convert between the two. Conversion: p_Local = - p_HMM * log10(e) Args: state_index (int): E.g. 4 l_transition_probabilities (list of float): A list of transition probabilities as found in a single row for a certain state in HMMER3 files. E.g. ['0.00021', '8.85487', '9.57722', '0.61958', '0.77255', '0.48576', '0.95510'] """ l_transition_probabilities = [-(i) * np.log10(np.exp(1)) for i in l_transition_probabilities] # Match state state_index -> Match state state_index + 1 iM = self.match_states[state_index] iM_1 = self.match_states[(state_index + 1) % self.l_effective] self.p_t[iM][iM_1] = l_transition_probabilities[0] # Match state state_index -> Insertion state state_index self.p_t[iM][ self.insertion_states[state_index]] = l_transition_probabilities[1] # Match state state_index -> Deletion state state_index + 1 iD_1 = self.deletion_states[(state_index + 1) % self.l_effective] self.p_t[iM][iD_1] = l_transition_probabilities[2] # Insertion state state_index -> Match state state_index + 1 iI = self.insertion_states[state_index] self.p_t[iI][iM_1] = l_transition_probabilities[3] # Insertion state state_index -> Insertion state state_index iI = self.insertion_states[state_index] self.p_t[iI][iI] = l_transition_probabilities[4] # Deletion state state_index -> Match state state_index + 1 iD = self.deletion_states[state_index] self.p_t[iD][iM_1] = l_transition_probabilities[5] # Deletion state state_index -> Deletion state state_index + 1 self.p_t[iD][iD_1] = l_transition_probabilities[6]
[docs] def get_direct_transition_probabilities_for_deletions( self, state_index, state): """ Calculate updated transition probabilities for HMMs without deletion states. Calculate updated transition probabilities for HMMs without deletion states. Deletion states are non-emitting states. Therefore, they can be merged into the remaining emitting states. E.g. M -> D -> I would be equivalent to M -> I. The total transition probability from M to I would then be the product of the transition probabilities from M to D and from D to I (equivalent to the sum of the log10 probabilities.). This method translates all deletions following `state` with `state_index` into direct match state to match state transitions, and returns the new transition probabilities. Args: state_index (int): The first parameter. state (str): The second parameter. Returns: {state1: float}: The transition probability from `state` with `state_index` to `state1` is calculated for all (?) possible `state1`. """ transition = {} p_deletion_state = self.p_t[state][ self.deletion_states[ (state_index + 1) % self.l_effective]] for i_state_index in range(1, self.l_effective): i_deleted_state_index = (state_index + i_state_index) % self.l_effective i_deleted_state = self.deletion_states[i_deleted_state_index] i_goal_state_index = (state_index + i_state_index + 1) % self.l_effective i_goal_state = self.match_states[i_goal_state_index] transition[i_goal_state] = p_deletion_state + \ self.p_t[i_deleted_state][i_goal_state] p_deletion_state += self.p_t[i_deleted_state][ self.deletion_states[i_goal_state_index]] return transition
############################### External parameters ######################
[docs]def hmmer3_emission_probabilities(hmmer_probabilities, letters, l_match): ''' Get emission probabilities from hmmer3 hmm file. In hmm file, emission probabilities are -ln(p). Return log10(p), i.d. convert between the two. Conversion: p_Local = - p_HMM * log10(e) Parameters (e.g.):: letters = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y'] l_match = ['M'+str(i) for i in range(24)] Return format (pseudo code): [{iA: np.log10(p(iA,iM)) for iA in alphabet.keys()} for iM in l_match] ''' # Test: Is the number of match states in both models equal? if not len(hmmer_probabilities.keys()) - 2 == len(l_match): print('Match states HMMER: {0} Match states local: {1}'.format( len(hmmer_probabilities.keys()) - 2, len(l_match))) raise ValueError('The number of match states in HMMer model and local' 'model does not match') # Test: Are all `letters` represented in the HMMER HMM? if any(iL not in hmmer_probabilities['letters'] for iL in letters): missing = [ iL for iL in letters if iL not in hmmer_probabilities['letters']] print('Missing representation in Hmmer File: {0}'.format(missing)) raise ValueError('Some letters in the local HMM are not represented', 'in the HMMER HMM.') return [{iL: -(iP) * np.log10(np.exp(1)) for iL, iP in zip(hmmer_probabilities['letters'], data['emissions']) if iL in letters} for key, data in hmmer_probabilities.items() if key not in ['letters', 'COMPO']]