
# (C) 2011 Alexander Korsunsky
# (C) 2011-2015 Elke Schaper

"""
    :synopsis: The repeat score module.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb-sib.ch>
"""

import collections
import logging
import numpy as np
import os
import re
import scipy.stats
import scipy.special
import scipy.linalg

from tral import configuration
from tral.paths import config_file

LOG = logging.getLogger(__name__)

CONFIG_GENERAL = configuration.Configuration.instance().config
CONFIG = CONFIG_GENERAL["repeat_score"]

# ############# REPEAT SCORE CALCULATION FUNCTIONS ############################
# ############# 1. SIMILARITY SCORES ##########################################


def load_equilibrium_freq(file_name):
    """ Load equilibrium frequencies from a substitution matrix file.

    Load equilibrium frequencies from a substitution matrix file. Note that
    with minor changes, the whole amino acid substitution matrix could be
    returned.

    Args:
        file_name (str): Path to the substitution matrix file, e.g. LG.dat.

    Returns:
        (dict of str, float): Dictionary of one-letter amino acid codes and
        their equilibrium frequencies.
    """
    patstr_freqline = r"[+-]?\d+(?:\.\d+)?\s*"
    with open(file_name, "r") as f:
        # Create an empty 20x20 matrix. Use a comprehension so that the rows
        # are independent lists, not twenty references to the same list.
        substitution_matrix = [[0.0] * 20 for _ in range(20)]
        row_count = 0
        for row in f:
            row = row.strip()
            pat = r"\s*" + patstr_freqline * (row_count + 1)
            match = re.findall(pat, row)
            if not match:
                continue
            # Store the parsed line in the matrix.
            substitution_matrix[row_count][0:row_count + 1] = \
                map(float, row.split())
            row_count = row_count + 1
            # Read the 19 lines of the lower-triangular substitution matrix.
            if row_count == 19:
                break
        else:  # EOF before the substitution matrix was read completely.
            raise ValueError(
                "Input file is malformed! Did not find enough lines for the "
                "substitution matrix.")

        frequencies = []
        for row in f:
            matches = re.findall(patstr_freqline, row)
            frequencies.extend(map(float, matches))
            if len(frequencies) >= 20:
                break
        else:  # EOF before the frequency line was read completely.
            raise ValueError(
                "Input file is malformed! Did not find enough numbers for "
                "the equilibrium frequencies.")

        substitution_matrix[19][0:20] = frequencies[0:20]

    # return substitution_matrix
    return {aa: freq for aa, freq in
            zip(["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                 "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"],
                substitution_matrix[19])}
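
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the layout that load_equilibrium_freq() expects, i.e. a
# PAML-style .dat file with a lower-triangular matrix of exchangeabilities
# (19 rows) followed by a line of 20 equilibrium frequencies. The numbers
# below are made up.
def _example_load_equilibrium_freq():
    import tempfile
    with tempfile.NamedTemporaryFile("w", suffix=".dat", delete=False) as f:
        for i in range(1, 20):
            f.write(" ".join(["0.5"] * i) + "\n")   # triangular matrix
        f.write(" ".join(["0.05"] * 20) + "\n")     # equilibrium frequencies
    frequencies = load_equilibrium_freq(f.name)
    os.remove(f.name)
    return frequencies  # {'A': 0.05, 'R': 0.05, ..., 'V': 0.05}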
def mean_similarity(repeat, measure_of_similarity,
                    ignore_gaps=CONFIG['indel'].as_bool('ignore_gaps')):
    """ Calculate the mean similarity of all columns of the multiple
    sequence alignment in ``repeat`` according to ``measure_of_similarity``.

    Args:
        repeat (Repeat): An instance of the Repeat class.
        measure_of_similarity (callable): One of ``entropy``, ``parsimony``
            or ``pSim``.
        ignore_gaps (bool): Are gaps ("-") treated as chars (False) or
            ignored (True)?

    Returns:
        float: A similarity measure for the repeat units in ``repeat``.
    """
    if not ignore_gaps:
        return sum(map(measure_of_similarity, repeat.msaTD)) / \
            float(repeat.l)
    else:
        # Note: on Python 2 this would be integer division.
        return sum(map(
            lambda column: measure_of_similarity(column.replace("-", "")),
            repeat.msaTD)) / repeat.l_effective
def entropy(column):
    """ Calculate the entropy of a list.

    Calculate the entropy of ``column``. Assume that the frequency of
    elements in the list reflects a probability density estimator.

    For an exact description see:

    Schaper, E., Kajava, A., Hauser, A., & Anisimova, M. Repeat or not
    repeat? --Statistical validation of tandem repeat prediction in genomic
    sequences. Nucleic Acids Research (2012).

    Args:
        column (list of str): List of elements that can be ordered in a set,
            e.g. str or int.

    Returns:
        float: The entropy of ``column``.
    """
    estimatedP = [(column.count(a) / float(len(column))) for a in set(column)]
    return sum([-p * np.log2(p) for p in estimatedP if p > 0])
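
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the entropy of a hand-made alignment column. For element
# frequencies 0.75 and 0.25, the expected value is
# -(0.75 * log2(0.75) + 0.25 * log2(0.25)) ~ 0.811.
def _example_entropy():
    column = ["A", "A", "A", "C"]
    return entropy(column)  # ~0.811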
def parsimony(column):
    """ Calculate the parsimony score of a list.

    Calculate the parsimony score on ``column``: Count the number of changes
    in a ``column`` of a multiple sequence alignment and divide by the
    maximally possible number of changes.

    For an exact description see:

    Schaper, E., Kajava, A., Hauser, A., & Anisimova, M. Repeat or not
    repeat? --Statistical validation of tandem repeat prediction in genomic
    sequences. Nucleic Acids Research (2012).

    Args:
        column (list of str): A ``list`` of two or more elements that can be
            ordered in a ``set``, e.g. ``str`` or ``int``.

    Returns:
        float: The parsimony score of ``column``.

    .. todo:: At which step shall we assert that the length of ``column``
        is > 1?
    """
    if len(column) > 1:
        # Here, 0 means no changes and 1 only changes.
        # Note: on Python 2 this would be integer division.
        return (len(set(column)) - 1) / (len(column) - 1)
    else:
        LOG.warning("Repeats with n = 1 are not repeats")
        # raise AssertionError("Repeats with n = 1 are not repeats")
        return 1
def pSim(column):
    """ Calculate the pSim score of a list.

    Calculate the pSim score on ``column``: The frequency of the most
    frequent element in a ``column`` of a multiple sequence alignment.

    For an exact description see:

    Schaper, E., Kajava, A., Hauser, A., & Anisimova, M. Repeat or not
    repeat? --Statistical validation of tandem repeat prediction in genomic
    sequences. Nucleic Acids Research (2012).

    Args:
        column (list of str): A list of elements.

    Returns:
        float: The pSim score of ``column``.
    """
    return collections.Counter(column).most_common(1)[0][1] / \
        float(len(column))
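
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the three column similarity measures side by side on the
# same hand-made alignment column. For a perfectly conserved column,
# entropy and parsimony are 0, whereas pSim is 1.
def _example_column_measures():
    column = ["Q", "Q", "Q", "Q", "E"]
    return {"entropy": entropy(column),      # ~0.722
            "parsimony": parsimony(column),  # (2 - 1) / (5 - 1) = 0.25
            "pSim": pSim(column)}            # 4 / 5 = 0.8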
# ############# 2. MAXIMUM LIKELIHOOD CALCULATIONS ############################
def optimisation(function, args,
                 start_min=CONFIG['optimisation'].as_float('start_min'),
                 start_max=CONFIG['optimisation'].as_float('start_max'),
                 n_iteration=CONFIG['optimisation'].as_int('n_iteration')):
    """ Binary one-dimensional optimisation of ``function``.

    Perform a binary one-dimensional optimisation of the parameter ``t`` of
    ``function``, with the additional list of arguments ``args``: Start on
    the range [``start_min``, ``start_max``] and iterate ``n_iteration``
    times. Return the maximising parameter ``t`` together with the maximum
    of the function as a tuple.

    In addition to ``start_min`` and ``start_max``, hard boundaries on the
    optimisation parameter ``t`` are hard-coded: t in [0.0000000001, 100].

    Args:
        function (callable): Method with input (float, ``args``) and float
            output.
        args (list): List of additional arguments for ``function``.

    Kwargs:
        start_min (float): Minimum start value of the parameter ``t``.
        start_max (float): Maximum start value of the parameter ``t``.
        n_iteration (int): Number of iterations until the optimisation
            result is returned.

    Returns:
        tuple (float, float): The maximising parameter ``t`` and the
        corresponding maximum of ``function``.

    .. todo:: Check why ``region_bounded`` is hard-coded at current.
    .. todo:: Check why ``stepsize`` is hard-coded at current.
    .. todo:: Generalise the hard-coded optimisation parameter boundaries.
    """
    t = [[start_min], [(start_min + start_max) / 2], [start_max]]
    stepsize = 2
    for iT in t:
        iT.append(function(iT[0], *args))

    region_bounded = False
    count = 0
    while count < n_iteration:
        count += 1
        if not region_bounded:
            # First entry is best - shift the trial zone to the left.
            if t[0][1] > t[1][1]:
                t[2] = t[1][:]
                t[1] = t[0][:]
                t[0] = [max(0.0000000001, t[0][0] - stepsize)]
                t[0].append(function(t[0][0], *args))
            # Last entry is best - shift the trial zone to the right.
            elif t[2][1] > t[1][1]:
                t[0] = t[1][:]
                t[1] = t[2][:]
                t[2] = [min(100, t[2][0] + stepsize)]
                t[2].append(function(t[2][0], *args))
            else:
                region_bounded = True
        if region_bounded:
            # Calculate the two half points.
            half_points = [[(t[0][0] + t[1][0]) / 2],
                           [(t[1][0] + t[2][0]) / 2]]
            for iT in half_points:
                iT.append(function(iT[0], *args))
            # First half point is best - make it the next mid point.
            if half_points[0][1] > t[1][1]:
                t[2] = t[1][:]
                t[1] = half_points[0][:]
            # Last half point is best - make it the next mid point.
            elif half_points[1][1] > t[1][1]:
                t[0] = t[1][:]
                t[1] = half_points[1][:]
            # The former mid point is still best - tighten the boundaries.
            else:
                t[0] = half_points[0][:]
                t[2] = half_points[1][:]
    return (t[1][0], t[1][1])
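
# Illustrative sketch (not part of the original module; the helper and the
# parabola are hypothetical): optimising a simple concave function with a
# known maximum. The parabola below peaks at t = 3, inside the hard-coded
# bounds t in [1e-10, 100], so with the configured number of iterations
# optimisation() should return a parameter near 3 and a maximum near 5.
def _example_optimisation():
    def parabola(t, offset):
        return offset - (t - 3) ** 2

    t_max, f_max = optimisation(function=parabola, args=[5])
    return t_max, f_max  # approximately (3.0, 5.0)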
def load_model(evolution_model='lg'):
    """ Prepare the model of repeat evolution: return the rescaled rate
    matrix ``Q``, the equilibrium frequencies ``eq_freq`` and the
    ``alphabet`` for ``evolution_model``. """
    if evolution_model == 'lg':
        # 1. Substitutions are modeled by the LG matrix.
        aa, eq_freq, substitution_matrix = LG()
    else:
        aa, eq_freq, substitution_matrix = K80()

    # equilibrium_freq = {i: j for i, j in zip(aa, eq_freq)}
    alphabet = {b: a for a, b in enumerate(aa)}
    length = len(aa)
    eq_freq = np.array(eq_freq)
    eq_freq = eq_freq / sum(eq_freq)
    S = np.array(substitution_matrix)
    PI = np.diag(eq_freq)
    Q = np.dot(S, PI)
    # Set the diagonal of Q such that all rows sum to zero.
    for i in range(length):
        Q[i, i] = - sum(Q[i][:i]) - sum(Q[i][i + 1:])
    # Rescale Q by the expected substitution rate at equilibrium.
    diagonal = [Q[i, i] for i in range(length)]
    Q *= - sum(i * j for i, j in zip(diagonal, eq_freq))
    return (Q, eq_freq, alphabet)
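
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): basic sanity checks on the rate matrix returned by
# load_model(). Rows of a rate matrix Q sum to zero, and since Q is built
# from a symmetric exchangeability matrix S as Q = S * diag(pi), the
# equilibrium condition pi * Q = 0 holds up to numerical error.
def _example_load_model():
    Q, eq_freq, alphabet = load_model('lg')
    assert np.allclose(Q.sum(axis=1), 0)            # rows sum to zero
    assert np.allclose(np.dot(eq_freq, Q), 0, atol=1e-8)
    return Q.shape, len(alphabet)  # ((20, 20), 20)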
def TN93(alpha_1=float(CONFIG['TN93']['alpha_1']),
         alpha_2=float(CONFIG['TN93']['alpha_2']),
         beta=float(CONFIG['TN93']['beta'])):
    """ Return the alphabet, equilibrium frequencies and rate matrix of the
    TN93 nucleotide substitution model. """
    # 4x4 matrix, e.g. with alpha_1 = 0.3, alpha_2 = 0.4 and beta = 0.7
    # according to TN93.
    Q = [[-(alpha_1 + 2 * beta), alpha_1, beta, beta],
         [alpha_1, -(alpha_1 + 2 * beta), beta, beta],
         [beta, beta, -(alpha_2 + 2 * beta), alpha_2],
         [beta, beta, alpha_2, -(alpha_2 + 2 * beta)]]
    return ['T', 'C', 'A', 'G'], [0.25, 0.25, 0.25, 0.25], Q
def K80(kappa=float(CONFIG['K80']['kappa'])):
    """ Return the K80 nucleotide substitution model as the special case of
    TN93 with alpha_1 = alpha_2 = kappa. """
    return TN93(alpha_1=kappa, alpha_2=kappa, beta=1)
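
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): K80 collapses TN93 to a single transition/transversion
# ratio kappa. With kappa = 2, transitions (T<->C, A<->G) are twice as fast
# as transversions.
def _example_k80():
    nucleotides, eq_freq, Q = K80(kappa=2)
    return Q[0][1], Q[0][2]  # transition rate 2, transversion rate 1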
def LG():
    """ Return the alphabet, equilibrium frequencies and exchangeability
    matrix of the LG amino acid substitution model. """
    return (['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
             'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'],
            [0.079066, 0.055941, 0.041977, 0.053052, 0.012937,
             0.040767, 0.071586, 0.057337, 0.022355, 0.062157,
             0.099081, 0.0646, 0.022951, 0.042302, 0.04404,
             0.061197, 0.053287, 0.012066, 0.034155, 0.069147],
            [[-16.060388, 0.425093, 0.276818, 0.395144, 2.489084, 0.969894,
              1.038545, 2.06604, 0.358858, 0.14983, 0.395337, 0.536518,
              1.124035, 0.253701, 1.177651, 4.727182, 2.139501, 0.180717,
              0.218959, 2.54787],
             [0.425093, -21.193959, 0.751878, 0.123954, 0.534551, 2.807908,
              0.36397, 0.390192, 2.426601, 0.126991, 0.301848, 6.326067,
              0.484133, 0.052722, 0.332533, 0.858151, 0.578987, 0.593607,
              0.31444, 0.170887],
             [0.276818, 0.751878, -17.692284, 5.076149, 0.528768, 1.695752,
              0.541712, 1.437645, 4.509238, 0.191503, 0.068427, 2.145078,
              0.371004, 0.089525, 0.161787, 4.008358, 2.000679, 0.045376,
              0.612025, 0.083688],
             [0.395144, 0.123954, 5.076149, -24.184967999999994, 0.062556,
              0.523386, 5.24387, 0.844926, 0.927114, 0.01069, 0.015076,
              0.282959, 0.025548, 0.017416, 0.394456, 1.240275, 0.42586,
              0.02989, 0.135107, 0.037967],
             [2.489084, 0.534551, 0.528768, 0.062556, -13.058626, 0.084808,
              0.003499, 0.569265, 0.640543, 0.320627, 0.594007, 0.013266,
              0.89368, 1.105251, 0.075382, 2.784478, 1.14348, 0.670128,
              1.165532, 1.959291],
             [0.969894, 2.807908, 1.695752, 0.523386, 0.084808,
              -18.724285999999996, 4.128591, 0.267959, 4.813505, 0.072854,
              0.582457, 3.234294, 1.672569, 0.035855, 0.624294, 1.223828,
              1.080136, 0.236199, 0.257336, 0.210332],
             [1.038545, 0.36397, 0.541712, 5.24387, 0.003499, 4.128591,
              -23.8339, 0.348847, 0.423881, 0.044265, 0.069673, 1.807177,
              0.173735, 0.018811, 0.419409, 0.611973, 0.604545, 0.077852,
              0.120037, 0.245034],
             [2.06604, 0.390192, 1.437645, 0.844926, 0.569265, 0.267959,
              0.348847, -16.613733, 0.311484, 0.008705, 0.044261, 0.296636,
              0.139538, 0.089586, 0.196961, 1.73999, 0.129836, 0.268491,
              0.054679, 0.076701],
             [0.358858, 2.426601, 4.509238, 0.927114, 0.640543, 4.813505,
              0.423881, 0.311484, -6.179003, 0.108882, 0.366317, 0.697264,
              0.442472, 0.682139, 0.508851, 0.990012, 0.584262, 0.597054,
              5.306834, 0.119013],
             [0.14983, 0.126991, 0.191503, 0.01069, 0.320627, 0.072854,
              0.044265, 0.008705, 0.108882, -22.217359, 4.145067, 0.159069,
              4.273607, 1.112727, 0.078281, 0.064105, 1.033739, 0.11166,
              0.232523, 10.649107],
             [0.395337, 0.301848, 0.068427, 0.015076, 0.594007, 0.582457,
              0.069673, 0.044261, 0.366317, 4.145067, -31.146227, 0.1375,
              6.312358, 2.592692, 0.24906, 0.182287, 0.302936, 0.619632,
              0.299648, 1.702745],
             [0.536518, 6.326067, 2.145078, 0.282959, 0.013266, 3.234294,
              1.807177, 0.296636, 0.697264, 0.159069, 0.1375, -19.90551,
              0.656604, 0.023918, 0.390322, 0.748683, 1.136863, 0.049906,
              0.131932, 0.185202],
             [1.124035, 0.484133, 0.371004, 0.025548, 0.89368, 1.672569,
              0.173735, 0.139538, 0.442472, 4.273607, 6.312358, 0.656604,
              -24.724051, 1.798853, 0.099849, 0.34696, 2.020366, 0.696175,
              0.481306, 1.898718],
             [0.253701, 0.052722, 0.089525, 0.017416, 1.105251, 0.035855,
              0.018811, 0.089586, 0.682139, 1.112727, 2.592692, 0.023918,
              1.798853, -23.599417, 0.094464, 0.361819, 0.165001, 2.457121,
              7.803902, 0.654683],
             [1.177651, 0.332533, 0.161787, 0.394456, 0.075382, 0.624294,
              0.419409, 0.196961, 0.508851, 0.078281, 0.24906, 0.390322,
              0.099849, 0.094464, -16.74927, 1.338132, 0.571468, 0.095131,
              0.089613, 0.296501],
             [4.727182, 0.858151, 4.008358, 1.240275, 2.784478, 1.223828,
              0.611973, 1.73999, 0.990012, 0.064105, 0.182287, 0.748683,
              0.34696, 0.361819, 1.338132, -12.953694999999998, 6.472279,
              0.248862, 0.400547, 0.098369],
             [2.139501, 0.578987, 2.000679, 0.42586, 1.14348, 1.080136,
              0.604545, 0.129836, 0.584262, 1.033739, 0.302936, 1.136863,
              2.020366, 0.165001, 0.571468, 6.472279, -20.294897, 0.140825,
              0.245841, 2.188158],
             [0.180717, 0.593607, 0.045376, 0.02989, 0.670128, 0.236199,
              0.077852, 0.268491, 0.597054, 0.11166, 0.619632, 0.049906,
              0.696175, 2.457121, 0.095131, 0.248862, 0.140825, -19.666723,
              3.151815, 0.18951],
             [0.218959, 0.31444, 0.612025, 0.135107, 1.165532, 0.257336,
              0.120037, 0.054679, 5.306834, 0.232523, 0.299648, 0.131932,
              0.481306, 7.803902, 0.089613, 0.400547, 0.245841, 3.151815,
              -20.084989000000004, 0.249313],
             [2.54787, 0.170887, 0.083688, 0.037967, 1.959291, 0.210332,
              0.245034, 0.076701, 0.119013, 10.649107, 1.702745, 0.185202,
              1.898718, 0.654683, 0.296501, 0.098369, 2.188158, 0.18951,
              0.249313, -18.608258]])
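
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the LG equilibrium frequencies are probabilities and should
# sum to (approximately) 1, and the exchangeability matrix is symmetric.
def _example_lg():
    aa, eq_freq, S = LG()
    assert abs(sum(eq_freq) - 1) < 1e-4
    assert np.allclose(np.array(S), np.array(S).T)
    return len(aa)  # 20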
def loglikelihood_substitution(t, Q, eq_freq, alphabet, tandem_repeat):
    """ Calculate the log-likelihood of a repeat assuming a star tree and a
    sequence model defined by ``Q``, ``t``, ``eq_freq`` and ``alphabet``.

    Args:
        t (float): The divergence of the ``tandem_repeat`` units.
        Q (np.ndarray): A substitution rate matrix.
        eq_freq (np.ndarray): Equilibrium frequencies of all chars.
        alphabet (dict of str, int): An alphabet from a substitution matrix,
            mapping each char to its index.
        tandem_repeat (Repeat): An instance of the ``Repeat`` class.

    Returns:
        float: The log-likelihood value for the tandem repeat.
    """
    P = scipy.linalg.expm(Q * t)

    # msaTDN is a list of tandem_repeat.n numpy arrays.
    # Each numpy array represents the elements of the alphabet. The entries
    # are the counts of the alphabet elements in the respective tandem
    # repeat column.
    if not hasattr(tandem_repeat, 'msaTDN'):
        tandem_repeat.msaTDN = []
        for column in tandem_repeat.msaTD_standard_aa:
            column = column.replace("-", "")
            my_diag = {i: 0 for i in range(len(alphabet.keys()))}
            for iN in column:
                my_diag[alphabet[iN]] = my_diag[alphabet[iN]] + 1
            tandem_repeat.msaTDN.append(
                np.array([j for j in my_diag.values()]))

    # First sum: over all sites.
    # Second sum: over all possible ancestral characters.
    # Third product: over all repeat units.
    likelihood = sum(np.log(
        sum(iJ * np.product(np.power(P[i], column))
            for i, iJ in enumerate(eq_freq)))
        for column in tandem_repeat.msaTDN)
    return likelihood
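
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the substitution probability matrix P = expm(Q * t) used
# above is row-stochastic, and approaches the identity matrix as the
# divergence t goes to 0.
def _example_transition_probabilities(t=0.5):
    Q, eq_freq, alphabet = load_model('lg')
    P = scipy.linalg.expm(Q * t)
    assert np.allclose(P.sum(axis=1), 1)  # each row is a distribution
    assert np.allclose(scipy.linalg.expm(Q * 1e-8), np.eye(20), atol=1e-6)
    return P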
def loglikelihood_gaps_starphylogeny_zipfian(
        t, tandem_repeat,
        indel_rate_per_site=CONFIG['indel'].as_float('indel_rate_per_site'),
        gaps=CONFIG['indel']['gaps'],
        indel_zipf=CONFIG['indel'].as_float('zipf')):
    """ Calculate the log-likelihood of the gap structure of a tandem repeat
    assuming a star tree.

    Calculate the log-likelihood of the gap structure of a ``tandem_repeat``
    assuming a star tree (i.e. repeat unit correlations are not modeled).
    The gap length model is a Zipfian distribution with parameter
    ``indel_zipf``. The gap generation model is a simple Poisson model with
    rate parameter ``indel_rate_per_site``.

    Args:
        t (float): The divergence of the ``tandem_repeat`` units.
        tandem_repeat (Repeat): An instance of the ``Repeat`` class.
        indel_rate_per_site (float): The mutation rate of insertions and
            deletions, relative to ``t``.
        gaps (str): The mode of gap counting. Options are "row_wise",
            "ignore_coherent_deletions", "ignore_trailing_gaps" and
            "ignore_trailing_gaps_and_coherent_deletions".
        indel_zipf (float): The parameter of the Zipfian distribution.

    Returns:
        float: The log-likelihood value for the gap structure of the tandem
        repeat.

    .. todo:: Use logs earlier, not only in the return statement.
    """
    insertions = tandem_repeat.insertions
    deletions = tandem_repeat.deletions[gaps]
    gaps = tandem_repeat.gaps[gaps]

    # Calculate the likelihood of the gap lengths, with insertions and
    # deletions combined. Here, we use the Zipfian distribution.
    l_length = scipy.special.zeta(indel_zipf, 1) ** -len(gaps)
    for gap in gaps:
        l_length = l_length * (gap ** -indel_zipf)

    # Calculate the likelihood of the ratio of columns where an insertion
    # has taken place, as opposed to the number of columns where no
    # insertion has taken place. Do the same for deletions.
    # Here, we use the Poisson model.
    probability_gap_per_site = t * indel_rate_per_site / 2
    l_insertions = scipy.stats.distributions.binom.pmf(
        len(insertions),
        tandem_repeat.l_effective * tandem_repeat.n + 1,
        probability_gap_per_site)
    l_deletions = scipy.stats.distributions.binom.pmf(
        len(deletions),
        tandem_repeat.l_effective * tandem_repeat.n,
        probability_gap_per_site)

    # Return the total likelihood of the tandem repeat's gap structure.
    return np.log(l_length * l_insertions * l_deletions)
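
# Illustrative sketch (not part of the original module; the helper name is
# hypothetical): the Zipfian gap length distribution used above. A gap of
# length k has probability k**(-indel_zipf) / zeta(indel_zipf); over all
# k >= 1 the probabilities sum to 1 by the definition of the Riemann zeta
# function.
def _example_zipf_gap_lengths(indel_zipf=1.75):
    norm = scipy.special.zeta(indel_zipf, 1)
    pmf = [k ** -indel_zipf / norm for k in range(1, 6)]
    return pmf  # decreasing probabilities for gap lengths 1..5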
def loglikelihood_substitutions_gaps(t, substitution_args, gap_args):
    return loglikelihood_substitution(t, *substitution_args) + \
        loglikelihood_gaps_starphylogeny_zipfian(t, *gap_args)


def loglikelihood_startopology_local(
        tandem_repeat,
        evolution_model=CONFIG['evolutionary_model'],
        gaps=CONFIG['indel']['gaps'],
        indel_rate_per_site=CONFIG['indel'].as_float('indel_rate_per_site'),
        parameters=False):

    if tandem_repeat.l_effective == 0:
        return 100, -100

    if parameters:
        Q, eq_freq, alphabet = parameters
    else:
        Q, eq_freq, alphabet = load_model(evolution_model)

    if gaps:
        divergence, loglikelihood_duplication_history = \
            optimisation(function=loglikelihood_substitutions_gaps,
                         args=[[Q, eq_freq, alphabet, tandem_repeat],
                               [tandem_repeat, indel_rate_per_site, gaps]])
    else:
        divergence, loglikelihood_duplication_history = \
            optimisation(function=loglikelihood_substitution,
                         args=[Q, eq_freq, alphabet, tandem_repeat])
    return divergence, loglikelihood_duplication_history


def loglikelihood_random(repeat,
                         evolution_model=CONFIG['evolutionary_model'],
                         parameters=False):
    if repeat.sequence_type == 'AA':
        if parameters:
            equilibrium_freq = parameters
        else:
            aarate_file_name = config_file(
                "data", "substitution_rate_matrices",
                evolution_model + ".dat")
            # Load the equilibrium frequencies.
            equilibrium_freq = load_equilibrium_freq(aarate_file_name)

        loglikelihoodrandom = 0.
        for aa, freq in equilibrium_freq.items():
            # np.log uses the natural logarithm.
            loglikelihoodrandom += \
                repeat.textD_standard_aa.count(aa) * np.log(float(freq))
    elif repeat.sequence_type == 'DNA':
        # CHECK THIS!
        # At the moment, we assume equal frequencies for each nucleotide
        # (cf. the Baseml models J69 or K80).
        # np.log uses the natural logarithm.
        loglikelihoodrandom = repeat.totD * np.log(0.25)
    return loglikelihoodrandom


def phylo_star_topology_local(tandem_repeat, evolution_model=False,
                              gaps=False, indel_rate_per_site=0.001,
                              parameters=False):
    if not evolution_model:
        if tandem_repeat.sequence_type == 'AA':
            evolution_model = 'lg'
        else:
            evolution_model = 'k80'

    divergence, logLH1 = loglikelihood_startopology_local(
        tandem_repeat=tandem_repeat,
        evolution_model=evolution_model,
        gaps=gaps,
        indel_rate_per_site=indel_rate_per_site,
        parameters=parameters)
    logLH0 = loglikelihood_random(repeat=tandem_repeat,
                                  evolution_model=evolution_model)
    return divergence, 2 * (logLH1 - logLH0)


# ############# TANDEM REPEAT SCORE CALIBRATION ###############################


def calc_score(
        repeats, result_path, result_file_name,
        scoreslist=CONFIG['score_calibration'].as_list('scoreslist'),
        save_calibration=CONFIG['score_calibration'].as_bool(
            'save_calibration'),
        precision=CONFIG['score_calibration'].as_int('precision')):

    sequence_type = repeats[0].sequence_type
    if sequence_type == 'AA':
        evolution_model = 'lg'
    else:
        evolution_model = 'k80'

    Q, eq_freq, alphabet = load_model(evolution_model=evolution_model)
    parameters = [Q, eq_freq, alphabet]

    if sequence_type == 'AA':
        aarate_file_name = config_file(
            "data", "substitution_rate_matrices", evolution_model + ".dat")
        # Load the equilibrium frequencies.
        equilibrium_freq = load_equilibrium_freq(aarate_file_name)
    else:
        equilibrium_freq = {i: 0.25 for i in ['A', 'C', 'G', 'T']}

    random = [loglikelihood_random(repeat=iRepeat,
                                   evolution_model=evolution_model,
                                   parameters=equilibrium_freq)
              for iRepeat in repeats]

    for i_score in scoreslist:
        if i_score == 'phylo':
            test_statistic = [
                loglikelihood_startopology_local(iRepeat, gaps=False,
                                                 parameters=parameters)[1]
                for iRepeat in repeats]
            test_statistic = [2 * (iT - iR) if iT != -1 else -1000
                              for iT, iR in zip(test_statistic, random)]
        elif i_score == 'phylo_gap01':
            test_statistic = [
                loglikelihood_startopology_local(iRepeat, gaps=True,
                                                 indel_rate_per_site=0.01,
                                                 parameters=parameters)[1]
                for iRepeat in repeats]
            test_statistic = [2 * (iT - iR) if iT != -1 else -1000
                              for iT, iR in zip(test_statistic, random)]
        elif i_score == 'phylo_gap001':
            test_statistic = [
                loglikelihood_startopology_local(iRepeat, gaps=True,
                                                 indel_rate_per_site=0.001,
                                                 parameters=parameters)[1]
                for iRepeat in repeats]
            test_statistic = [2 * (iT - iR) if iT != -1 else -1000
                              for iT, iR in zip(test_statistic, random)]
        elif i_score == 'phylo_paml':
            # Is loglikelihood_starTopology_paml defined anywhere?
            test_statistic = [loglikelihood_starTopology_paml(iRepeat)
                              for iRepeat in repeats]
            test_statistic = [2 * (iT - iR) if iT != -1 else -1000
                              for iT, iR in zip(test_statistic, random)]
        elif i_score == 'parsimony':
            test_statistic = [
                np.round_(mean_similarity(iRepeat, parsimony),
                          decimals=precision)
                if iRepeat.l_effective != 0 else -1 for iRepeat in repeats]
        elif i_score == 'pSim':
            test_statistic = [
                np.round_(mean_similarity(iRepeat, pSim),
                          decimals=precision)
                if iRepeat.l_effective != 0 else -1 for iRepeat in repeats]
        elif i_score == 'entropy':
            test_statistic = [
                np.round_(mean_similarity(iRepeat, entropy),
                          decimals=precision)
                if iRepeat.l_effective != 0 else -1 for iRepeat in repeats]

        if save_calibration:
            file_name = os.path.join(result_path, i_score, result_file_name)
            print(file_name)
            np.savez(file_name, np.sort(test_statistic))
        else:
            # Store the scores on the repeat instances, in the same
            # ``dScore`` attribute that calibrate_score() reads.
            if not hasattr(repeats[0], 'dScore'):
                for iR in repeats:
                    iR.dScore = {}
            for iR, iT in zip(repeats, test_statistic):
                iR.dScore[i_score] = iT


def calibrate_score(
        repeats, result_file_path,
        scoreslist=CONFIG['score_calibration'].as_list('scoreslist')):
    for i_score in scoreslist:
        test_statistic = np.sort([iRepeat.dScore[i_score]
                                  for iRepeat in repeats])
        np.savez(os.path.join('_'.join([result_file_path, i_score])),
                 test_statistic)


# ####### CALCULATE AND SAVE THE PDF OF SCORES (potentially deprecated) ######


def save_distribution(values, items, result_file_path, file_name, inverse):
    for iC in range(len(items)):
        val = np.array([i[iC] for i in values])
        result_val, inv_ndx = np.unique(val, return_inverse=True)
        result_p = np.bincount(inv_ndx)
        result_p = result_p / float(sum(result_p))
        # Are high values the best? If so, accumulate from the top.
        if inverse and items[iC] in ['phylo', 'pSim']:
            result_p = result_p[::-1]
            result_val = result_val[::-1]
        result_p = result_p.cumsum()
        if not os.path.isdir(os.path.join(result_file_path, items[iC])):
            os.makedirs(os.path.join(result_file_path, items[iC]))
        np.savez(os.path.join(result_file_path, items[iC], file_name),
                 cdf=result_p, value=result_val)
def calculate_pdf_scores(
        repeats, result_file_path, file_name,
        scoreslist=CONFIG['score_calibration'].as_list('scoreslist')):
    """ Calculate the probability distribution of scores. """
    # .. todo:: Can the next command be generalised for arbitrary
    #     classifiers?
    scores = [[repeat.score(scoreslist[0]), repeat.score(scoreslist[1])]
              for repeat in repeats]
    save_distribution(values=scores,
                      items=scoreslist,
                      result_file_path=result_file_path,
                      file_name=file_name,
                      inverse=True)