Source code for tral.repeat.repeat_align

# (C) 2015 Elke Schaper


"""
    :synopsis: Alignment of tandem repeat units.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb-sib.ch>
    .. moduleauthor:: Paulina Näf
"""

from tral import configuration
import logging
import os
import subprocess
import tempfile

log = logging.getLogger(__name__)

CONFIG_GENERAL = configuration.Configuration.instance().config
REPEAT_CONFIG = CONFIG_GENERAL["repeat"]


[docs]def realign_repeat(my_msa, realignment='mafft', sequence_type='AA', rate_distribution=REPEAT_CONFIG['castor_parameter']['rate_distribution'], user_path=None): """ Realignment of a repeat MSA using mafft or proPIP A good multiple sequence alignment (MSA) is an important description of a tandem repeat (TR) and can be crucial to distinguish a true positive from a false positive TR. This can be realised using a proper MSA estimation algorithm. Currently, Mafft and proPIP can be used to realign a TR. For proPIP the rate distribution for indels can be either constant or gamma distributed - REPEAT_CONFIG['castor_parameter']['rate_distribution'] == 'constant': no gamma distribution is used for the proPIP algorithm - REPEAT_CONFIG['castor_parameter']['rate_distribution'] == 'gamma': gamma distribution (n=6, alpha=0.5) Args: my_msa (list of strings): List of sequences (str) realignment (str): Either "mafft" or "proPIP" sequence_type (str): Either "AA" or "DNA" user_path (str): copy alignment files to user defined path Returns: my_msa realigned (list of strings) .. todo:: - decide what happens if branch length of tree is getting zero """ realignment_types = ['mafft', 'proPIP', None] if realignment not in realignment_types: raise ValueError("Invalid realignment option. Expected one of: {}".format(realignment_types)) if not all(isinstance(s, str) for s in my_msa): raise ValueError("Invalid MSA input.") if realignment == "proPIP": # remove columns that only contains gaps to avoid errors my_msa = remove_gaps(my_msa) # do try to realign if no gap is present in MSA if not any(['-' in s for s in my_msa]): print("\nproPIP needs at least one gap in the MSA to realign.\n" + "No realignment will be performed.\n") return my_msa if realignment is None: print("\nNo realignment was performed.") return my_msa # Create temporary working directory working_dir = tempfile.mkdtemp() log.debug("evolvedTR: Created temp directory: %s", working_dir) # Save my_TR to temp directory: msa_file = os.path.join(working_dir, 'initial_msa.faa') with open(msa_file, 'w') as msa_filehandle: for i, iMSA in enumerate(my_msa): msa_filehandle.write('>{0}\n{1}\n'.format(i + 1, iMSA)) if realignment == 'mafft': # Run Mafft # See http://mafft.cbrc.jp/alignment/software/manual/manual.html for choice of options. # The mafft result is in stdout. Check: Do you need to capture or # redirect the stderr? p = subprocess.Popen([REPEAT_CONFIG['ginsi'], "--anysymbol", "--quiet", msa_file], stdout=subprocess.PIPE) mafft_output = [line.decode('utf8').rstrip() for line in p.stdout] msa = [] label = [] for i, line in enumerate(mafft_output): if line[0] == '>': label.append(int(line[1:])) elif mafft_output[i - 1][0] == '>': msa.append(line) else: msa[-1] += line # use original order of msa msa_sorted = [x for _, x in sorted(zip(label, msa))] log.debug('\n'.join(msa_sorted)) p.wait() try: return msa except: error_note = ( "Mafft could not successfully run the realignment for: " "\n".join(my_msa)) logging.error(error_note) return None elif realignment == 'proPIP': # Run Castor (with integrated aligner) (https://github.com/acg-team/castor_aligner) # log messages of castor to stderr instead of logfiles os.environ["GLOG_logtostderr"] = "1" if sequence_type == "AA": alphabet = "Protein" substitution_model = "LG08" elif sequence_type == "DNA": alphabet = "DNA" substitution_model = "HKY85" else: raise ValueError("Sequence type is not known.") if rate_distribution == 'constant': rate_distribution = 'Constant' elif rate_distribution == 'gamma': rate_distribution = 'Gamma(n=6,alpha=0.5)' else: raise ValueError("Rate distribution parameter not known.") tree_initial = os.path.join(working_dir, "tree_initial.nwk") tree = os.path.join(working_dir, "tree.nwk") msa_realigned = os.path.join(working_dir, "msa_realigned.faa") paramsfile_tree = os.path.join(working_dir, 'params_tree.txt') paramsfile_alignment = os.path.join(working_dir, 'params_alignment.txt') estimates = os.path.join(working_dir, 'estimates.json') #################################### # Create an initial tree #################################### init_tree = "distance" # Castor cannot create trees for less than four sequences if len([1 for line in open(msa_file) if line.startswith(">")]) == 2: print("For two units which have to be aligned an arbritary starting tree will be given.") tree_string = "(1:0.1,2:0.1);" init_tree = "user" with open(tree_initial, 'w') as treefile: treefile.write(tree_string) # For three units an arbritary initial tree is used for the alignment if len([1 for line in open(msa_file) if line.startswith(">")]) == 3: print("For three units which have to be aligned an arbritary starting tree will be given.") tree_string = "((1:0.1,3:0.1):0.1,2:0.1);" init_tree = "user" with open(tree_initial, 'w') as treefile: treefile.write(tree_string) # For more than tree units an initial tree will be estimated from the previous alignment # create parameter file to create or optimize tree with castor parameters_tree = ["analysis_name=tree_optimization", "alphabet={}".format(alphabet), "alignment=false", "input.sequence.file={}".format(msa_file), "input.sequence.sites_to_use=all", "init.tree={}".format(init_tree), "input.tree.file={}".format(tree_initial), "init.distance.method=bionj", "model=PIP(model={}(initFreqs=observed),initFreqs=observed)".format(substitution_model), "rate_distribution=Constant", # creating tree alwas with constant rates to estimate lamda and mu from data "optimization=D-BFGS(derivatives=BFGS)", "optimization.max_number_f_eval=500", "optimization.tolerance=0.001", "optimization.final=bfgs", "optimization.topology=true", "optimization.topology.algorithm=Swap(coverage=nnr-search,starting_nodes=Hillclimbing(n=4),max_cycles=50,tolerance=0.01,brlen_optimisation=Brent,threads=10)", "output.tree.file={}".format(tree), "output.tree.format=Newick", "output.estimates.format=json", "output.estimates.file={}".format(estimates), "support=none"] try: with open(paramsfile_tree, 'w') as params: for parameter in parameters_tree: params.write(parameter + '\n') except: print("A problem occurred while trying to write alignment parameters to txt file") # run castor for tree initialization try: castor_tree_initialization = subprocess.Popen([REPEAT_CONFIG['Castor'], "params={}".format(paramsfile_tree)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) print("Tree initialization with Castor ....") castor_tree_initialization.wait() except FileNotFoundError: error_note = ( "ProPIP could not be reached.\n" + "Is Castor installed properly and is the path defined correctly in config.ini in the data directory?\n") logging.error(error_note) raise Exception("Sorry, creating a tree for the alignment went wrong.") ################### # Use proPIP algorithm to align TR units with inferred tree ################### # create file with unaligned sequences unaligned_sequences = os.path.join(working_dir, "sequences.faa") try: with open(msa_file, 'r') as infile, open(unaligned_sequences, 'w') as outfile: temp = infile.read().replace("-", "") outfile.write(temp) except: print("A problem occurred reading initial alignment file.") # parse estimates file to use calculated indel parameters try: import json with open(estimates) as estimates_file: est = json.load(estimates_file) mu_estimated = est['Model']['PIP']['mu'] lambda_estimated = est['Model']['PIP']['lambda'] indel_parameters = ',lambda={},mu={}'.format(lambda_estimated, mu_estimated) except FileNotFoundError as error: print('\nNo file with parameter estimates found.') print('Probably it went something wrong when trying to infer or optimise the phylogenetic tree.') logging.error(error) raise # TODO: proPIP cannot produce an alignment when a branchlength is zero. # Need to catch this before an error is thrown # Maybe replace zero with a very small number? # Is this scenario even possible after tree calculation with castor? # create parameter file for alignment with proPIP parameters_alignment = ["analysis_name=aligner", "model_description={}+PIP".format(substitution_model), "alphabet={}".format(alphabet), "alignment=true", "alignment.version=ram", "input.sequence.file={}".format(unaligned_sequences), "input.sequence.sites_to_use=all", "init.tree=user", "init.distance.method=bionj", "input.tree.file={}".format(tree), "model=PIP(model={}{})".format(substitution_model, indel_parameters), "rate_distribution={}".format(rate_distribution), "optimization=None", "output.msa.file={}".format(msa_realigned), "support=none"] try: with open(paramsfile_alignment, 'w') as params: for parameter in parameters_alignment: params.write(parameter + '\n') except: print("A problem occurred while trying to write alignment parameters to txt file") # run alignment with propip try: proPIP_alignment = subprocess.Popen([REPEAT_CONFIG['Castor'], "params={}".format(paramsfile_alignment)], stdout=subprocess.PIPE, stderr=subprocess.PIPE) print("Realignment of MSA with proPIP .... ") proPIP_alignment.wait() except FileNotFoundError: error_note = ( "ProPIP algorithm could not be reached.\n" + "Is Castor (inkl. proPIP aligner) installed properly and is the path defined in config.ini in the data directory?\n") logging.error(error_note) return # read msa and return as sorted list try: # The created alignment file has "initial" included into the name because in future the tool should be able to realign with open(os.path.join(working_dir, "msa_realigned.initial.faa"), "r") as f: realigned = f.readlines() except FileNotFoundError: try: with open(msa_realigned, "r") as f: realigned = f.readlines() except FileNotFoundError: error_note = ( "ProPIP could not successfully be used for the realignment of:\n" + "\n".join(my_msa)) logging.error(error_note) return msa = [] label = [] for i, line in enumerate(realigned): if line[0] == '>': label.append(int(line[1:-1])) elif realigned[i - 1][0] == '>': msa.append(line[:-1]) else: msa[-1] += line[:-1] # use original order of msa msa_sorted = [x for _, x in sorted(zip(label, msa))] log.debug('\n'.join(msa_sorted)) ## copy alignment files to user defined path if user_path: if not os.path.exists(os.path.dirname(user_path)): os.makedirs(os.path.dirname(user_path)) import shutil mafft_path = user_path.split("proPIP")[0] + "mafft" with open(user_path, 'w') as out_msa: for i in range(len(msa_sorted)): out_msa.write(">{}\n".format(i + 1)) out_msa.write("{}\n".format(msa_sorted[i])) shutil.copy(msa_file, mafft_path) try: return msa_sorted except: raise RuntimeError( "ProPIP could not successfully be used for the realignment of:\n" + "\n".join(my_msa)) else: raise ValueError( 'Currently, the aligner {} is not implemented.'.format(realignment))
[docs]def remove_char(str, n): """ helper function to remove a character in a sequence """ first_part = str[:n] last_part = str[n + 1:] return first_part + last_part
[docs]def remove_gaps(msa): """ Remove columns in a MSA which contain only gaps. Do not use MSA longer than 3000 characters. Returns the MSA as a list without only-gap-containing columns. Args: msa (list of strings): List of sequences (str) Returns: msa (list of strings) with removed colums in case they only contain gaps """ if not all(isinstance(s, str) for s in msa): raise ValueError("Invalid MSA input.") i = 3000 # The MSA should have at maximum 3000 characters if not all(len(s) < i for s in msa): raise ValueError("Length of sequences in MSA need to be below {}.".format(i)) while i >= len(msa[0]): i -= 1 while i <= len(msa[0]) - 1 and not i < 0: column = [] for seq in msa: column.append(seq[i]) # print(column) if all([char == '-' for char in column]): col = 0 for seq in msa: seq = remove_char(seq, i) msa[col] = seq col += 1 i -= 1 return msa