Source code for tral.repeat.repeat_io

# (C) 2015 Elke Schaper

"""
    :synopsis: Input/output for tandem repeats.

    .. moduleauthor:: Elke Schaper <elke.schaper@isb-sib.ch>
"""

import Bio.Seq
import random
import logging
import itertools
import os
import re
import shutil
import tempfile
import subprocess
import numpy as np
from tral.repeat import repeat
from tral import configuration

from tral.paths import config_file
CONFIG_GENERAL = configuration.Configuration.instance().config
REPEAT_CONFIG = CONFIG_GENERAL["repeat"]

LOG = logging.getLogger(__name__)


###################### SAVE REPEAT #######################################

def save_repeat_fasta(tandem_repeats, file):
    """ Save multiple <tandem_repeats> in FASTA format in the specified <file>.

    Currently, only one TR per sequence can be defined, as the identifiers in
    the dict <tandem_repeats> must be unique.

    Parameters: Dict of tandem repeats and identifiers.
        e.g. {'ENSP00012': msa1, 'ENSP00013': msa2}

    FASTA format example::

        >ID
        GHKI
        GHKI
        GH--
    """

    with open(file, 'w', newline='\n') as f:
        for identifier, msa in tandem_repeats.items():
            f.write(">{0}\n".format(identifier))
            f.write("\n".join(msa) + "\n\n")
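

# Usage sketch (not part of the original module): illustrates the expected
# input for save_repeat_fasta. Identifiers, repeat units and the output file
# name are made up for the example.
def _example_save_repeat_fasta(outfile="repeats.faa"):
    """Write two toy tandem repeat MSAs to <outfile> in FASTA format."""
    tandem_repeats = {
        'ENSP00012': ["GHKI", "GHKI", "GH--"],
        'ENSP00013': ["UIR", "UIR", "UIR"],
    }
    save_repeat_fasta(tandem_repeats, outfile)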


def save_repeat_stockholm(tandem_repeat, file):
    """ Save <tandem_repeat> in STOCKHOLM format in the specified <file>.

    Parameters: Tandem repeat MSA.
        e.g. ["ACDEF-", "ACCDEF"]

    More information in
    ftp://selab.janelia.org/pub/software/hmmer3/3.0/Userguide.pdf

    STOCKHOLM format example::

        # STOCKHOLM 1.0
        seq1 ACDEF...GHIKL
        seq2 ACDEF...GHIKL
        seq3 ...EFMNRGHIKL

        seq1 MNPQTVWY
        seq2 MNPQTVWY
        seq3 MNPQT...
        //
    """

    with open(file, 'w', newline='\n') as fh:
        fh.write("# STOCKHOLM 1.0\n")
        for i, iMSA in enumerate(tandem_repeat):
            fh.write("{} {}\n".format(str(i), iMSA))
        fh.write("//")
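

# Usage sketch (not part of the original module): a single tandem repeat MSA
# is written as one STOCKHOLM block; the unit indices are used as sequence
# names. The output file name is made up.
def _example_save_repeat_stockholm(outfile="repeat.sto"):
    """Write a toy tandem repeat MSA to <outfile> in STOCKHOLM format."""
    save_repeat_stockholm(["ACDEF-", "ACCDEF"], outfile)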


def save_repeat_treks(tandem_repeats, file):
    """ Save multiple <tandem_repeats> in T-REKS format in the specified <file>.

    Currently, only one TR per sequence can be defined, as the identifiers in
    the dict <tandem_repeats> must be unique.

    Parameters: Dict of tandem repeats and identifiers.
        e.g. {'ENSP00012': [msa1, begin1], 'ENSP00013': [msa2, begin2]}

    T-REKS format example::

        >a
        Length: 3 residues - nb: 3 from 1 to 10 - Psim:0.8076923076923077 region Length:42
        GHKI
        GHKI
        GH--
        **********************

        Length: 3 residues - nb: 2 from 21 to 27 - Psim:0.7857142857142857 region Length:22
        GHKI
        GH--
        **********************

        >b
        Length: 3 residues - nb: 3 from 1 to 10 - Psim:0.8095238095238095 region Length:20
        UIR
        UIR
        UIR
    """

    with open(file, 'w', newline='\n') as fh:
        for identifier, info in tandem_repeats.items():
            fh.write(">{0}\n".format(identifier))
            fh.write("Length: from {0} to\n".format(str(info[1])))
            fh.write("\n".join(info[0]) + "\n" + "*" * 22 + "\n\n")
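

# Usage sketch (not part of the original module): each dict value is a pair of
# repeat unit MSA and begin position, as described in the docstring above.
# Identifiers, units and the output file name are made up.
def _example_save_repeat_treks(outfile="repeats.treks"):
    """Write two toy tandem repeats to <outfile> in T-REKS format."""
    tandem_repeats = {
        'a': [["GHKI", "GHKI", "GH--"], 1],
        'b': [["UIR", "UIR", "UIR"], 21],
    }
    save_repeat_treks(tandem_repeats, outfile)
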
############################## READ REPEAT SEQUENCE ######################
def read_fasta(seq_filename, sequence_type='AA'):
    """ Read repeats from a file in FASTA format.

    Args:
        seq_filename (str): Path to the FASTA file containing the repeats.
        sequence_type (str): Either "AA" or "DNA".

    Returns:
        (generator): For every repeat in the file, yields a tuple of the
        repeat unit MSA (list of str) and ``sequence_type``.
    """

    pat_start = re.compile(r">(.*)")
    pat_repeat_unit = re.compile(r"([\w\.\-]+)")

    # Our possible parser states:
    #
    # 1: searching for sequence name
    # 2: searching for repeat units

    repeats = {}
    state = 1
    with open(seq_filename, "rt") as infile:
        for i, line in enumerate(infile):
            LOG.debug("Line {0}: {1}".format(i, line[0:-1]))
            if 1 == state:
                match = pat_start.match(line)
                if match:
                    LOG.debug(" * (1->2) Found start")
                    LOG.debug("Start: %s", match.group(1))
                    name = match.group(1)
                    repeats[name] = []
                    state = 2
            elif 2 == state:
                match = pat_repeat_unit.match(line)
                if match:
                    LOG.debug(" * (2->2) Found repeat unit")
                    LOG.debug("Repeat unit: %s", match.group(1))
                    repeat_unit = match.group(1)
                    repeats[name].append(repeat_unit.replace(".", "-").upper())
                else:
                    LOG.debug(" * (2->1) Found no further repeat unit")
                    state = 1

    for i_name, iR in repeats.items():
        yield iR, sequence_type
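

# Usage sketch (not part of the original module): read_fasta is a generator,
# so the repeats are materialised with a list comprehension here. The input
# file name is made up.
def _example_read_fasta(infile="repeats.faa"):
    """Return all repeat unit MSAs found in <infile> as a list."""
    return [msa for msa, sequence_type in read_fasta(infile, sequence_type='AA')]
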
################################### SIMULATE SEQUENCE ####################
def evolved_tandem_repeats(l, n, n_samples, sequence_type, job_id='job_id',
                           mutation_rate=50, tree='star',
                           indel_rate_per_site=False, return_type='repeat'):
    """ Simulate evolved sequences with ALF.

    Simulate evolved sequences with ALF:
    Dalquen, D. A., Anisimova, M., Gonnet, G. H. & Dessimoz, C.
    ALF--a simulation framework for genome evolution.
    Molecular Biology and Evolution 29, 1115-1123 (2012).

    Args:
        l (int): The length of the repeat unit.
        n (int): The number of repeat units in the tandem repeat.
        n_samples (int): The number of samples.
        sequence_type (str): Either "AA" or "DNA".
        job_id (str): A tag for files produced with ALF, and result files.
        mutation_rate (float): The mutation rate.
        tree (str): The type of tree, e.g. "star" or "birthdeath".
        indel_rate_per_site (int or False): The indel rate per site.
        return_type (str): Either "repeat" or "list".

    Returns:
        Return type depends on ``return_type``: ``Repeat`` or ``Bio.Seq.Seq``
        instances.
    """

    runfile_template = config_file("data", "ALF", "template.drw")
    alf_exec = REPEAT_CONFIG['alfsim']

    # Create a temporary working directory.
    working_dir = tempfile.mkdtemp()
    LOG.debug("evolvedTR: Created tempfile: %s", working_dir)
    if not os.path.isdir(working_dir):
        os.makedirs(working_dir)

    # Copy the template file and append job specific info.
    runfilename = "alf.drw"
    shutil.copyfile(runfile_template, os.path.join(working_dir, runfilename))

    with open(os.path.join(working_dir, runfilename), "a") as runfile:
        if indel_rate_per_site:
            # Insertion rate per site and PAM. E.g. for PAM=40 expect
            # aaGainRate*40 insertions per site.
            runfile.write(
                "aaGainRate := " +
                str(indel_rate_per_site / mutation_rate) + ";\n")
            runfile.write(
                "aaLossRate := " +
                str(indel_rate_per_site / mutation_rate) + ";\n")
            runfile.write("maxIndelLength := 50;\n")
            runfile.write("indelModel := 'ZIPF';\n")
            runfile.write("Z_c := 1.821;\n")
            runfile.write("DawgPlacement := true;\n")
        runfile.write("uuid := '" + job_id + "';\n")
        runfile.write("mname := '" + job_id + "';\n")
        runfile.write("protStart := " + str(n_samples) + ";\n")
        runfile.write("NSpecies := " + str(n) + ";\n")
        runfile.write("minGeneLength := " + str(l) + ";\n")
        runfile.write("mutRate := " + str(mutation_rate) + ";\n")
        runfile.write("wdir := '" + working_dir + "';\n")

        tree_length = n * mutation_rate

        # Parameters concerning the species tree.
        if tree == 'star':
            # BDTree, ToLSample, Custom
            runfile.write("treeType := 'Custom':\n")
            runfile.write(
                "tree := MakeStarTree(BirthDeathTree(0.1,0.1," +
                str(n) + ",10),mutRate):\n")
            runfile.write("treeLength:= %d ;\n" % tree_length)
        else:
            # BDTree, ToLSample, Custom
            runfile.write("treeType := 'BDTree':\n")
            # From the last discussion with Daniel, only the ratio of birth to
            # death rate matters, if we scale the tree to match the PAM
            # distance.
            runfile.write("birthRate := 0.01:\n")
            runfile.write("deathRate := 0.01:\n")
            # For BDTree: should the resulting tree be ultrametric, i.e. all
            # leaves have the same distance to the origin?
            runfile.write("ultrametric := false:\n")
            runfile.write("treeLength:= %d ;\n" % tree_length)
            # DANIEL: Is a tree := missing?
            if tree not in {'star', 'birthdeath'}:
                LOG.warning(
                    "evolvedTR: tree input %s not known, assuming birthdeath"
                    " tree", tree)

        # Parameters concerning the substitution models.
        if sequence_type == 'AA':
            runfile.write(
                "substModels := [SubstitutionModel('CustomP', ['" +
                config_file('data', 'ALF', 'lg.dat') + "'])];\n")
            # CHECK! DOES
            # substModels := [SubstitutionModel('LG')];
            # WORK?
        elif sequence_type == 'DNA':
            runfile.write(
                "substModels := [SubstitutionModel('TN93', [.3, .4, .7],"
                "[seq(0.25,4)], true)]:\n")
            # CHECK! DO THE EMPIRICAL PARAMETERS MAKE SENSE AT ALL?
            # Compare to http://people.inf.ethz.ch/ddalquen/alf/ALF_manual.pdf
        else:  # CODONS
            runfile.write("substModels := [SubstitutionModel('CPAM')];\n")

    # Determine the ALF MSA output file path.
    if sequence_type == 'AA':
        infile = os.path.join(working_dir, job_id, "MSA", "MSA_all_aa.phy")
    elif sequence_type == 'DNA':
        infile = os.path.join(working_dir, job_id, "MSA", "MSA_all_dna.phy")
    else:  # CODONS
        infile = os.path.join(
            working_dir, job_id, "MSA", "MSA_all_codon.phy")

    for i in range(10):
        with open(os.path.join(working_dir, "out.txt"), "w") as outfile:
            alf_process = subprocess.Popen([alf_exec, runfilename],
                                           cwd=working_dir,
                                           stdout=outfile, stderr=outfile,
                                           close_fds=True)
            alf_process.wait()
        if os.path.isfile(infile):
            break
    else:
        LOG.error('ALFSIM was not able to produce a simulated sequence.')
        LOG.error('Do you have ALFSIM installed, and did you use the correct'
                  ' path in the configuration file?')
        return

    # shutil.rmtree('/cluster/home/infk/eschaper/spielwiese/')
    # shutil.copytree(working_dir, '/cluster/home/infk/eschaper/spielwiese/')

    # Read in the MSA data from ALF. The following is a short parser for
    # ALFsim output files yielding MSAs as lists of strings, based on a
    # special flavour of Felsenstein MSA files.

    # Find a repeat unit.
    pattern_start = re.compile(r"\d+ \d+")
    pattern_seq = re.compile(r"\S+[ ]+([A-Z\-]+)")

    # Our possible parser states:
    #
    # state 1: Find the beginning of the MSA (Felsenstein header)
    # state 2: Find all repeat units
    #
    # protein ::=
    #   \d \d
    #   \S/\S repeatunits

    state = 1
    with open(infile, "r") as infile:
        for i, line in enumerate(infile):
            LOG.debug("Line %d: %s", i, line[0:-1])
            if 1 == state:
                # Find the first repeat unit & save the begin.
                search = pattern_start.search(line)
                if search:
                    LOG.debug(" *(1->2) Found MSA start")
                    state = 2
                    msa = []
            elif 2 == state:
                # Find all repeat units.
                search = pattern_seq.search(line)
                if search:
                    LOG.debug(" *(2->2) Found another repeat unit")
                    msa.append(search.group(1))
                else:
                    LOG.debug(" *(2->1) Repeat region finished, yielding.")
                    state = 1
                    # Yield only if we have found at least two repeat units.
                    if len(msa) > 1:
                        if return_type == 'repeat':
                            yield repeat.Repeat(begin=0, msa=msa,
                                                sequence_type=sequence_type)
                        elif return_type == 'list':
                            yield msa
                        else:
                            LOG.debug("YIELD: %s",
                                      "".join(msa).replace('-', ''))
                            yield Bio.Seq.Seq("".join(msa).replace('-', ''),
                                              sequence_type)

    # Delete the temporary directory.
    try:
        shutil.rmtree(working_dir)
    except OSError:
        pass
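

# Usage sketch (not part of the original module): evolved_tandem_repeats is a
# generator and requires ALF (alfsim) to be installed and configured under
# REPEAT_CONFIG['alfsim']; the parameter values below are illustrative only.
def _example_evolved_tandem_repeats():
    """Simulate ten amino acid tandem repeats with 5 units of length 20."""
    simulated = evolved_tandem_repeats(
        l=20, n=5, n_samples=10, sequence_type='AA',
        mutation_rate=50, tree='star', return_type='repeat')
    return list(simulated)
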
def random_sequence(n_samples, sequence_type='AA', return_type='repeat',
                    equilibrium_frequencies='human', l=0, n=0,
                    sequence_length=0):
    """ Simulate random sequence locally.

    Args:
        n_samples (int): The number of samples.
        sequence_type (str): Either "AA" or "DNA".
        return_type (str): Either "repeat" or "list".
        equilibrium_frequencies (str): Currently, only the "human" option is
            available.
        l (int): The length of the repeat unit.
        n (int): The number of repeat units in the tandem repeat.
        sequence_length (int): The total length of the simulated sequence.

    Returns:
        Return type depends on ``return_type``.
    """

    if sequence_length == 0 and (l == 0 or n == 0):
        LOG.error('The specified sequence_length or the product of l and n'
                  ' was set to 0 for the random_sequence simulation.')
    else:
        if sequence_length == 0:
            sequence_length = l * n
        # assert equilibrium_frequencies == 'human'
        file = config_file('data', 'Random', "_".join(
            [sequence_type, equilibrium_frequencies, '3']) + '.txt')
        with open(file, 'r') as f:
            a = f.readline()[:-1]
        b = [i.split(' ') for i in a.split('  ')]
        frequencies = {i[0]: int(i[1]) for i in b}
        alphabet = np.unique([i[0] for i in frequencies.keys()])

        for _ in range(n_samples):
            # Draw the first three letters (the seed) from the trimer
            # frequencies.
            seed_int = random.randint(1, sum(frequencies.values()))
            for key, value in frequencies.items():
                seed_int -= value
                if seed_int <= 0:
                    seed = key
                    break

            dimer = [''.join(i)
                     for i in itertools.product(alphabet, repeat=2)]
            third_letter_frequencies = {
                iD: {iA: frequencies[iD + iA] for iA in alphabet}
                for iD in dimer}

            # Extend the seed letter by letter, conditioning each new letter
            # on the preceding two letters.
            sequence = seed
            for _ in range(sequence_length - 3):
                next_int = random.randint(
                    1, sum(third_letter_frequencies[sequence[-2:]].values()))
                for key, value in third_letter_frequencies[
                        sequence[-2:]].items():
                    next_int -= value
                    if next_int <= 0:
                        sequence += key
                        break

            if return_type == 'repeat' and l != 0 and n != 0:
                # Return the repeat unit MSA and the sequence type.
                yield [sequence[i * l:(i + 1) * l]
                       for i in range(n)], sequence_type
            elif return_type == 'list':
                yield sequence
            else:
                # Return Bio.Seq.Seq instances.
                yield Bio.Seq.Seq(sequence, sequence_type)
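

# Usage sketch (not part of the original module): with return_type='repeat'
# the generator yields (repeat unit list, sequence_type) tuples. It relies on
# the trimer frequency files bundled under TRAL's 'data/Random' directory;
# all parameter values below are illustrative only.
def _example_random_sequence():
    """Simulate five random amino acid tandem repeats with 4 units of length 10."""
    return [msa for msa, sequence_type in random_sequence(
        n_samples=5, sequence_type='AA', return_type='repeat',
        equilibrium_frequencies='human', l=10, n=4)]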