Source code for strvcf_annotator.core.repeat_utils

"""Utilities for repeat sequence operations."""

from typing import Dict, Tuple, Union

import pandas as pd


[docs] def extract_repeat_sequence(str_row: Union[Dict, pd.Series]) -> str: """Reconstruct repeat sequence from STR metadata. Generates the full repeat sequence by repeating the repeat unit (RU) the calculated number of times (COUNT). Parameters ---------- str_row : Dict or pd.Series STR region data containing 'RU' (repeat unit) and 'COUNT' (number of repeats) Returns ------- str Full repeat sequence Examples -------- >>> str_row = {'RU': 'CAG', 'COUNT': 5} >>> extract_repeat_sequence(str_row) 'CAGCAGCAGCAGCAG' """ return str_row["RU"] * int(str_row["COUNT"])
[docs] def count_repeat_units(sequence: str, motif: str) -> int: """Return the longest contiguous run of `motif` in `sequence`. The function looks for exact, non-overlapping copies of `motif` that occur consecutively and returns the maximum number of such copies in any run. This corresponds to how STR repeat counts are typically defined: the length of the longest perfect contiguous block of the repeat unit. Parameters ---------- sequence : str DNA sequence to search. motif : str Repeat unit motif to count (e.g. 'A', 'CAG'). Returns ------- int Length of the longest contiguous run of `motif` in `sequence`. Raises ------ ValueError If `motif` is empty or if either argument is not a string. Examples -------- Perfect repeats ~~~~~~~~~~~~~~~ >>> count_repeat_units("CAGCAGCAG", "CAG") 3 Imperfect tail ~~~~~~~~~~~~~~ >>> count_repeat_units("CAGCAGCA", "CAG") 2 No repeats ~~~~~~~~~~ >>> count_repeat_units("ATCG", "CAG") 0 Homopolymer runs ~~~~~~~~~~~~~~~~ >>> count_repeat_units("ATAAAAA", "A") 5 >>> count_repeat_units("AAAATAAA", "A") 4 # longest contiguous run is 'AAAA' Overlapping motifs ~~~~~~~~~~~~~~~~~~ 'AAAA' with motif 'AA' contains two non-overlapping copies: 'AA' 'AA' >>> count_repeat_units("AAAA", "AA") 2 """ # Basic type and value validation if not isinstance(sequence, str): raise ValueError(f"'sequence' must be a string, got {type(sequence).__name__}") if not isinstance(motif, str): raise ValueError(f"'motif' must be a string, got {type(motif).__name__}") if motif == "": raise ValueError("'motif' must be a non-empty string") mlen = len(motif) slen = len(sequence) if mlen > slen or slen == 0: return 0 max_run = 0 i = 0 while i <= slen - mlen: # Check if a run starts at position i if sequence[i : i + mlen] == motif: run_length = 0 j = i # Count how many contiguous motif copies we have from position i while j <= slen - mlen and sequence[j : j + mlen] == motif: run_length += 1 j += mlen if run_length > max_run: max_run = run_length # Skip past this entire run i = j else: i += 1 return max_run
[docs] def normalize_variant(pos: int, ref: str, alt: str) -> Tuple[int, str, str]: """ Locally normalize (pos, ref, alt) by trimming shared prefix/suffix. - pos is 1-based VCF coordinate. - Trimming is case-insensitive. - We always keep at least 1 base in ref and alt if they differ. Returns ------- new_pos, new_ref, new_alt """ # early exit: identical → no-op if ref.upper() == alt.upper(): return pos, ref, alt r = ref a = alt # trim common prefix while len(r) > 1 and len(a) > 1 and r[0].upper() == a[0].upper(): r = r[1:] a = a[1:] pos += 1 # trim common suffix while len(r) > 1 and len(a) > 1 and r[-1].upper() == a[-1].upper(): r = r[:-1] a = a[:-1] return pos, r, a
[docs] def apply_variant_to_repeat( pos: int, ref: str, alt: str, repeat_start: int, repeat_seq: str ) -> str: """ Apply a variant to an STR repeat sequence. The function applies a VCF variant to a reference STR sequence while respecting VCF normalization rules and STR boundaries. The algorithm works as follows: 1. Normalize ``pos``, ``ref``, and ``alt`` by trimming shared prefixes and suffixes. 2. If the normalized variant lies fully inside the STR region, apply the full ``ALT`` allele. 3. If the variant partially overlaps the STR region: - **SNP-like** variants (``len(ref) == len(alt)``) are aligned positionally. - **Indel-like** variants (``len(ref) != len(alt)``) use the suffix of ``ALT`` that overlaps the STR. Any parts of the variant outside the STR window are ignored. Notes ----- The genomic reference is conceptually treated as:: repeat_seq + UNKNOWN_SUFFIX Differences outside the STR window do not affect the resulting mutated STR sequence. Case handling rules: - All matching and overlap logic is case-insensitive. - Output case follows the overlapping STR segment: - lowercase STR slice → lowercase ALT - uppercase STR slice → uppercase ALT - mixed case → ALT is left unchanged Parameters ---------- pos : int Variant position (1-based VCF coordinate). ref : str Reference allele from the VCF record. alt : str Alternate allele from the VCF record. repeat_start : int Start position of the STR region (1-based). repeat_seq : str Reference STR sequence from the panel. Returns ------- str The mutated STR sequence after applying the variant. If the variant does not overlap the STR region, the original ``repeat_seq`` is returned unchanged. """ repeat_len = len(repeat_seq) # --- 1) Normalize variant locally --- pos, ref, alt = normalize_variant(pos, ref, alt) # After normalization, if ref == alt, nothing to do for the STR if ref.upper() == alt.upper(): return repeat_seq repeat_end = repeat_start + repeat_len - 1 var_start = pos var_end = pos + len(ref) - 1 # inclusive # --- 2) No overlap with STR --- if var_end < repeat_start or var_start > repeat_end: return repeat_seq # --- 3) Variant fully inside STR: apply full ALT (insertions/deletions kept) --- if var_start >= repeat_start and var_end <= repeat_end: relative_pos = var_start - repeat_start # 0-based offset within repeat_seq before = repeat_seq[:relative_pos] after = repeat_seq[relative_pos + len(ref) :] panel_slice = repeat_seq[relative_pos : relative_pos + len(ref)] # Case-normalize ALT to match panel_slice alt_adj = alt if panel_slice.islower(): alt_adj = alt.lower() elif panel_slice.isupper(): alt_adj = alt.upper() return before + alt_adj + after # --- 4) Partial overlap with STR (starts before and/or ends after) --- # Compute overlap region in reference coordinates. overlap_start = max(repeat_start, var_start) overlap_end = min(repeat_end, var_end) overlap_len = overlap_end - overlap_start + 1 # Position inside the STR where overlap begins (0-based) relative_pos = overlap_start - repeat_start # Extract ALT substring corresponding to overlapping bases. if len(ref) == len(alt): # SNP-like: align by position (same offset as ref) i0 = overlap_start - var_start alt_overlap_raw = alt[i0 : i0 + overlap_len] else: # Indel-like: align from the end of ALT so that the last base of ALT # corresponds to the last overlapping base of REF. # ALT shorter or equal: use entire ALT # Use only the suffix of ALT that covers the overlapping segment alt_overlap_raw = alt if overlap_len >= len(alt) else alt[-overlap_len:] panel_slice = repeat_seq[relative_pos : relative_pos + overlap_len] # Case-normalize ALT overlap to match panel_slice if panel_slice.islower(): alt_overlap = alt_overlap_raw.lower() elif panel_slice.isupper(): alt_overlap = alt_overlap_raw.upper() else: alt_overlap = alt_overlap_raw before_mut = repeat_seq[:relative_pos] after_mut = repeat_seq[relative_pos + overlap_len :] mutated = before_mut + alt_overlap + after_mut return mutated
[docs] def is_perfect_repeat(sequence: str, motif: str) -> bool: """Check if sequence is a perfect repeat of the motif. A perfect repeat means the sequence consists entirely of exact copies of the motif with no interruptions or variations. Parameters ---------- sequence : str DNA sequence to check motif : str Repeat unit motif Returns ------- bool True if sequence is a perfect repeat, False otherwise Examples -------- >>> is_perfect_repeat('CAGCAGCAG', 'CAG') True >>> is_perfect_repeat('CAGCAGCA', 'CAG') False """ if not sequence or not motif: return False count = count_repeat_units(sequence, motif) return sequence == motif * count