"""Utilities for repeat sequence operations."""
from typing import Dict, Tuple, Union
import pandas as pd
[docs]
def count_repeat_units(sequence: str, motif: str) -> int:
"""Return the longest contiguous run of `motif` in `sequence`.
The function looks for exact, non-overlapping copies of `motif` that occur
consecutively and returns the maximum number of such copies in any run.
This corresponds to how STR repeat counts are typically defined: the
length of the longest perfect contiguous block of the repeat unit.
Parameters
----------
sequence : str
DNA sequence to search.
motif : str
Repeat unit motif to count (e.g. 'A', 'CAG').
Returns
-------
int
Length of the longest contiguous run of `motif` in `sequence`.
Raises
------
ValueError
If `motif` is empty or if either argument is not a string.
Examples
--------
Perfect repeats
~~~~~~~~~~~~~~~
>>> count_repeat_units("CAGCAGCAG", "CAG")
3
Imperfect tail
~~~~~~~~~~~~~~
>>> count_repeat_units("CAGCAGCA", "CAG")
2
No repeats
~~~~~~~~~~
>>> count_repeat_units("ATCG", "CAG")
0
Homopolymer runs
~~~~~~~~~~~~~~~~
>>> count_repeat_units("ATAAAAA", "A")
5
>>> count_repeat_units("AAAATAAA", "A")
4 # longest contiguous run is 'AAAA'
Overlapping motifs
~~~~~~~~~~~~~~~~~~
'AAAA' with motif 'AA' contains two non-overlapping copies: 'AA' 'AA'
>>> count_repeat_units("AAAA", "AA")
2
"""
# Basic type and value validation
if not isinstance(sequence, str):
raise ValueError(f"'sequence' must be a string, got {type(sequence).__name__}")
if not isinstance(motif, str):
raise ValueError(f"'motif' must be a string, got {type(motif).__name__}")
if motif == "":
raise ValueError("'motif' must be a non-empty string")
mlen = len(motif)
slen = len(sequence)
if mlen > slen or slen == 0:
return 0
max_run = 0
i = 0
while i <= slen - mlen:
# Check if a run starts at position i
if sequence[i : i + mlen] == motif:
run_length = 0
j = i
# Count how many contiguous motif copies we have from position i
while j <= slen - mlen and sequence[j : j + mlen] == motif:
run_length += 1
j += mlen
if run_length > max_run:
max_run = run_length
# Skip past this entire run
i = j
else:
i += 1
return max_run
[docs]
def normalize_variant(pos: int, ref: str, alt: str) -> Tuple[int, str, str]:
"""
Locally normalize (pos, ref, alt) by trimming shared prefix/suffix.
- pos is 1-based VCF coordinate.
- Trimming is case-insensitive.
- We always keep at least 1 base in ref and alt if they differ.
Returns
-------
new_pos, new_ref, new_alt
"""
# early exit: identical → no-op
if ref.upper() == alt.upper():
return pos, ref, alt
r = ref
a = alt
# trim common prefix
while len(r) > 1 and len(a) > 1 and r[0].upper() == a[0].upper():
r = r[1:]
a = a[1:]
pos += 1
# trim common suffix
while len(r) > 1 and len(a) > 1 and r[-1].upper() == a[-1].upper():
r = r[:-1]
a = a[:-1]
return pos, r, a
[docs]
def apply_variant_to_repeat(
pos: int, ref: str, alt: str, repeat_start: int, repeat_seq: str
) -> str:
"""
Apply a variant to an STR repeat sequence.
The function applies a VCF variant to a reference STR sequence while
respecting VCF normalization rules and STR boundaries.
The algorithm works as follows:
1. Normalize ``pos``, ``ref``, and ``alt`` by trimming shared
prefixes and suffixes.
2. If the normalized variant lies fully inside the STR region,
apply the full ``ALT`` allele.
3. If the variant partially overlaps the STR region:
- **SNP-like** variants (``len(ref) == len(alt)``) are aligned
positionally.
- **Indel-like** variants (``len(ref) != len(alt)``) use the
suffix of ``ALT`` that overlaps the STR.
Any parts of the variant outside the STR window are ignored.
Notes
-----
The genomic reference is conceptually treated as::
repeat_seq + UNKNOWN_SUFFIX
Differences outside the STR window do not affect the resulting
mutated STR sequence.
Case handling rules:
- All matching and overlap logic is case-insensitive.
- Output case follows the overlapping STR segment:
- lowercase STR slice → lowercase ALT
- uppercase STR slice → uppercase ALT
- mixed case → ALT is left unchanged
Parameters
----------
pos : int
Variant position (1-based VCF coordinate).
ref : str
Reference allele from the VCF record.
alt : str
Alternate allele from the VCF record.
repeat_start : int
Start position of the STR region (1-based).
repeat_seq : str
Reference STR sequence from the panel.
Returns
-------
str
The mutated STR sequence after applying the variant.
If the variant does not overlap the STR region, the original
``repeat_seq`` is returned unchanged.
"""
repeat_len = len(repeat_seq)
# --- 1) Normalize variant locally ---
pos, ref, alt = normalize_variant(pos, ref, alt)
# After normalization, if ref == alt, nothing to do for the STR
if ref.upper() == alt.upper():
return repeat_seq
repeat_end = repeat_start + repeat_len - 1
var_start = pos
var_end = pos + len(ref) - 1 # inclusive
# --- 2) No overlap with STR ---
if var_end < repeat_start or var_start > repeat_end:
return repeat_seq
# --- 3) Variant fully inside STR: apply full ALT (insertions/deletions kept) ---
if var_start >= repeat_start and var_end <= repeat_end:
relative_pos = var_start - repeat_start # 0-based offset within repeat_seq
before = repeat_seq[:relative_pos]
after = repeat_seq[relative_pos + len(ref) :]
panel_slice = repeat_seq[relative_pos : relative_pos + len(ref)]
# Case-normalize ALT to match panel_slice
alt_adj = alt
if panel_slice.islower():
alt_adj = alt.lower()
elif panel_slice.isupper():
alt_adj = alt.upper()
return before + alt_adj + after
# --- 4) Partial overlap with STR (starts before and/or ends after) ---
# Compute overlap region in reference coordinates.
overlap_start = max(repeat_start, var_start)
overlap_end = min(repeat_end, var_end)
overlap_len = overlap_end - overlap_start + 1
# Position inside the STR where overlap begins (0-based)
relative_pos = overlap_start - repeat_start
# Extract ALT substring corresponding to overlapping bases.
if len(ref) == len(alt):
# SNP-like: align by position (same offset as ref)
i0 = overlap_start - var_start
alt_overlap_raw = alt[i0 : i0 + overlap_len]
else:
# Indel-like: align from the end of ALT so that the last base of ALT
# corresponds to the last overlapping base of REF.
# ALT shorter or equal: use entire ALT
# Use only the suffix of ALT that covers the overlapping segment
alt_overlap_raw = alt if overlap_len >= len(alt) else alt[-overlap_len:]
panel_slice = repeat_seq[relative_pos : relative_pos + overlap_len]
# Case-normalize ALT overlap to match panel_slice
if panel_slice.islower():
alt_overlap = alt_overlap_raw.lower()
elif panel_slice.isupper():
alt_overlap = alt_overlap_raw.upper()
else:
alt_overlap = alt_overlap_raw
before_mut = repeat_seq[:relative_pos]
after_mut = repeat_seq[relative_pos + overlap_len :]
mutated = before_mut + alt_overlap + after_mut
return mutated
[docs]
def is_perfect_repeat(sequence: str, motif: str) -> bool:
"""Check if sequence is a perfect repeat of the motif.
A perfect repeat means the sequence consists entirely of exact copies
of the motif with no interruptions or variations.
Parameters
----------
sequence : str
DNA sequence to check
motif : str
Repeat unit motif
Returns
-------
bool
True if sequence is a perfect repeat, False otherwise
Examples
--------
>>> is_perfect_repeat('CAGCAGCAG', 'CAG')
True
>>> is_perfect_repeat('CAGCAGCA', 'CAG')
False
"""
if not sequence or not motif:
return False
count = count_repeat_units(sequence, motif)
return sequence == motif * count