# Source code for simvx.core.ui.syntax_highlighter

"""Syntax highlighting tokenizer for Python source code.

Provides line-by-line tokenization with multi-line string state tracking,
token caching, and incremental cache invalidation.
"""

from __future__ import annotations

import re

# ============================================================================
# Token sets
# ============================================================================

_KEYWORDS = frozenset(
    {
        "def",
        "class",
        "if",
        "else",
        "elif",
        "for",
        "while",
        "return",
        "import",
        "from",
        "as",
        "try",
        "except",
        "finally",
        "raise",
        "with",
        "yield",
        "lambda",
        "pass",
        "break",
        "continue",
        "and",
        "or",
        "not",
        "in",
        "is",
        "None",
        "True",
        "False",
        "global",
        "nonlocal",
        "assert",
        "del",
        "async",
        "await",
    }
)

_BUILTINS = frozenset(
    {
        "print",
        "len",
        "range",
        "str",
        "int",
        "float",
        "list",
        "dict",
        "tuple",
        "set",
        "type",
        "isinstance",
        "hasattr",
        "getattr",
        "setattr",
        "super",
        "self",
        "cls",
    }
)

_BRACKET_PAIRS = {"(": ")", "[": "]", "{": "}"}
_CLOSE_TO_OPEN = {v: k for k, v in _BRACKET_PAIRS.items()}
_ALL_BRACKETS = frozenset(_BRACKET_PAIRS.keys()) | frozenset(_BRACKET_PAIRS.values())

# Regex for number literals (int and float, including scientific notation)
_NUMBER_RE = re.compile(
    r"(?<![a-zA-Z_])"
    r"(?:"
    r"0[xX][0-9a-fA-F_]+"  # hex
    r"|0[oO][0-7_]+"  # octal
    r"|0[bB][01_]+"  # binary
    r"|\d[\d_]*\.[\d_]*(?:[eE][+-]?\d+)?"  # float with dot
    r"|\.[\d_]+(?:[eE][+-]?\d+)?"  # float starting with dot
    r"|\d[\d_]*[eE][+-]?\d+"  # float scientific only
    r"|\d[\d_]*"  # integer
    r")"
    r"[jJ]?"  # complex suffix
)


class SyntaxHighlighterMixin:
    """Mixin providing Python syntax tokenization and caching.

    Expects the host class to have:

    - ``_lines: list[str]`` -- buffer contents, one string per line.
    - ``syntax_highlighting: bool`` -- master enable flag.
    - ``_cursor_line: int`` / ``_cursor_col: int`` -- cursor position; used
      only by :meth:`_find_matching_bracket`.
    """

    def _init_syntax_highlighter(self):
        """Initialise tokenizer caches. Call from ``__init__`` before text is set."""
        # Per-line token cache: line index -> (hash of line text, token list).
        self._token_cache: dict[int, tuple[int, list[tuple[str, str]]]] = {}
        self._multiline_string_lines: set[int] = set()
        # Parallel to self._lines: (in_multiline, quote) at each line's start.
        # None means "rebuild on next access".
        self._ml_states_cache: list[tuple[bool, str]] | None = None

    # ================================================================
    # Tokenizer
    # ================================================================

    def _tokenize_line(
        self, line: str, in_multiline_string: bool = False, multiline_quote: str = ""
    ) -> list[tuple[str, str]]:
        """Tokenize a single line into (text, token_type) pairs.

        Args:
            line: The source line to tokenize.
            in_multiline_string: Whether we are inside a triple-quoted string
                from a previous line.
            multiline_quote: The quote style of the open triple-quote
                ('\"\"\"' or \"'''\").

        Returns:
            List of (token_text, token_type) tuples covering the full line.
        """
        tokens: list[tuple[str, str]] = []
        i = 0
        n = len(line)
        while i < n:
            ch = line[i]

            # ----- Inside a multi-line string continuation -----
            if in_multiline_string:
                end_idx = line.find(multiline_quote, i)
                if end_idx == -1:
                    # Entire rest of line is string
                    tokens.append((line[i:], "string"))
                    return tokens
                end_idx += len(multiline_quote)
                tokens.append((line[i:end_idx], "string"))
                i = end_idx
                in_multiline_string = False
                multiline_quote = ""
                continue

            # ----- Comment: runs to end of line -----
            if ch == "#":
                tokens.append((line[i:], "comment"))
                return tokens

            # ----- Strings (triple-quoted and single-quoted) -----
            if ch in ('"', "'"):
                triple = line[i : i + 3]
                if triple in ('"""', "'''"):
                    end_idx = line.find(triple, i + 3)
                    if end_idx == -1:
                        # Multi-line string starts; rest of line is string
                        tokens.append((line[i:], "string"))
                        return tokens
                    end_idx += len(triple)
                    tokens.append((line[i:end_idx], "string"))
                    i = end_idx
                    continue
                # Single-line string
                quote_char = ch
                j = i + 1
                while j < n:
                    if line[j] == "\\":
                        j += 2  # skip escaped character
                        continue
                    if line[j] == quote_char:
                        j += 1
                        break
                    j += 1
                tokens.append((line[i:j], "string"))
                i = j
                continue

            # ----- Decorator (only whitespace may precede the '@') -----
            if ch == "@" and (i == 0 or line[:i].isspace()):
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] in "_."):
                    j += 1
                tokens.append((line[i:j], "decorator"))
                i = j
                continue

            # ----- Numbers -----
            if ch.isdigit() or (ch == "." and i + 1 < n and line[i + 1].isdigit()):
                # Ensure not preceded by an identifier character
                if i == 0 or not (line[i - 1].isalnum() or line[i - 1] == "_"):
                    m = _NUMBER_RE.match(line, i)
                    if m:
                        tokens.append((m.group(), "number"))
                        i = m.end()
                        continue
                # Otherwise fall through: the char is emitted as "normal" below.

            # ----- Identifiers, keywords, builtins -----
            if ch.isalpha() or ch == "_":
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] == "_"):
                    j += 1
                word = line[i:j]
                if word in _KEYWORDS:
                    tokens.append((word, "keyword"))
                elif word in _BUILTINS:
                    tokens.append((word, "builtin"))
                else:
                    tokens.append((word, "normal"))
                i = j
                continue

            # ----- Whitespace and operators (normal) -----
            # Batch consecutive non-special characters into one token.
            j = i + 1
            while j < n:
                c = line[j]
                if (
                    c.isalpha()
                    or c == "_"
                    or c.isdigit()
                    or c in ('"', "'", "#", "@")
                    or (c == "." and j + 1 < n and line[j + 1].isdigit())
                ):
                    break
                j += 1
            tokens.append((line[i:j], "normal"))
            i = j

        return tokens

    @staticmethod
    def _line_end_state(line: str, in_ml: bool, ml_quote: str) -> tuple[bool, str]:
        """Compute the multiline string state after processing a single line.

        Args:
            line: The line to scan.
            in_ml: Whether the line *starts* inside a triple-quoted string.
            ml_quote: The open triple-quote style when ``in_ml`` is True.

        Returns:
            (in_multiline, quote_style) at the end of the line.
        """
        i, n = 0, len(line)
        while i < n:
            if in_ml:
                idx = line.find(ml_quote, i)
                if idx == -1:
                    return (True, ml_quote)  # string still open at line end
                i = idx + len(ml_quote)
                in_ml = False
                ml_quote = ""
            else:
                ch = line[i]
                if ch == "#":
                    break  # comment: nothing after it can open a string
                if ch in ('"', "'"):
                    triple = line[i : i + 3]
                    if triple in ('"""', "'''"):
                        end = line.find(triple, i + 3)
                        if end == -1:
                            return (True, triple)  # opens and stays open
                        i = end + len(triple)
                        continue
                    # Single-line string: skip it, honouring escapes.
                    q = ch
                    j = i + 1
                    while j < n:
                        if line[j] == "\\":
                            j += 2
                            continue
                        if line[j] == q:
                            j += 1
                            break
                        j += 1
                    i = j
                    continue
                i += 1
        return (in_ml, ml_quote)

    def _build_multiline_state(self) -> list[tuple[bool, str]]:
        """Determine which lines start inside a triple-quoted string.

        Returns a list parallel to self._lines, each entry being
        (in_multiline, quote_style) at the *start* of that line.
        """
        states: list[tuple[bool, str]] = []
        in_ml = False
        ml_quote = ""
        for line in self._lines:
            states.append((in_ml, ml_quote))
            in_ml, ml_quote = self._line_end_state(line, in_ml, ml_quote)
        return states

    def _get_multiline_states(self) -> list:
        """Get cached multiline string states, rebuilding if needed."""
        if self._ml_states_cache is None:
            self._ml_states_cache = self._build_multiline_state()
        return self._ml_states_cache

    def _update_multiline_states(self, from_line: int):
        """Incrementally update multiline states from a given line.

        Recomputes states from from_line onward, stopping early when the
        new state matches the old state (convergence). Only invalidates
        token cache for lines whose multiline state actually changed.
        """
        old_states = self._ml_states_cache
        if old_states is None:
            return  # Will rebuild fully on next access
        num_lines = len(self._lines)

        # Determine the state at the start of from_line from its predecessor.
        if from_line == 0:
            in_ml, ml_quote = False, ""
        elif from_line - 1 < len(old_states):
            # BUGFIX: the bound is from_line - 1 (was from_line) so that a
            # line appended at the very end of the buffer still inherits the
            # multiline state from its predecessor instead of being reset to
            # "not inside a string".
            prev_line = self._lines[from_line - 1] if from_line - 1 < num_lines else ""
            prev_start = old_states[from_line - 1]
            in_ml, ml_quote = self._line_end_state(prev_line, prev_start[0], prev_start[1])
        else:
            in_ml, ml_quote = False, ""

        # Resize states list in place to match _lines
        while len(old_states) < num_lines:
            old_states.append((False, ""))
        while len(old_states) > num_lines:
            old_states.pop()

        # Update from from_line onward, stopping at convergence
        for i in range(from_line, num_lines):
            new_state = (in_ml, ml_quote)
            if old_states[i] == new_state and i > from_line:
                # States have converged -- no further changes needed
                break
            if old_states[i] != new_state:
                # Multiline state changed for this line -- invalidate its token cache
                self._token_cache.pop(i, None)
                old_states[i] = new_state
            in_ml, ml_quote = self._line_end_state(self._lines[i], in_ml, ml_quote)

    def _get_line_tokens(
        self, line_idx: int, ml_states: list[tuple[bool, str]] | None = None
    ) -> list[tuple[str, str]]:
        """Get tokens for a line, using cache when possible.

        Args:
            line_idx: Index into self._lines.
            ml_states: Pre-computed multiline string states (optional).

        Returns:
            List of (token_text, token_type) pairs.
        """
        if line_idx < 0 or line_idx >= len(self._lines):
            return []
        line = self._lines[line_idx]

        # Skip tokenization when syntax highlighting is disabled (large files)
        if not self.syntax_highlighting:
            return [(line, "normal")]

        line_hash = hash(line)

        # Check cache: valid only while the line text is unchanged.
        cached = self._token_cache.get(line_idx)
        if cached and cached[0] == line_hash:
            return cached[1]

        # Determine multiline state for this line
        in_ml = False
        ml_quote = ""
        if ml_states and line_idx < len(ml_states):
            in_ml, ml_quote = ml_states[line_idx]

        tokens = self._tokenize_line(line, in_ml, ml_quote)
        self._token_cache[line_idx] = (line_hash, tokens)
        return tokens

    def _invalidate_cache(self, from_line: int = 0):
        """Invalidate token cache from a given line onward.

        Uses incremental multiline state update when possible -- only
        invalidates token cache for lines whose state actually changed.
        """
        # Always invalidate the edited line itself
        self._token_cache.pop(from_line, None)
        if self._ml_states_cache is not None:
            # Incremental update: recompute from from_line, stop at convergence
            self._update_multiline_states(from_line)
        # else: full rebuild will happen on next _get_multiline_states() call

    # ================================================================
    # Bracket matching
    # ================================================================

    def _find_matching_bracket(self) -> tuple[int, int] | None:
        """Find the matching bracket for the bracket at or adjacent to cursor.

        Reads ``self._cursor_line`` / ``self._cursor_col``.

        Returns:
            (line, col) of the matching bracket, or None.
        """
        line_idx = self._cursor_line
        col = self._cursor_col
        line = self._lines[line_idx]

        # Check character at cursor, then the one just before it.
        bracket_char = None
        bracket_col = -1
        if col < len(line) and line[col] in _ALL_BRACKETS:
            bracket_char = line[col]
            bracket_col = col
        elif col > 0 and line[col - 1] in _ALL_BRACKETS:
            bracket_char = line[col - 1]
            bracket_col = col - 1

        if bracket_char is None:
            return None

        # Determine direction
        if bracket_char in _BRACKET_PAIRS:
            # Opening bracket -- search forward
            target = _BRACKET_PAIRS[bracket_char]
            return self._search_bracket_forward(line_idx, bracket_col, bracket_char, target)
        if bracket_char in _CLOSE_TO_OPEN:
            # Closing bracket -- search backward
            target = _CLOSE_TO_OPEN[bracket_char]
            return self._search_bracket_backward(line_idx, bracket_col, bracket_char, target)
        return None

    def _search_bracket_forward(
        self, start_line: int, start_col: int, open_ch: str, close_ch: str
    ) -> tuple[int, int] | None:
        """Search forward for the matching closing bracket.

        ``depth`` counts nested open brackets of the same kind seen along the
        way; the match is the first close bracket found at depth zero.
        """
        depth = 0
        for li in range(start_line, len(self._lines)):
            line = self._lines[li]
            start = start_col + 1 if li == start_line else 0
            for ci in range(start, len(line)):
                ch = line[ci]
                if ch == open_ch:
                    depth += 1
                elif ch == close_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None

    def _search_bracket_backward(
        self, start_line: int, start_col: int, close_ch: str, open_ch: str
    ) -> tuple[int, int] | None:
        """Search backward for the matching opening bracket (mirror of forward)."""
        depth = 0
        for li in range(start_line, -1, -1):
            line = self._lines[li]
            end = start_col - 1 if li == start_line else len(line) - 1
            for ci in range(end, -1, -1):
                ch = line[ci]
                if ch == close_ch:
                    depth += 1
                elif ch == open_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None