"""Syntax highlighting tokenizer for Python source code.
Provides line-by-line tokenization with multi-line string state tracking,
token caching, and incremental cache invalidation.
"""
from __future__ import annotations
import re
# ============================================================================
# Token sets
# ============================================================================
_KEYWORDS = frozenset(
{
"def",
"class",
"if",
"else",
"elif",
"for",
"while",
"return",
"import",
"from",
"as",
"try",
"except",
"finally",
"raise",
"with",
"yield",
"lambda",
"pass",
"break",
"continue",
"and",
"or",
"not",
"in",
"is",
"None",
"True",
"False",
"global",
"nonlocal",
"assert",
"del",
"async",
"await",
}
)
_BUILTINS = frozenset(
{
"print",
"len",
"range",
"str",
"int",
"float",
"list",
"dict",
"tuple",
"set",
"type",
"isinstance",
"hasattr",
"getattr",
"setattr",
"super",
"self",
"cls",
}
)
_BRACKET_PAIRS = {"(": ")", "[": "]", "{": "}"}
_CLOSE_TO_OPEN = {v: k for k, v in _BRACKET_PAIRS.items()}
_ALL_BRACKETS = frozenset(_BRACKET_PAIRS.keys()) | frozenset(_BRACKET_PAIRS.values())
# Regex for number literals (int and float, including scientific notation)
_NUMBER_RE = re.compile(
r"(?<![a-zA-Z_])"
r"(?:"
r"0[xX][0-9a-fA-F_]+" # hex
r"|0[oO][0-7_]+" # octal
r"|0[bB][01_]+" # binary
r"|\d[\d_]*\.[\d_]*(?:[eE][+-]?\d+)?" # float with dot
r"|\.[\d_]+(?:[eE][+-]?\d+)?" # float starting with dot
r"|\d[\d_]*[eE][+-]?\d+" # float scientific only
r"|\d[\d_]*" # integer
r")"
r"[jJ]?" # complex suffix
)
class SyntaxHighlighterMixin:
    """Mixin providing Python syntax tokenization and caching.

    Tokenization is line-based: each line is split into ``(text, type)``
    pairs where ``type`` is one of ``"keyword"``, ``"builtin"``,
    ``"string"``, ``"comment"``, ``"number"``, ``"decorator"`` or
    ``"normal"``.  Because triple-quoted strings span lines, a parallel
    per-line state ``(in_multiline, quote_style)`` is cached and updated
    incrementally on edits.

    Expects the host class to have:
      - ``_lines: list[str]`` -- buffer contents, one entry per line
      - ``syntax_highlighting: bool`` -- master switch for tokenization
      - ``_cursor_line: int`` / ``_cursor_col: int`` -- used by bracket matching
    """
    def _init_syntax_highlighter(self):
        """Initialise tokenizer caches. Call from ``__init__`` before text is set."""
        # line index -> (line_text, tokens). The full line text is stored
        # (not its hash) so a stale entry can never be mistaken for a valid
        # one via a hash collision.
        self._token_cache: dict[int, tuple[str, list[tuple[str, str]]]] = {}
        # NOTE(review): not populated anywhere in this file -- presumably
        # maintained by the host class; confirm before removing.
        self._multiline_string_lines: set[int] = set()
        # Parallel to self._lines: state at the *start* of each line.
        # ``None`` means the states must be rebuilt from scratch.
        self._ml_states_cache: list | None = None
    # ================================================================
    # Tokenizer
    # ================================================================
    def _tokenize_line(
        self, line: str, in_multiline_string: bool = False, multiline_quote: str = ""
    ) -> list[tuple[str, str]]:
        """Tokenize a single line into (text, token_type) pairs.

        Args:
            line: The source line to tokenize.
            in_multiline_string: Whether we are inside a triple-quoted string
                from a previous line.
            multiline_quote: The quote style of the open triple-quote
                ('\"\"\"' or \"'''\").

        Returns:
            List of (token_text, token_type) tuples covering the full line.
        """
        tokens: list[tuple[str, str]] = []
        i = 0
        n = len(line)
        while i < n:
            ch = line[i]
            # ----- Inside a multi-line string continuation -----
            if in_multiline_string:
                end_idx = line.find(multiline_quote, i)
                if end_idx == -1:
                    # Entire rest of line is string
                    tokens.append((line[i:], "string"))
                    return tokens
                else:
                    end_idx += 3  # include the closing triple quote
                    tokens.append((line[i:end_idx], "string"))
                    i = end_idx
                    in_multiline_string = False
                    multiline_quote = ""
                    continue
            # ----- Comment: consumes the rest of the line -----
            if ch == "#":
                tokens.append((line[i:], "comment"))
                return tokens
            # ----- Strings (triple-quoted and single-quoted) -----
            if ch in ('"', "'"):
                # Check for triple quote
                triple = line[i : i + 3]
                if triple in ('"""', "'''"):
                    # Find closing triple quote on the same line
                    end_idx = line.find(triple, i + 3)
                    if end_idx == -1:
                        # Multi-line string starts, rest of line is string
                        tokens.append((line[i:], "string"))
                        return tokens
                    else:
                        end_idx += 3
                        tokens.append((line[i:end_idx], "string"))
                        i = end_idx
                        continue
                else:
                    # Single-line string: scan to the matching quote,
                    # honouring backslash escapes. An unterminated string
                    # simply runs to end of line.
                    quote_char = ch
                    j = i + 1
                    while j < n:
                        if line[j] == "\\":
                            j += 2  # skip escaped character
                            continue
                        if line[j] == quote_char:
                            j += 1
                            break
                        j += 1
                    tokens.append((line[i:j], "string"))
                    i = j
                    continue
            # ----- Decorator (only when @ is the first non-space char) -----
            if ch == "@" and (i == 0 or line[:i].isspace()):
                # Consume the dotted name after '@' (stops at parenthesis/end)
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] in "_."):
                    j += 1
                tokens.append((line[i:j], "decorator"))
                i = j
                continue
            # ----- Numbers -----
            if ch.isdigit() or (ch == "." and i + 1 < n and line[i + 1].isdigit()):
                # Ensure not preceded by identifier character (e.g. "x1")
                if i == 0 or not (line[i - 1].isalnum() or line[i - 1] == "_"):
                    m = _NUMBER_RE.match(line, i)
                    if m:
                        tokens.append((m.group(), "number"))
                        i = m.end()
                        continue
            # ----- Identifiers and keywords -----
            if ch.isalpha() or ch == "_":
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] == "_"):
                    j += 1
                word = line[i:j]
                if word in _KEYWORDS:
                    tokens.append((word, "keyword"))
                elif word in _BUILTINS:
                    tokens.append((word, "builtin"))
                else:
                    tokens.append((word, "normal"))
                i = j
                continue
            # ----- Whitespace and operators (normal) -----
            j = i + 1
            # Batch consecutive characters that cannot start another token,
            # so runs of operators/spaces become a single "normal" token.
            while j < n:
                c = line[j]
                if (
                    c.isalpha()
                    or c == "_"
                    or c.isdigit()
                    or c in ('"', "'", "#", "@")
                    or (c == "." and j + 1 < n and line[j + 1].isdigit())
                ):
                    break
                j += 1
            tokens.append((line[i:j], "normal"))
            i = j
        return tokens
    @staticmethod
    def _line_end_state(line: str, in_ml: bool, ml_quote: str) -> tuple[bool, str]:
        """Compute the multiline-string state after processing a single line.

        Mirrors the string/comment scanning of :meth:`_tokenize_line` but
        tracks only whether a triple-quoted string is still open at EOL.
        """
        i, n = 0, len(line)
        while i < n:
            if in_ml:
                idx = line.find(ml_quote, i)
                if idx == -1:
                    return (True, ml_quote)
                i = idx + 3
                in_ml = False
                ml_quote = ""
            else:
                ch = line[i]
                if ch == "#":
                    # Comment runs to EOL; nothing after it can open a string.
                    break
                if ch in ('"', "'"):
                    triple = line[i : i + 3]
                    if triple in ('"""', "'''"):
                        end = line.find(triple, i + 3)
                        if end == -1:
                            return (True, triple)
                        i = end + 3
                        continue
                    else:
                        # Skip a single-line string, honouring escapes.
                        q = ch
                        j = i + 1
                        while j < n:
                            if line[j] == "\\":
                                j += 2
                                continue
                            if line[j] == q:
                                j += 1
                                break
                            j += 1
                        i = j
                        continue
                i += 1
        return (in_ml, ml_quote)
    def _build_multiline_state(self) -> list[tuple[bool, str]]:
        """Determine which lines start inside a triple-quoted string.

        Returns a list parallel to self._lines, each entry being
        (in_multiline, quote_style) at the *start* of that line.
        """
        states: list[tuple[bool, str]] = []
        in_ml = False
        ml_quote = ""
        for line in self._lines:
            states.append((in_ml, ml_quote))
            in_ml, ml_quote = self._line_end_state(line, in_ml, ml_quote)
        return states
    def _get_multiline_states(self) -> list:
        """Get cached multiline string states, rebuilding if needed."""
        if self._ml_states_cache is None:
            self._ml_states_cache = self._build_multiline_state()
        return self._ml_states_cache
    def _update_multiline_states(self, from_line: int):
        """Incrementally update multiline states from a given line.

        Recomputes states from from_line onward, stopping early when the
        new state matches the old state (convergence). Only invalidates
        token cache for lines whose multiline state actually changed.
        """
        old_states = self._ml_states_cache
        if old_states is None:
            return  # Will rebuild fully on next access
        num_lines = len(self._lines)
        # Determine the state at the start of from_line by re-scanning the
        # previous line from its cached start state.
        if from_line == 0:
            in_ml, ml_quote = False, ""
        elif from_line - 1 < len(old_states):
            # Also covers a line newly appended at the end of the buffer
            # (from_line == len(old_states)); previously that case fell
            # through to the (False, "") fallback and mis-highlighted lines
            # appended inside an open triple-quoted string.
            prev_line = self._lines[from_line - 1] if from_line - 1 < num_lines else ""
            prev_start = old_states[from_line - 1]
            in_ml, ml_quote = self._line_end_state(prev_line, prev_start[0], prev_start[1])
        else:
            # No cached information near from_line -- conservative fallback.
            in_ml, ml_quote = False, ""
        # Resize states list to match _lines
        while len(old_states) < num_lines:
            old_states.append((False, ""))
        while len(old_states) > num_lines:
            old_states.pop()
        # Update from from_line onward, stopping at convergence
        for i in range(from_line, num_lines):
            new_state = (in_ml, ml_quote)
            if old_states[i] == new_state and i > from_line:
                # States have converged -- no further changes needed
                break
            if old_states[i] != new_state:
                # Multiline state changed for this line -- invalidate its token cache
                self._token_cache.pop(i, None)
                old_states[i] = new_state
            in_ml, ml_quote = self._line_end_state(self._lines[i], in_ml, ml_quote)
    def _get_line_tokens(self, line_idx: int, ml_states: list[tuple[bool, str]] | None = None) -> list[tuple[str, str]]:
        """Get tokens for a line, using cache when possible.

        Args:
            line_idx: Index into self._lines.
            ml_states: Pre-computed multiline string states (optional).

        Returns:
            List of (token_text, token_type) pairs.
        """
        if line_idx < 0 or line_idx >= len(self._lines):
            return []
        line = self._lines[line_idx]
        # Skip tokenization when syntax highlighting is disabled (large files)
        if not self.syntax_highlighting:
            return [(line, "normal")]
        # Check cache: compare against the stored line text itself, not a
        # hash, so a collision can never return stale tokens.
        cached = self._token_cache.get(line_idx)
        if cached and cached[0] == line:
            return cached[1]
        # Determine multiline state for this line
        in_ml = False
        ml_quote = ""
        if ml_states and line_idx < len(ml_states):
            in_ml, ml_quote = ml_states[line_idx]
        tokens = self._tokenize_line(line, in_ml, ml_quote)
        self._token_cache[line_idx] = (line, tokens)
        return tokens
    def _invalidate_cache(self, from_line: int = 0):
        """Invalidate token cache from a given line onward.

        Uses incremental multiline state update when possible -- only
        invalidates token cache for lines whose state actually changed.
        """
        # Always invalidate the edited line itself
        self._token_cache.pop(from_line, None)
        if self._ml_states_cache is not None:
            # Incremental update: recompute from from_line, stop at convergence
            self._update_multiline_states(from_line)
        # else: full rebuild will happen on next _get_multiline_states() call
    # ================================================================
    # Bracket matching
    # ================================================================
    def _find_matching_bracket(self) -> tuple[int, int] | None:
        """Find the matching bracket for the bracket at or adjacent to cursor.

        Returns (line, col) of the matching bracket, or None.

        NOTE(review): the search is purely character-based -- brackets inside
        strings or comments are counted too; confirm this is acceptable.
        """
        line_idx = self._cursor_line
        col = self._cursor_col
        line = self._lines[line_idx]
        # Check character at cursor first, then the one before it
        bracket_char = None
        bracket_col = -1
        if col < len(line) and line[col] in _ALL_BRACKETS:
            bracket_char = line[col]
            bracket_col = col
        elif col > 0 and line[col - 1] in _ALL_BRACKETS:
            bracket_char = line[col - 1]
            bracket_col = col - 1
        if bracket_char is None:
            return None
        # Determine direction
        if bracket_char in _BRACKET_PAIRS:
            # Opening bracket -- search forward
            target = _BRACKET_PAIRS[bracket_char]
            return self._search_bracket_forward(line_idx, bracket_col, bracket_char, target)
        elif bracket_char in _CLOSE_TO_OPEN:
            # Closing bracket -- search backward
            target = _CLOSE_TO_OPEN[bracket_char]
            return self._search_bracket_backward(line_idx, bracket_col, bracket_char, target)
        return None
    def _search_bracket_forward(
        self, start_line: int, start_col: int, open_ch: str, close_ch: str
    ) -> tuple[int, int] | None:
        """Search forward for the matching closing bracket.

        ``depth`` counts nested same-kind openers seen after the start
        bracket; the match is the first closer found at depth 0.
        """
        depth = 0
        for li in range(start_line, len(self._lines)):
            line = self._lines[li]
            start = start_col + 1 if li == start_line else 0
            for ci in range(start, len(line)):
                ch = line[ci]
                if ch == open_ch:
                    depth += 1
                elif ch == close_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None
    def _search_bracket_backward(
        self, start_line: int, start_col: int, close_ch: str, open_ch: str
    ) -> tuple[int, int] | None:
        """Search backward for the matching opening bracket (mirror of forward)."""
        depth = 0
        for li in range(start_line, -1, -1):
            line = self._lines[li]
            end = start_col - 1 if li == start_line else len(line) - 1
            for ci in range(end, -1, -1):
                ch = line[ci]
                if ch == close_ch:
                    depth += 1
                elif ch == open_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None