"""Syntax highlighting tokenizer for Python source code.
Provides line-by-line tokenization with multi-line string state tracking,
token caching, and incremental cache invalidation.
"""
from __future__ import annotations
import re
# ============================================================================
# Token sets
# ============================================================================
_KEYWORDS = frozenset(
{
"def",
"class",
"if",
"else",
"elif",
"for",
"while",
"return",
"import",
"from",
"as",
"try",
"except",
"finally",
"raise",
"with",
"yield",
"lambda",
"pass",
"break",
"continue",
"and",
"or",
"not",
"in",
"is",
"None",
"True",
"False",
"global",
"nonlocal",
"assert",
"del",
"async",
"await",
}
)
_BUILTINS = frozenset(
{
"print",
"len",
"range",
"str",
"int",
"float",
"list",
"dict",
"tuple",
"set",
"type",
"isinstance",
"hasattr",
"getattr",
"setattr",
"super",
"self",
"cls",
}
)
_BRACKET_PAIRS = {"(": ")", "[": "]", "{": "}"}
_CLOSE_TO_OPEN = {v: k for k, v in _BRACKET_PAIRS.items()}
_ALL_BRACKETS = frozenset(_BRACKET_PAIRS.keys()) | frozenset(_BRACKET_PAIRS.values())
# Regex for number literals (int and float, including scientific notation)
_NUMBER_RE = re.compile(
r"(?<![a-zA-Z_])"
r"(?:"
r"0[xX][0-9a-fA-F_]+" # hex
r"|0[oO][0-7_]+" # octal
r"|0[bB][01_]+" # binary
r"|\d[\d_]*\.[\d_]*(?:[eE][+-]?\d+)?" # float with dot
r"|\.[\d_]+(?:[eE][+-]?\d+)?" # float starting with dot
r"|\d[\d_]*[eE][+-]?\d+" # float scientific only
r"|\d[\d_]*" # integer
r")"
r"[jJ]?" # complex suffix
)
class SyntaxHighlighterMixin:
    """Mixin providing Python syntax tokenization and caching.

    Tokenization is line-based: each line is split into ``(text, type)``
    pairs where ``type`` is one of ``"keyword"``, ``"builtin"``,
    ``"string"``, ``"comment"``, ``"number"``, ``"decorator"`` or
    ``"normal"``.  Because triple-quoted strings span lines, a parallel
    per-line state ``(in_multiline, quote_style)`` is cached and updated
    incrementally on edits.

    Expects the host class to have:
      - ``_lines: list[str]`` -- buffer contents, one entry per line
      - ``syntax_highlighting: bool`` -- master switch for tokenization
      - ``_cursor_line: int`` / ``_cursor_col: int`` -- used by bracket matching
    """
    def _init_syntax_highlighter(self):
        """Initialise tokenizer caches. Call from ``__init__`` before text is set."""
        # line index -> (line_text, tokens). The full line text is stored
        # (not its hash) so a stale entry can never be mistaken for a valid
        # one via a hash collision.
        self._token_cache: dict[int, tuple[str, list[tuple[str, str]]]] = {}
        # NOTE(review): not populated anywhere in this file -- presumably
        # maintained by the host class; confirm before removing.
        self._multiline_string_lines: set[int] = set()
        # Parallel to self._lines: state at the *start* of each line.
        # ``None`` means the states must be rebuilt from scratch.
        self._ml_states_cache: list | None = None
    # ================================================================
    # Tokenizer
    # ================================================================
    def _tokenize_line(
        self, line: str, in_multiline_string: bool = False, multiline_quote: str = ""
    ) -> list[tuple[str, str]]:
        """Tokenize a single line into (text, token_type) pairs.

        Args:
            line: The source line to tokenize.
            in_multiline_string: Whether we are inside a triple-quoted string
                from a previous line.
            multiline_quote: The quote style of the open triple-quote
                ('\"\"\"' or \"'''\").

        Returns:
            List of (token_text, token_type) tuples covering the full line.
        """
        tokens: list[tuple[str, str]] = []
        i = 0
        n = len(line)
        while i < n:
            ch = line[i]
            # ----- Inside a multi-line string continuation -----
            if in_multiline_string:
                end_idx = line.find(multiline_quote, i)
                if end_idx == -1:
                    # Entire rest of line is string
                    tokens.append((line[i:], "string"))
                    return tokens
                else:
                    end_idx += 3  # include the closing triple quote
                    tokens.append((line[i:end_idx], "string"))
                    i = end_idx
                    in_multiline_string = False
                    multiline_quote = ""
                    continue
            # ----- Comment: consumes the rest of the line -----
            if ch == "#":
                tokens.append((line[i:], "comment"))
                return tokens
            # ----- Strings (triple-quoted and single-quoted) -----
            if ch in ('"', "'"):
                # Check for triple quote
                triple = line[i : i + 3]
                if triple in ('"""', "'''"):
                    # Find closing triple quote on the same line
                    end_idx = line.find(triple, i + 3)
                    if end_idx == -1:
                        # Multi-line string starts, rest of line is string
                        tokens.append((line[i:], "string"))
                        return tokens
                    else:
                        end_idx += 3
                        tokens.append((line[i:end_idx], "string"))
                        i = end_idx
                        continue
                else:
                    # Single-line string: scan to the matching quote,
                    # honouring backslash escapes. An unterminated string
                    # simply runs to end of line.
                    quote_char = ch
                    j = i + 1
                    while j < n:
                        if line[j] == "\\":
                            j += 2  # skip escaped character
                            continue
                        if line[j] == quote_char:
                            j += 1
                            break
                        j += 1
                    tokens.append((line[i:j], "string"))
                    i = j
                    continue
            # ----- Decorator (only when @ is the first non-space char) -----
            if ch == "@" and (i == 0 or line[:i].isspace()):
                # Consume the dotted name after '@' (stops at parenthesis/end)
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] in "_."):
                    j += 1
                tokens.append((line[i:j], "decorator"))
                i = j
                continue
            # ----- Numbers -----
            if ch.isdigit() or (ch == "." and i + 1 < n and line[i + 1].isdigit()):
                # Ensure not preceded by identifier character (e.g. "x1")
                if i == 0 or not (line[i - 1].isalnum() or line[i - 1] == "_"):
                    m = _NUMBER_RE.match(line, i)
                    if m:
                        tokens.append((m.group(), "number"))
                        i = m.end()
                        continue
            # ----- Identifiers and keywords -----
            if ch.isalpha() or ch == "_":
                j = i + 1
                while j < n and (line[j].isalnum() or line[j] == "_"):
                    j += 1
                word = line[i:j]
                if word in _KEYWORDS:
                    tokens.append((word, "keyword"))
                elif word in _BUILTINS:
                    tokens.append((word, "builtin"))
                else:
                    tokens.append((word, "normal"))
                i = j
                continue
            # ----- Whitespace and operators (normal) -----
            j = i + 1
            # Batch consecutive characters that cannot start another token,
            # so runs of operators/spaces become a single "normal" token.
            while j < n:
                c = line[j]
                if (
                    c.isalpha()
                    or c == "_"
                    or c.isdigit()
                    or c in ('"', "'", "#", "@")
                    or (c == "." and j + 1 < n and line[j + 1].isdigit())
                ):
                    break
                j += 1
            tokens.append((line[i:j], "normal"))
            i = j
        return tokens
    @staticmethod
    def _line_end_state(line: str, in_ml: bool, ml_quote: str) -> tuple[bool, str]:
        """Compute the multiline-string state after processing a single line.

        Mirrors the string/comment scanning of :meth:`_tokenize_line` but
        tracks only whether a triple-quoted string is still open at EOL.
        """
        i, n = 0, len(line)
        while i < n:
            if in_ml:
                idx = line.find(ml_quote, i)
                if idx == -1:
                    return (True, ml_quote)
                i = idx + 3
                in_ml = False
                ml_quote = ""
            else:
                ch = line[i]
                if ch == "#":
                    # Comment runs to EOL; nothing after it can open a string.
                    break
                if ch in ('"', "'"):
                    triple = line[i : i + 3]
                    if triple in ('"""', "'''"):
                        end = line.find(triple, i + 3)
                        if end == -1:
                            return (True, triple)
                        i = end + 3
                        continue
                    else:
                        # Skip a single-line string, honouring escapes.
                        q = ch
                        j = i + 1
                        while j < n:
                            if line[j] == "\\":
                                j += 2
                                continue
                            if line[j] == q:
                                j += 1
                                break
                            j += 1
                        i = j
                        continue
                i += 1
        return (in_ml, ml_quote)
    def _build_multiline_state(self) -> list[tuple[bool, str]]:
        """Determine which lines start inside a triple-quoted string.

        Returns a list parallel to self._lines, each entry being
        (in_multiline, quote_style) at the *start* of that line.
        """
        states: list[tuple[bool, str]] = []
        in_ml = False
        ml_quote = ""
        for line in self._lines:
            states.append((in_ml, ml_quote))
            in_ml, ml_quote = self._line_end_state(line, in_ml, ml_quote)
        return states
    def _get_multiline_states(self) -> list:
        """Get cached multiline string states, rebuilding if needed."""
        if self._ml_states_cache is None:
            self._ml_states_cache = self._build_multiline_state()
        return self._ml_states_cache
    def _update_multiline_states(self, from_line: int):
        """Incrementally update multiline states from a given line.

        Recomputes states from from_line onward, stopping early when the
        new state matches the old state (convergence). Only invalidates
        token cache for lines whose multiline state actually changed.
        """
        old_states = self._ml_states_cache
        if old_states is None:
            return  # Will rebuild fully on next access
        num_lines = len(self._lines)
        # Determine the state at the start of from_line by re-scanning the
        # previous line from its cached start state.
        if from_line == 0:
            in_ml, ml_quote = False, ""
        elif from_line - 1 < len(old_states):
            # Also covers a line newly appended at the end of the buffer
            # (from_line == len(old_states)); previously that case fell
            # through to the (False, "") fallback and mis-highlighted lines
            # appended inside an open triple-quoted string.
            prev_line = self._lines[from_line - 1] if from_line - 1 < num_lines else ""
            prev_start = old_states[from_line - 1]
            in_ml, ml_quote = self._line_end_state(prev_line, prev_start[0], prev_start[1])
        else:
            # No cached information near from_line -- conservative fallback.
            in_ml, ml_quote = False, ""
        # Resize states list to match _lines
        while len(old_states) < num_lines:
            old_states.append((False, ""))
        while len(old_states) > num_lines:
            old_states.pop()
        # Update from from_line onward, stopping at convergence
        for i in range(from_line, num_lines):
            new_state = (in_ml, ml_quote)
            if old_states[i] == new_state and i > from_line:
                # States have converged -- no further changes needed
                break
            if old_states[i] != new_state:
                # Multiline state changed for this line -- invalidate its token cache
                self._token_cache.pop(i, None)
                old_states[i] = new_state
            in_ml, ml_quote = self._line_end_state(self._lines[i], in_ml, ml_quote)
    def _get_line_tokens(self, line_idx: int, ml_states: list[tuple[bool, str]] | None = None) -> list[tuple[str, str]]:
        """Get tokens for a line, using cache when possible.

        Args:
            line_idx: Index into self._lines.
            ml_states: Pre-computed multiline string states (optional).

        Returns:
            List of (token_text, token_type) pairs.
        """
        if line_idx < 0 or line_idx >= len(self._lines):
            return []
        line = self._lines[line_idx]
        # Skip tokenization when syntax highlighting is disabled (large files)
        if not self.syntax_highlighting:
            return [(line, "normal")]
        # Check cache: compare against the stored line text itself, not a
        # hash, so a collision can never return stale tokens.
        cached = self._token_cache.get(line_idx)
        if cached and cached[0] == line:
            return cached[1]
        # Determine multiline state for this line
        in_ml = False
        ml_quote = ""
        if ml_states and line_idx < len(ml_states):
            in_ml, ml_quote = ml_states[line_idx]
        tokens = self._tokenize_line(line, in_ml, ml_quote)
        self._token_cache[line_idx] = (line, tokens)
        return tokens
    def _invalidate_cache(self, from_line: int = 0):
        """Invalidate token cache from a given line onward.

        Uses incremental multiline state update when possible -- only
        invalidates token cache for lines whose state actually changed.
        """
        # Always invalidate the edited line itself
        self._token_cache.pop(from_line, None)
        if self._ml_states_cache is not None:
            # Incremental update: recompute from from_line, stop at convergence
            self._update_multiline_states(from_line)
        # else: full rebuild will happen on next _get_multiline_states() call
    # ================================================================
    # Bracket matching
    # ================================================================
    def _find_matching_bracket(self) -> tuple[int, int] | None:
        """Find the matching bracket for the bracket at or adjacent to cursor.

        Returns (line, col) of the matching bracket, or None.

        NOTE(review): the search is purely character-based -- brackets inside
        strings or comments are counted too; confirm this is acceptable.
        """
        line_idx = self._cursor_line
        col = self._cursor_col
        line = self._lines[line_idx]
        # Check character at cursor first, then the one before it
        bracket_char = None
        bracket_col = -1
        if col < len(line) and line[col] in _ALL_BRACKETS:
            bracket_char = line[col]
            bracket_col = col
        elif col > 0 and line[col - 1] in _ALL_BRACKETS:
            bracket_char = line[col - 1]
            bracket_col = col - 1
        if bracket_char is None:
            return None
        # Determine direction
        if bracket_char in _BRACKET_PAIRS:
            # Opening bracket -- search forward
            target = _BRACKET_PAIRS[bracket_char]
            return self._search_bracket_forward(line_idx, bracket_col, bracket_char, target)
        elif bracket_char in _CLOSE_TO_OPEN:
            # Closing bracket -- search backward
            target = _CLOSE_TO_OPEN[bracket_char]
            return self._search_bracket_backward(line_idx, bracket_col, bracket_char, target)
        return None
    def _search_bracket_forward(
        self, start_line: int, start_col: int, open_ch: str, close_ch: str
    ) -> tuple[int, int] | None:
        """Search forward for the matching closing bracket.

        ``depth`` counts nested same-kind openers seen after the start
        bracket; the match is the first closer found at depth 0.
        """
        depth = 0
        for li in range(start_line, len(self._lines)):
            line = self._lines[li]
            start = start_col + 1 if li == start_line else 0
            for ci in range(start, len(line)):
                ch = line[ci]
                if ch == open_ch:
                    depth += 1
                elif ch == close_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None
    def _search_bracket_backward(
        self, start_line: int, start_col: int, close_ch: str, open_ch: str
    ) -> tuple[int, int] | None:
        """Search backward for the matching opening bracket (mirror of forward)."""
        depth = 0
        for li in range(start_line, -1, -1):
            line = self._lines[li]
            end = start_col - 1 if li == start_line else len(line) - 1
            for ci in range(end, -1, -1):
                ch = line[ci]
                if ch == close_ch:
                    depth += 1
                elif ch == open_ch:
                    if depth == 0:
                        return (li, ci)
                    depth -= 1
        return None