# Source code for simvx.core.testing.benchmark

"""Performance-benchmark harness for SimVX's opt-in regression suite.

This module is the single home for the benchmark plumbing shared by every
``@pytest.mark.perf`` suite (across packages) and by ``tools/run_benchmarks.py``:

* :class:`BenchmarkResult` -- one measured data point (frame timing + metadata).
* :func:`bench_scene_runner` / :func:`bench_headless_render` -- run a scene and
  measure it (CPU-only via :class:`~simvx.core.testing.SceneRunner`, or a full
  headless Vulkan render; the latter lazy-imports ``simvx.graphics``).
* :class:`MachineInfo` -- host/CPU/GPU/interpreter snapshot stamped onto records.
* :class:`HistoryStore` -- append-only per-machine history + pinned baselines.
* :class:`PerfRecorder` -- records results, compares against the machine's own
  baseline, and (outside report-only mode) fails when a metric regresses.

These utilities measure **speed only**: correctness lives in the normal suites.
Results are stored locally under the perf directory (:func:`perf_dir`) and are
never committed -- absolute numbers are machine-specific, so regressions are
judged against this machine's own recorded baseline, not a hardcoded threshold.
"""

from __future__ import annotations

import json
import os
import platform
import subprocess
import sys
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from .diagnostics import FrameTimer, NodeCounter
from .scene_runner import SceneRunner

__all__ = [
    "BenchmarkResult",
    "HistoryStore",
    "MachineInfo",
    "PerfRecord",
    "PerfRecorder",
    "bench_headless_render",
    "bench_scene_runner",
    "compare",
    "make_perf_recorder",
    "perf_dir",
    "perf_pytest_addoption",
    "record_key",
]

# Default regression tolerance: a metric must move more than this fraction
# against the baseline to count as a regression/improvement (else "ok").
DEFAULT_TOLERANCE = 0.20


# ============================================================================
# Result record
# ============================================================================


@dataclass
class BenchmarkResult:
    """A single benchmark measurement.

    :meth:`metric_value` yields the headline number. Unless overridden it is
    the mean frame time in milliseconds (``metric="frame_ms"``, lower is
    better). Limit-style benchmarks (e.g. "max sprites at 60 FPS") set
    ``metric``, ``value`` and ``lower_is_better=False`` so that the comparison
    treats higher values as better.
    """

    name: str
    count: int = 0
    total_ms: float = 0.0
    frames: int = 0
    # Wall-clock time of each frame (ms); feeds the percentile properties.
    samples_ms: list[float] = field(default_factory=list)
    # Mean GPU time per frame (ms) from app telemetry, when rendering.
    gpu_ms: float | None = None
    backend: str = "default"
    metric: str = "frame_ms"
    # Explicit headline value for metrics other than frame time (e.g. an object count).
    value: float | None = None
    lower_is_better: bool = True
    extra: dict[str, float] = field(default_factory=dict)

    @property
    def avg_frame_ms(self) -> float:
        """Mean frame time in ms (``total_ms / frames``); 0.0 when ``frames`` is zero."""
        if not self.frames:
            return 0.0
        return self.total_ms / self.frames

    @property
    def fps(self) -> float:
        """Frames per second derived from :attr:`avg_frame_ms`; 0.0 when that is not positive."""
        mean_ms = self.avg_frame_ms
        return 1000.0 / mean_ms if mean_ms > 0 else 0.0

    @property
    def per_object_us(self) -> float:
        """Microseconds per object per frame; 0.0 when ``count`` is zero."""
        if not self.count:
            return 0.0
        return self.avg_frame_ms * 1000.0 / self.count

    def _percentile(self, q: float) -> float:
        """Return the sample at rank ``int(len * q)`` of the sorted samples (clamped to the last).

        Returns 0.0 when there are no samples.
        """
        if not self.samples_ms:
            return 0.0
        ranked = sorted(self.samples_ms)
        last = len(ranked) - 1
        position = int(len(ranked) * q)
        return ranked[min(last, position)]

    @property
    def p50_ms(self) -> float:
        """Median-rank frame time in ms."""
        return self._percentile(0.50)

    @property
    def p95_ms(self) -> float:
        """95th-percentile-rank frame time in ms."""
        return self._percentile(0.95)

    @property
    def p99_ms(self) -> float:
        """99th-percentile-rank frame time in ms."""
        return self._percentile(0.99)

    @property
    def min_ms(self) -> float:
        """Fastest sampled frame in ms, or 0.0 without samples."""
        if not self.samples_ms:
            return 0.0
        return min(self.samples_ms)

    @property
    def max_ms(self) -> float:
        """Slowest sampled frame in ms, or 0.0 without samples."""
        if not self.samples_ms:
            return 0.0
        return max(self.samples_ms)

    def metric_value(self) -> float:
        """Return the headline value used for the baseline comparison.

        This is ``value`` when one was set explicitly, else :attr:`avg_frame_ms`.
        """
        if self.value is None:
            return self.avg_frame_ms
        return self.value

    def report(self) -> str:
        """Render a multi-line, human-readable summary of this result."""
        # The backend tag is only shown for non-default backends.
        suffix = "" if self.backend == "default" else f" [{self.backend}]"
        out = [f" {self.name}{suffix} (n={self.count:,}):"]
        if self.metric != "frame_ms":
            out.append(f" {self.metric}: {self.metric_value():,.2f}")
        else:
            out.append(f" avg frame: {self.avg_frame_ms:.2f} ms ({self.fps:.0f} FPS)")
            if self.samples_ms:
                out.append(f" p50/p95/p99: {self.p50_ms:.2f} / {self.p95_ms:.2f} / {self.p99_ms:.2f} ms")
            if self.count:
                out.append(f" per object: {self.per_object_us:.2f} µs")
        if self.gpu_ms is not None:
            out.append(f" gpu: {self.gpu_ms:.2f} ms/frame")
        out.extend(f" {label}: {amount:,.2f}" for label, amount in self.extra.items())
        return "\n".join(out)

# ============================================================================
# Measurement helpers
# ============================================================================

def bench_scene_runner(root, frames: int = 120, draw: bool = False, *, backend: str = "default") -> BenchmarkResult:
    """Benchmark a scene with :class:`SceneRunner` (CPU only, no GPU).

    Args:
        root: Scene root passed to ``SceneRunner.load``.
        frames: Number of frames to time.
        draw: Forwarded to ``advance_frames`` for every frame, warm-up included.
        backend: Label stored on the returned result.

    Returns:
        A :class:`BenchmarkResult` named after ``type(root)`` holding one
        sample per timed frame.
    """
    harness = SceneRunner(screen_size=(1280, 720))
    harness.load(root)
    harness.advance_frames(2, draw=draw)  # let ready() settle

    stopwatch = FrameTimer()

    def _timed_frame_ms() -> float:
        stopwatch.begin_frame()
        harness.advance_frames(1, draw=draw)
        stopwatch.end_frame()
        # NOTE(review): the * 1000.0 suggests FrameTimer stores seconds -- confirm.
        return stopwatch._times[-1] * 1000.0

    frame_samples: list[float] = [_timed_frame_ms() for _ in range(frames)]

    # A scene may declare its own object count; otherwise count the nodes.
    declared = getattr(root, "_bench_count", 0)
    object_count = declared if declared else NodeCounter.total(harness.root)

    return BenchmarkResult(
        name=type(root).__name__,
        count=object_count,
        total_ms=sum(frame_samples),
        frames=frames,
        samples_ms=frame_samples,
        backend=backend,
    )
def bench_headless_render(
    scene_or_cls,
    frames: int = 60,
    width: int = 1280,
    height: int = 720,
    *,
    warmup: int = 0,
    telemetry_keys: tuple[str, ...] = (),
    backend: str = "default",
    **kwargs,
) -> BenchmarkResult:
    """Benchmark a scene with full Vulkan rendering (headless).

    ``simvx.graphics`` is imported lazily so the core package stays
    render-agnostic.

    Args:
        scene_or_cls: A scene instance, or a class instantiated with ``**kwargs``.
        frames: Number of frame intervals to measure (``frames + 1`` frames are run).
        width: Render width passed to ``App``.
        height: Render height passed to ``App``.
        warmup: Leading intervals to discard; applied only when more than
            ``warmup + 1`` intervals were measured.
        telemetry_keys: Entries of ``app.last_telemetry`` to copy into
            ``result.extra`` (e.g. ``"occlusion_drawn"``, ``"transform_high_water"``).
        backend: Label stored on the returned result.
        **kwargs: Constructor arguments for ``scene_or_cls`` when it is a class;
            ``n`` / ``count`` / ``amount`` also serve as object-count fallbacks.

    Returns:
        A :class:`BenchmarkResult` with one sample per measured interval.
    """
    from simvx.graphics import App

    app = App(title="bench", width=width, height=height, visible=False)
    if isinstance(scene_or_cls, type):
        scene = scene_or_cls(**kwargs)
    else:
        scene = scene_or_cls

    stamps: list[float] = []

    def _stamp(_idx, _t) -> None:
        stamps.append(time.perf_counter())

    app.run_headless(scene, frames=frames + 1, on_frame=_stamp, capture_frames=[])

    # Frame time is the gap between consecutive callback timestamps.
    intervals_ms = [(later - earlier) * 1000.0 for earlier, later in zip(stamps, stamps[1:])]
    if warmup and len(intervals_ms) > warmup + 1:
        intervals_ms = intervals_ms[warmup:]

    # Prefer the scene's declared object count; fall back to common kwargs.
    candidates = (
        getattr(scene, "_bench_count", 0),
        kwargs.get("n"),
        kwargs.get("count"),
        kwargs.get("amount"),
    )
    object_count = next((candidate for candidate in candidates if candidate), 0)

    telemetry = getattr(app, "last_telemetry", {}) or {}
    phase_times = telemetry.get("gpu_phase_times") or {}
    gpu_total = float(sum(phase_times.values())) if phase_times else None
    copied = {key: float(telemetry[key]) for key in telemetry_keys if key in telemetry}

    return BenchmarkResult(
        name=type(scene).__name__,
        count=int(object_count),
        total_ms=sum(intervals_ms),
        frames=len(intervals_ms),
        samples_ms=intervals_ms,
        gpu_ms=gpu_total,
        backend=backend,
        extra=copied,
    )

# ============================================================================
# Machine metadata
# ============================================================================

@dataclass
class MachineInfo:
    """Snapshot of the host, interpreter and GPU that is stamped onto every record."""

    host: str
    os: str
    cpu: str
    cpu_count: int
    ram_gb: float
    python: str
    free_threaded: bool
    gpu: str = "unknown"
    git_commit: str = "unknown"

    @classmethod
    def capture(cls) -> MachineInfo:
        """Build a :class:`MachineInfo` describing the current process and host.

        ``gpu`` is left at its default; every other field is probed here.
        """
        # Free-threaded only when the interpreter exposes the probe and it reports no GIL.
        gil_probe = getattr(sys, "_is_gil_enabled", None)
        if gil_probe is None:
            no_gil = False
        else:
            no_gil = not gil_probe()

        try:
            total_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
            memory_gb = round(total_bytes / 1e9, 1)
        except (ValueError, AttributeError, OSError):
            # sysconf is missing, or rejects these names, on some platforms.
            memory_gb = 0.0

        version = platform.python_version()
        if no_gil:
            version += "t"

        return cls(
            host=platform.node() or "unknown-host",
            os=platform.platform(),
            cpu=platform.processor() or platform.machine() or "unknown-cpu",
            cpu_count=os.cpu_count() or 0,
            ram_gb=memory_gb,
            python=version,
            free_threaded=no_gil,
            git_commit=_git_short_commit(),
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        names = (
            "host",
            "os",
            "cpu",
            "cpu_count",
            "ram_gb",
            "python",
            "free_threaded",
            "gpu",
            "git_commit",
        )
        return {attr: getattr(self, attr) for attr in names}
def _git_stdout(*args: str) -> str:
    """Run ``git <args>`` and return its stripped stdout, or ``""`` on any failure."""
    try:
        proc = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # git missing, not runnable, or timed out: best-effort, so report nothing.
        return ""
    if proc.returncode != 0:
        return ""
    return proc.stdout.strip()


def _git_short_commit() -> str:
    """Return the abbreviated ``HEAD`` commit hash, or ``"unknown"`` when unavailable."""
    return _git_stdout("rev-parse", "--short", "HEAD") or "unknown"


def _repo_root() -> Path:
    """Return the git top-level directory, or the current directory when unavailable."""
    toplevel = _git_stdout("rev-parse", "--show-toplevel")
    if toplevel:
        return Path(toplevel)
    return Path.cwd()
def perf_dir() -> Path:
    """Resolve the local perf directory (``$SIMVX_PERF_DIR`` or ``<repo>/.perf``).

    An unset or empty ``SIMVX_PERF_DIR`` falls back to the repository default.
    """
    override = os.environ.get("SIMVX_PERF_DIR")
    if override:
        return Path(override)
    return _repo_root() / ".perf"

# ============================================================================
# History + baselines
# ============================================================================

def record_key(suite: str, name: str, backend: str, count: int, metric: str) -> str:
    """Stable identity for a measured point across runs.

    The five components are joined with ``::`` in the order
    suite, name, backend, count, metric.
    """
    components = (suite, name, backend, count, metric)
    return "::".join(format(component) for component in components)
[docs] @dataclass class PerfRecord: """A flattened, JSON-serialisable benchmark record (one JSONL line).""" ts: str suite: str name: str backend: str count: int metric: str value: float lower_is_better: bool avg_frame_ms: float fps: float p95_ms: float p99_ms: float gpu_ms: float | None extra: dict[str, float] machine: dict[str, Any]
[docs] @property def key(self) -> str: return record_key(self.suite, self.name, self.backend, self.count, self.metric)
[docs] @classmethod def from_result(cls, suite: str, result: BenchmarkResult, machine: MachineInfo) -> PerfRecord: return cls( ts=datetime.now(UTC).isoformat(timespec="seconds"), suite=suite, name=result.name, backend=result.backend, count=result.count, metric=result.metric, value=result.metric_value(), lower_is_better=result.lower_is_better, avg_frame_ms=result.avg_frame_ms, fps=result.fps, p95_ms=result.p95_ms, p99_ms=result.p99_ms, gpu_ms=result.gpu_ms, extra=dict(result.extra), machine=machine.to_dict(), )
[docs] def to_dict(self) -> dict[str, Any]: return { "ts": self.ts, "suite": self.suite, "name": self.name, "backend": self.backend, "count": self.count, "metric": self.metric, "value": self.value, "lower_is_better": self.lower_is_better, "avg_frame_ms": self.avg_frame_ms, "fps": self.fps, "p95_ms": self.p95_ms, "p99_ms": self.p99_ms, "gpu_ms": self.gpu_ms, "extra": self.extra, "machine": self.machine, }
class HistoryStore:
    """Append-only per-machine benchmark history with pinned baselines.

    Layout under :func:`perf_dir`::

        history/<host>/<suite>.jsonl   one appended line per measured point
        baseline/<host>.json           {record_key: record_dict} pinned baseline
    """

    def __init__(self, root: Path | None = None, machine: MachineInfo | None = None) -> None:
        """Create a store rooted at ``root`` (default :func:`perf_dir`) for ``machine``.

        ``machine`` defaults to a fresh :meth:`MachineInfo.capture`; only its
        ``host`` is used to build paths.
        """
        self.root = root or perf_dir()
        self.machine = machine or MachineInfo.capture()

    def _history_path(self, suite: str) -> Path:
        """Path of the JSONL history file for ``suite`` on this host."""
        return self.root / "history" / self.machine.host / f"{suite}.jsonl"

    def _baseline_path(self) -> Path:
        """Path of this host's pinned-baseline JSON file."""
        return self.root / "baseline" / f"{self.machine.host}.json"

    def append(self, record: PerfRecord) -> None:
        """Append ``record`` as one JSON line to its suite's history file."""
        path = self._history_path(record.suite)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(record.to_dict()) + "\n")

    def baselines(self) -> dict[str, dict[str, Any]]:
        """Return the pinned baselines for this host, keyed by record key.

        A missing, unreadable, undecodable, or malformed baseline file is
        treated as "no baselines" and yields an empty dict.
        """
        path = self._baseline_path()
        try:
            loaded = json.loads(path.read_text(encoding="utf-8"))
        except (ValueError, OSError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError;
            # OSError covers a missing or unreadable file.
            return {}
        # Valid JSON that is not an object (e.g. ``[]`` or ``null``) would break
        # every ``.get`` / ``.update`` caller, so treat it as corrupt too.
        return loaded if isinstance(loaded, dict) else {}

    def baseline(self, key: str) -> dict[str, Any] | None:
        """Return the pinned baseline record for ``key``, or ``None`` if there is none."""
        return self.baselines().get(key)

    def update_baselines(self, records: dict[str, dict[str, Any]]) -> None:
        """Merge ``records`` (key -> record dict) into the pinned baseline file."""
        current = self.baselines()
        current.update(records)
        path = self._baseline_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(current, indent=2, sort_keys=True)
        # Write to a sibling file and swap it in, so an interrupted write cannot
        # leave a truncated baseline that would later be read as "no baseline".
        staging = path.with_name(path.name + ".tmp")
        staging.write_text(payload, encoding="utf-8")
        staging.replace(path)

# ============================================================================
# Comparison + recorder
# ============================================================================

def compare(current: float, baseline: float, lower_is_better: bool, tol: float = DEFAULT_TOLERANCE) -> str:
    """Classify ``current`` against ``baseline`` -> ok / regressed / improved / new.

    Args:
        current: The freshly measured value.
        baseline: The pinned value; a non-positive baseline yields ``"new"``.
        lower_is_better: Whether a smaller value is the better direction.
        tol: Fractional band around the baseline within which the result is ``"ok"``.
    """
    if baseline <= 0:
        return "new"

    ratio = current / baseline
    above_band = ratio > 1 + tol
    below_band = ratio < 1 - tol

    # Which side of the band is "worse" depends on the metric's direction.
    if lower_is_better:
        worse, better = above_band, below_band
    else:
        worse, better = below_band, above_band

    if worse:
        return "regressed"
    if better:
        return "improved"
    return "ok"
class PerfRecorder:
    """Records benchmark results, compares to the machine baseline, gates on drift.

    A perf test takes the ``perf_recorder`` fixture and calls :meth:`record`
    once per measured point. On fixture teardown :meth:`finish` runs: it pins
    the baseline when ``--update-perf-baseline`` was given, and otherwise raises
    if any metric regressed beyond tolerance (unless ``--perf-report-only``).
    The first run on a machine has no baseline, so points are seeded and pass.
    """

    def __init__(
        self,
        suite: str,
        *,
        store: HistoryStore | None = None,
        tol: float = DEFAULT_TOLERANCE,
        update_baseline: bool = False,
        report_only: bool = False,
        emit: Callable[[str], None] = print,
    ) -> None:
        """Create a recorder for ``suite``.

        Args:
            suite: Suite name stamped onto every record.
            store: History/baseline store; a default :class:`HistoryStore` when omitted.
            tol: Default regression tolerance (fraction of the baseline).
            update_baseline: Pin this run's results as the baseline instead of comparing.
            report_only: Report regressions but never raise from :meth:`finish`.
            emit: Sink for the human-readable report lines.
        """
        self.suite = suite
        self.store = store or HistoryStore()
        self.tol = tol
        self.update_baseline = update_baseline
        self.report_only = report_only
        self._emit = emit
        self._pending_baseline: dict[str, dict[str, Any]] = {}
        self.regressions: list[tuple[str, float, float]] = []

    def record(self, result: BenchmarkResult, *, tol: float | None = None) -> str:
        """Record ``result``, compare it to its baseline, and return the verdict.

        Args:
            result: The measured benchmark point.
            tol: Per-call tolerance override; ``None`` uses the recorder default.
                An explicit ``0.0`` is honoured as a zero-width tolerance band.

        Returns:
            ``"seeded"`` when pinning baselines, ``"new"`` when no baseline
            exists, otherwise the :func:`compare` verdict
            (``"ok"`` / ``"regressed"`` / ``"improved"`` / ``"new"``).
        """
        rec = PerfRecord.from_result(self.suite, result, self.store.machine)
        self.store.append(rec)
        self._pending_baseline[rec.key] = rec.to_dict()

        # When re-pinning, the existing baseline is deliberately ignored.
        base = None if self.update_baseline else self.store.baseline(rec.key)
        if base is None:
            verdict = "seeded" if self.update_baseline else "new"
        else:
            # Only fall back on None: ``tol or self.tol`` would discard a
            # caller's explicit 0.0 because 0.0 is falsy.
            effective_tol = self.tol if tol is None else tol
            verdict = compare(rec.value, base["value"], rec.lower_is_better, effective_tol)

        self._emit(f"{result.report()}\n baseline: {verdict}" + _delta_str(rec, base))
        if verdict == "regressed":
            self.regressions.append((rec.key, rec.value, base["value"] if base else 0.0))
        return verdict

    def finish(self) -> None:
        """Pin baselines or gate on regressions; intended to run at fixture teardown.

        Raises:
            AssertionError: If any recorded point regressed and the recorder is
                not in report-only mode.
        """
        if self.update_baseline and self._pending_baseline:
            self.store.update_baselines(self._pending_baseline)
            self._emit(f"perf: pinned {len(self._pending_baseline)} baseline point(s) for {self.store.machine.host}")
            return
        if self.regressions and not self.report_only:
            detail = ", ".join(
                f"{key} {cur:,.2f} vs baseline {base:,.2f} ({cur / base:.2f}×)"
                for key, cur, base in self.regressions
                if base  # guards the division against a zero baseline
            )
            raise AssertionError(f"performance regressed beyond {self.tol:.0%}: {detail}")
def _delta_str(rec: PerfRecord, base: dict[str, Any] | None) -> str:
    """Format ``rec.value`` as a multiple of the baseline value.

    Returns an empty string when there is no baseline or its value is not positive.
    """
    reference = base.get("value", 0) if base else 0
    if reference <= 0:
        return ""
    multiple = rec.value / reference
    return f" ({multiple:.2f}× baseline)"
def make_perf_recorder(
    suite: str,
    *,
    update_baseline: bool = False,
    report_only: bool = False,
    tol: float = DEFAULT_TOLERANCE,
) -> PerfRecorder:
    """Construct a :class:`PerfRecorder` (used by the ``perf_recorder`` fixture).

    All options are forwarded unchanged; the recorder creates its own default store.
    """
    options = {
        "update_baseline": update_baseline,
        "report_only": report_only,
        "tol": tol,
    }
    return PerfRecorder(suite, **options)
def perf_pytest_addoption(parser) -> None:
    """Register the shared perf CLI options on a pytest parser.

    Conftests call this so every package exposes the same flags. Guarded so a
    second call (e.g. multiple conftests under one root) does not error.
    """
    group = parser.getgroup("perf", "performance regression benchmarks")

    # (option name, keyword arguments) in registration order.
    option_specs = (
        (
            "--update-perf-baseline",
            {"action": "store_true", "default": False, "help": "Pin this run's perf results as the machine baseline"},
        ),
        (
            "--perf-report-only",
            {"action": "store_true", "default": False, "help": "Record + report perf drift but never fail the suite"},
        ),
        (
            "--perf-tolerance",
            {
                "type": float,
                "default": DEFAULT_TOLERANCE,
                "help": f"Regression tolerance fraction (default {DEFAULT_TOLERANCE})",
            },
        ),
    )
    for option_name, option_kwargs in option_specs:
        try:
            group.addoption(option_name, **option_kwargs)
        except ValueError:
            # already registered by another conftest
            continue