"""Performance-benchmark harness for SimVX's opt-in regression suite.
This module is the single home for the benchmark plumbing shared by every
``@pytest.mark.perf`` suite (across packages) and by ``tools/run_benchmarks.py``:
* :class:`BenchmarkResult` -- one measured data point (frame timing + metadata).
* :func:`bench_scene_runner` / :func:`bench_headless_render` -- run a scene and
measure it (CPU-only via :class:`~simvx.core.testing.SceneRunner`, or a full
headless Vulkan render; the latter lazy-imports ``simvx.graphics``).
* :class:`MachineInfo` -- host/CPU/GPU/interpreter snapshot stamped onto records.
* :class:`HistoryStore` -- append-only per-machine history + pinned baselines.
* :class:`PerfRecorder` -- records results, compares against the machine's own
baseline, and (outside report-only mode) fails when a metric regresses.
These utilities measure **speed only**: correctness lives in the normal suites.
Results are stored locally under the perf directory (:func:`perf_dir`) and are
never committed -- absolute numbers are machine-specific, so regressions are
judged against this machine's own recorded baseline, not a hardcoded threshold.
"""
from __future__ import annotations
import json
import os
import platform
import subprocess
import sys
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
from .diagnostics import FrameTimer, NodeCounter
from .scene_runner import SceneRunner
# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    "BenchmarkResult",
    "HistoryStore",
    "MachineInfo",
    "PerfRecord",
    "PerfRecorder",
    "bench_headless_render",
    "bench_scene_runner",
    "compare",
    "make_perf_recorder",
    "perf_dir",
    "perf_pytest_addoption",
    "record_key",
]
# Default regression tolerance: a metric must move more than this fraction
# against the baseline to count as a regression/improvement (else "ok").
# Used as the default ``tol`` of compare(), PerfRecorder and make_perf_recorder,
# and as the default of the ``--perf-tolerance`` pytest option.
DEFAULT_TOLERANCE = 0.20
# ============================================================================
# Result record
# ============================================================================
@dataclass
class BenchmarkResult:
    """A single benchmark measurement.

    :meth:`metric_value` is the headline number compared against the baseline.
    Unless ``value`` is set it is the mean frame time in milliseconds
    (``metric="frame_ms"``, lower is better). Limit-style benchmarks (for
    example "max sprites at 60 FPS") set ``metric`` and ``value`` and pass
    ``lower_is_better=False`` so that a higher number counts as better.
    """

    name: str
    count: int = 0
    total_ms: float = 0.0
    frames: int = 0
    # Wall-clock duration of each measured frame (ms); feeds the percentiles.
    samples_ms: list[float] = field(default_factory=list)
    # Mean GPU time per frame (ms) from app telemetry, when rendering.
    gpu_ms: float | None = None
    backend: str = "default"
    metric: str = "frame_ms"
    # Explicit headline value for metrics that are not a frame time.
    value: float | None = None
    lower_is_better: bool = True
    extra: dict[str, float] = field(default_factory=dict)

    @property
    def avg_frame_ms(self) -> float:
        """Mean frame time in ms, or 0.0 when no frames were measured."""
        if not self.frames:
            return 0.0
        return self.total_ms / self.frames

    @property
    def fps(self) -> float:
        """Frames per second derived from the mean frame time (0.0 if unknown)."""
        frame_ms = self.avg_frame_ms
        if frame_ms > 0:
            return 1000.0 / frame_ms
        return 0.0

    @property
    def per_object_us(self) -> float:
        """Microseconds per object per frame (0.0 when ``count`` is zero)."""
        if not self.count:
            return 0.0
        return self.avg_frame_ms * 1000.0 / self.count

    def _percentile(self, q: float) -> float:
        """Return the sample at sorted index ``int(len * q)``, clamped to the last one."""
        if not self.samples_ms:
            return 0.0
        ranked = sorted(self.samples_ms)
        last = len(ranked) - 1
        position = int(len(ranked) * q)
        return ranked[position if position < last else last]

    @property
    def p50_ms(self) -> float:
        """Median-ish frame time (ms) from the recorded samples."""
        return self._percentile(0.50)

    @property
    def p95_ms(self) -> float:
        """95th-percentile frame time (ms) from the recorded samples."""
        return self._percentile(0.95)

    @property
    def p99_ms(self) -> float:
        """99th-percentile frame time (ms) from the recorded samples."""
        return self._percentile(0.99)

    @property
    def min_ms(self) -> float:
        """Fastest recorded frame (ms), or 0.0 without samples."""
        if self.samples_ms:
            return min(self.samples_ms)
        return 0.0

    @property
    def max_ms(self) -> float:
        """Slowest recorded frame (ms), or 0.0 without samples."""
        if self.samples_ms:
            return max(self.samples_ms)
        return 0.0

    def metric_value(self) -> float:
        """Return the headline value: ``value`` when set, else the mean frame time."""
        if self.value is None:
            return self.avg_frame_ms
        return self.value

    def report(self) -> str:
        """Render a multi-line, human-readable summary of this result."""
        backend_tag = "" if self.backend == "default" else f" [{self.backend}]"
        out = [f" {self.name}{backend_tag} (n={self.count:,}):"]
        if self.metric != "frame_ms":
            # Non-frame-time metrics only print their headline value.
            out.append(f" {self.metric}: {self.metric_value():,.2f}")
        else:
            out.append(f" avg frame: {self.avg_frame_ms:.2f} ms ({self.fps:.0f} FPS)")
            if self.samples_ms:
                out.append(f" p50/p95/p99: {self.p50_ms:.2f} / {self.p95_ms:.2f} / {self.p99_ms:.2f} ms")
            if self.count:
                out.append(f" per object: {self.per_object_us:.2f} µs")
        if self.gpu_ms is not None:
            out.append(f" gpu: {self.gpu_ms:.2f} ms/frame")
        out.extend(f" {label}: {amount:,.2f}" for label, amount in self.extra.items())
        return "\n".join(out)
# ============================================================================
# Measurement helpers
# ============================================================================
def bench_scene_runner(root, frames: int = 120, draw: bool = False, *, backend: str = "default") -> BenchmarkResult:
    """Time ``frames`` frames of ``root`` under :class:`SceneRunner` (CPU only, no GPU).

    Args:
        root: Scene root handed to ``SceneRunner.load``.
        frames: Number of timed frames.
        draw: Forwarded to ``SceneRunner.advance_frames`` for every frame.
        backend: Label stored on the returned result.

    Returns:
        A :class:`BenchmarkResult` named after ``type(root)`` holding one
        sample per timed frame. ``count`` is ``root._bench_count`` when that
        attribute is truthy, otherwise ``NodeCounter.total`` of the loaded root.
    """
    runner = SceneRunner(screen_size=(1280, 720))
    runner.load(root)
    # Two untimed frames so ready() settles before sampling starts.
    runner.advance_frames(2, draw=draw)

    frame_timer = FrameTimer()

    def timed_frame_ms() -> float:
        frame_timer.begin_frame()
        runner.advance_frames(1, draw=draw)
        frame_timer.end_frame()
        # Latest FrameTimer entry scaled by 1000 (presumably seconds -> ms;
        # confirm against FrameTimer).
        return frame_timer._times[-1] * 1000.0

    per_frame_ms: list[float] = [timed_frame_ms() for _ in range(frames)]

    declared = getattr(root, "_bench_count", 0)
    object_count = declared if declared else NodeCounter.total(runner.root)
    return BenchmarkResult(
        name=type(root).__name__,
        count=object_count,
        total_ms=sum(per_frame_ms),
        frames=frames,
        samples_ms=per_frame_ms,
        backend=backend,
    )
def bench_headless_render(
    scene_or_cls,
    frames: int = 60,
    width: int = 1280,
    height: int = 720,
    *,
    warmup: int = 0,
    telemetry_keys: tuple[str, ...] = (),
    backend: str = "default",
    **kwargs,
) -> BenchmarkResult:
    """Benchmark a scene with full Vulkan rendering (headless).

    ``simvx.graphics`` is imported lazily so the core package stays
    render-agnostic.

    Args:
        scene_or_cls: A scene instance, or a class that is instantiated with
            ``**kwargs``.
        frames: Number of frame intervals to measure (``frames + 1`` frames run).
        width: Render width passed to ``App``.
        height: Render height passed to ``App``.
        warmup: Leading samples to discard; only applied when more than
            ``warmup + 1`` samples were collected.
        telemetry_keys: Entries of ``app.last_telemetry`` copied (as floats)
            into ``result.extra`` (e.g. ``"occlusion_drawn"``,
            ``"transform_high_water"``).
        backend: Label stored on the returned result.
        **kwargs: Constructor arguments for ``scene_or_cls`` when it is a class;
            ``n`` / ``count`` / ``amount`` also serve as the object-count fallback.

    Returns:
        A :class:`BenchmarkResult` whose samples are the intervals between
        consecutive ``on_frame`` callbacks, in milliseconds.
    """
    from simvx.graphics import App

    app = App(title="bench", width=width, height=height, visible=False)
    if isinstance(scene_or_cls, type):
        scene = scene_or_cls(**kwargs)
    else:
        scene = scene_or_cls

    stamps: list[float] = []

    def mark_frame(_idx, _t) -> None:
        stamps.append(time.perf_counter())

    # One extra frame: N+1 timestamps give N intervals.
    app.run_headless(scene, frames=frames + 1, on_frame=mark_frame, capture_frames=[])

    intervals_ms = [(later - earlier) * 1000.0 for earlier, later in zip(stamps, stamps[1:])]
    if warmup and len(intervals_ms) > warmup + 1:
        intervals_ms = intervals_ms[warmup:]

    # Prefer the scene's declared object count; fall back to common kwargs.
    object_count = getattr(scene, "_bench_count", 0)
    if not object_count:
        object_count = next((kwargs[key] for key in ("n", "count", "amount") if kwargs.get(key)), 0)

    telemetry = getattr(app, "last_telemetry", {}) or {}
    phase_times = telemetry.get("gpu_phase_times") or {}
    if phase_times:
        gpu_total_ms = float(sum(phase_times.values()))
    else:
        gpu_total_ms = None
    picked = {key: float(telemetry[key]) for key in telemetry_keys if key in telemetry}

    return BenchmarkResult(
        name=type(scene).__name__,
        count=int(object_count),
        total_ms=sum(intervals_ms),
        frames=len(intervals_ms),
        samples_ms=intervals_ms,
        gpu_ms=gpu_total_ms,
        backend=backend,
        extra=picked,
    )
# ============================================================================
# Machine metadata
# ============================================================================
@dataclass
class MachineInfo:
    """Host + interpreter + GPU snapshot stamped onto every record."""

    host: str
    os: str
    cpu: str
    cpu_count: int
    ram_gb: float
    python: str
    free_threaded: bool
    # capture() does not probe these two directly: ``gpu`` keeps its default
    # and ``git_commit`` comes from ``git rev-parse``.
    gpu: str = "unknown"
    git_commit: str = "unknown"

    @classmethod
    def capture(cls) -> MachineInfo:
        """Snapshot the current machine and interpreter.

        Returns:
            A :class:`MachineInfo` built from ``platform``/``os``/``sys``.
            Values that cannot be determined fall back to placeholders
            (``"unknown-host"``, ``"unknown-cpu"``, ``0`` CPUs, ``0.0`` GB).
            The ``python`` string gets a ``"t"`` suffix on a free-threaded build.
        """
        # sys._is_gil_enabled only exists on interpreters that can disable the GIL.
        gil_probe = getattr(sys, "_is_gil_enabled", None)
        free_threaded = bool(gil_probe is not None and not gil_probe())
        return cls(
            host=platform.node() or "unknown-host",
            os=platform.platform(),
            cpu=platform.processor() or platform.machine() or "unknown-cpu",
            cpu_count=os.cpu_count() or 0,
            ram_gb=cls._detect_ram_gb(),
            python=platform.python_version() + ("t" if free_threaded else ""),
            free_threaded=free_threaded,
            git_commit=_git_short_commit(),
        )

    @staticmethod
    def _detect_ram_gb() -> float:
        """Return physical RAM in GB (1 decimal), or 0.0 when it cannot be read."""
        try:
            page_size = os.sysconf("SC_PAGE_SIZE")
            page_count = os.sysconf("SC_PHYS_PAGES")
        except (ValueError, AttributeError, OSError):
            # AttributeError: platforms without os.sysconf; ValueError: unknown name.
            return 0.0
        # sysconf returns -1 for a value that is not defined on this system.
        if page_size <= 0 or page_count <= 0:
            return 0.0
        return round(page_size * page_count / 1e9, 1)

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a plain, JSON-serialisable dict."""
        return {
            "host": self.host,
            "os": self.os,
            "cpu": self.cpu,
            "cpu_count": self.cpu_count,
            "ram_gb": self.ram_gb,
            "python": self.python,
            "free_threaded": self.free_threaded,
            "gpu": self.gpu,
            "git_commit": self.git_commit,
        }
def _git_stdout(*args: str) -> str | None:
    """Run ``git <args>`` in the current directory and return its stripped stdout.

    Returns ``None`` when git cannot be started, times out (5 s), or exits
    with a non-zero status; callers treat that as "not available".
    """
    try:
        proc = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: git missing / not executable; SubprocessError covers the timeout.
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.strip()


def _git_short_commit() -> str:
    """Return the abbreviated HEAD commit hash, or ``"unknown"`` if unavailable."""
    return _git_stdout("rev-parse", "--short", "HEAD") or "unknown"


def _repo_root() -> Path:
    """Return the git work-tree root, falling back to the current directory."""
    top_level = _git_stdout("rev-parse", "--show-toplevel")
    return Path(top_level) if top_level else Path.cwd()
def perf_dir() -> Path:
    """Return the local perf directory.

    ``$SIMVX_PERF_DIR`` wins when it is set to a non-empty value; otherwise
    the directory is ``<repo>/.perf``.
    """
    override = os.environ.get("SIMVX_PERF_DIR")
    if override:
        return Path(override)
    return _repo_root() / ".perf"
# ============================================================================
# History + baselines
# ============================================================================
def record_key(suite: str, name: str, backend: str, count: int, metric: str) -> str:
    """Build the ``::``-joined identity of a measured point, stable across runs."""
    parts = (suite, name, backend, count, metric)
    return "::".join(f"{part}" for part in parts)
[docs]
@dataclass
class PerfRecord:
"""A flattened, JSON-serialisable benchmark record (one JSONL line)."""
ts: str
suite: str
name: str
backend: str
count: int
metric: str
value: float
lower_is_better: bool
avg_frame_ms: float
fps: float
p95_ms: float
p99_ms: float
gpu_ms: float | None
extra: dict[str, float]
machine: dict[str, Any]
[docs]
@property
def key(self) -> str:
return record_key(self.suite, self.name, self.backend, self.count, self.metric)
[docs]
@classmethod
def from_result(cls, suite: str, result: BenchmarkResult, machine: MachineInfo) -> PerfRecord:
return cls(
ts=datetime.now(UTC).isoformat(timespec="seconds"),
suite=suite,
name=result.name,
backend=result.backend,
count=result.count,
metric=result.metric,
value=result.metric_value(),
lower_is_better=result.lower_is_better,
avg_frame_ms=result.avg_frame_ms,
fps=result.fps,
p95_ms=result.p95_ms,
p99_ms=result.p99_ms,
gpu_ms=result.gpu_ms,
extra=dict(result.extra),
machine=machine.to_dict(),
)
[docs]
def to_dict(self) -> dict[str, Any]:
return {
"ts": self.ts,
"suite": self.suite,
"name": self.name,
"backend": self.backend,
"count": self.count,
"metric": self.metric,
"value": self.value,
"lower_is_better": self.lower_is_better,
"avg_frame_ms": self.avg_frame_ms,
"fps": self.fps,
"p95_ms": self.p95_ms,
"p99_ms": self.p99_ms,
"gpu_ms": self.gpu_ms,
"extra": self.extra,
"machine": self.machine,
}
class HistoryStore:
    """Append-only per-machine benchmark history with pinned baselines.

    Layout under :func:`perf_dir`::

        history/<host>/<suite>.jsonl   one appended line per measured point
        baseline/<host>.json           {record_key: record_dict} pinned baseline
    """

    def __init__(self, root: Path | None = None, machine: MachineInfo | None = None) -> None:
        """Create a store rooted at ``root`` for ``machine``.

        Args:
            root: Storage directory; defaults to :func:`perf_dir`.
            machine: Machine whose ``host`` names the files; defaults to
                :meth:`MachineInfo.capture`.
        """
        self.root = root or perf_dir()
        self.machine = machine or MachineInfo.capture()

    def _history_path(self, suite: str) -> Path:
        """Path of the JSONL history file for ``suite`` on this host."""
        return self.root / "history" / self.machine.host / f"{suite}.jsonl"

    def _baseline_path(self) -> Path:
        """Path of this host's pinned-baseline JSON file."""
        return self.root / "baseline" / f"{self.machine.host}.json"

    def append(self, record: PerfRecord) -> None:
        """Append ``record`` as one JSON line to its suite's history file."""
        path = self._history_path(record.suite)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(record.to_dict()) + "\n")

    def baselines(self) -> dict[str, dict[str, Any]]:
        """Load every pinned baseline for this host.

        Returns:
            The ``{record_key: record_dict}`` mapping, or ``{}`` when the file
            is missing, unreadable, not valid JSON/UTF-8, or not a JSON object.
        """
        path = self._baseline_path()
        try:
            loaded = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError covers a missing/unreadable file; ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            return {}
        # Valid JSON of the wrong shape (list, number, ...) would break the
        # .get()/.update() calls made by callers, so treat it as "no baseline".
        return loaded if isinstance(loaded, dict) else {}

    def baseline(self, key: str) -> dict[str, Any] | None:
        """Return the pinned record for ``key``, or ``None`` if there is none."""
        return self.baselines().get(key)

    def update_baselines(self, records: dict[str, dict[str, Any]]) -> None:
        """Merge ``records`` (key -> record dict) into the pinned baseline file.

        The merged mapping is written to a sibling temporary file and then
        moved over the baseline, so an interrupted write cannot leave a
        truncated baseline behind.
        """
        current = self.baselines()
        current.update(records)
        path = self._baseline_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(current, indent=2, sort_keys=True)
        scratch = path.with_name(path.name + ".tmp")
        scratch.write_text(payload, encoding="utf-8")
        scratch.replace(path)
# ============================================================================
# Comparison + recorder
# ============================================================================
def compare(current: float, baseline: float, lower_is_better: bool, tol: float = DEFAULT_TOLERANCE) -> str:
    """Classify ``current`` against ``baseline``.

    Args:
        current: The freshly measured value.
        baseline: The pinned value; anything ``<= 0`` means "no usable baseline".
        lower_is_better: Direction of the metric.
        tol: Fraction the ratio ``current / baseline`` must move beyond 1 to
            leave the ``"ok"`` band.

    Returns:
        ``"new"``, ``"regressed"``, ``"improved"`` or ``"ok"``.
    """
    if baseline <= 0:
        return "new"
    ratio = current / baseline
    grew = ratio > 1 + tol
    shrank = ratio < 1 - tol
    # Growth is bad for lower-is-better metrics and good for the others.
    got_worse, got_better = (grew, shrank) if lower_is_better else (shrank, grew)
    if got_worse:
        return "regressed"
    if got_better:
        return "improved"
    return "ok"
class PerfRecorder:
    """Records benchmark results, compares to the machine baseline, gates on drift.

    A perf test takes the ``perf_recorder`` fixture and calls :meth:`record`
    once per measured point. On fixture teardown :meth:`finish` runs: it pins
    the baseline when ``--update-perf-baseline`` was given, and otherwise
    raises if any metric regressed beyond tolerance (unless
    ``--perf-report-only``). A point without a pinned baseline is reported as
    ``"new"`` and passes; baselines are only written in update mode.
    """

    def __init__(
        self,
        suite: str,
        *,
        store: HistoryStore | None = None,
        tol: float = DEFAULT_TOLERANCE,
        update_baseline: bool = False,
        report_only: bool = False,
        emit: Callable[[str], None] = print,
    ) -> None:
        """Create a recorder for ``suite``.

        Args:
            suite: Suite name stamped onto every record.
            store: History/baseline storage; defaults to a fresh :class:`HistoryStore`.
            tol: Default regression tolerance for :meth:`record`.
            update_baseline: Pin this run's results instead of comparing.
            report_only: Report regressions but never raise from :meth:`finish`.
            emit: Sink for the human-readable report lines.
        """
        self.suite = suite
        self.store = store or HistoryStore()
        self.tol = tol
        self.update_baseline = update_baseline
        self.report_only = report_only
        self._emit = emit
        # Every record of this run, keyed by record key; pinned by finish() in update mode.
        self._pending_baseline: dict[str, dict[str, Any]] = {}
        # (record key, current value, baseline value) for each regressed point.
        self.regressions: list[tuple[str, float, float]] = []

    def record(self, result: BenchmarkResult, *, tol: float | None = None) -> str:
        """Store ``result`` in the history and classify it against the baseline.

        Args:
            result: The measured point.
            tol: Per-call tolerance; ``None`` uses the recorder's default. An
                explicit ``0.0`` is honoured.

        Returns:
            ``"seeded"`` in update mode, ``"new"`` without a usable baseline,
            otherwise the verdict of :func:`compare`.
        """
        rec = PerfRecord.from_result(self.suite, result, self.store.machine)
        self.store.append(rec)
        self._pending_baseline[rec.key] = rec.to_dict()
        base = None if self.update_baseline else self.store.baseline(rec.key)
        base_value = 0.0
        if base is None:
            verdict = "seeded" if self.update_baseline else "new"
        else:
            # A baseline entry without "value" is treated as unusable (-> "new"),
            # matching how _delta_str reads it.
            base_value = base.get("value", 0)
            effective_tol = self.tol if tol is None else tol
            verdict = compare(rec.value, base_value, rec.lower_is_better, effective_tol)
        self._emit(f"{result.report()}\n baseline: {verdict}" + _delta_str(rec, base))
        if verdict == "regressed":
            self.regressions.append((rec.key, rec.value, base_value))
        return verdict

    def finish(self) -> None:
        """Pin baselines (update mode) or fail on recorded regressions.

        Raises:
            AssertionError: When at least one point regressed and the recorder
                is not in report-only mode.
        """
        if self.update_baseline and self._pending_baseline:
            self.store.update_baselines(self._pending_baseline)
            self._emit(f"perf: pinned {len(self._pending_baseline)} baseline point(s) for {self.store.machine.host}")
            return
        if self.regressions and not self.report_only:
            detail = ", ".join(
                f"{key} {cur:,.2f} vs baseline {base:,.2f} ({cur / base:.2f}×)"
                for key, cur, base in self.regressions
                if base
            )
            raise AssertionError(f"performance regressed beyond {self.tol:.0%}: {detail}")
def _delta_str(rec: PerfRecord, base: dict[str, Any] | None) -> str:
    """Format ``rec.value`` as a multiple of the baseline value.

    Returns an empty string when there is no baseline or its ``"value"`` is
    missing or not positive.
    """
    if not base:
        return ""
    reference = base.get("value", 0)
    if reference <= 0:
        return ""
    return f" ({rec.value / reference:.2f}× baseline)"
def make_perf_recorder(
    suite: str,
    *,
    update_baseline: bool = False,
    report_only: bool = False,
    tol: float = DEFAULT_TOLERANCE,
) -> PerfRecorder:
    """Build the :class:`PerfRecorder` handed out by the ``perf_recorder`` fixture."""
    options = {
        "update_baseline": update_baseline,
        "report_only": report_only,
        "tol": tol,
    }
    return PerfRecorder(suite, **options)
def perf_pytest_addoption(parser) -> None:
    """Register the shared perf CLI options on a pytest parser.

    Conftests call this so that every package exposes the same flags. Each
    registration is guarded individually, so a repeated call (e.g. several
    conftests under one root) does not error.
    """
    group = parser.getgroup("perf", "performance regression benchmarks")
    option_specs = [
        (
            "--update-perf-baseline",
            {"action": "store_true", "default": False, "help": "Pin this run's perf results as the machine baseline"},
        ),
        (
            "--perf-report-only",
            {"action": "store_true", "default": False, "help": "Record + report perf drift but never fail the suite"},
        ),
        (
            "--perf-tolerance",
            {
                "type": float,
                "default": DEFAULT_TOLERANCE,
                "help": f"Regression tolerance fraction (default {DEFAULT_TOLERANCE})",
            },
        ),
    ]
    for flag, options in option_specs:
        try:
            group.addoption(flag, **options)
        except ValueError:
            continue  # already registered by another conftest