Source code for simvx.core.testing.benchmark

"""Performance-benchmark harness for SimVX's opt-in regression suite.

This module is the single home for the benchmark plumbing shared by every
``@pytest.mark.perf`` suite (across packages) and by ``tools/run_benchmarks.py``:

* :class:`BenchmarkResult` -- one measured data point (frame timing + metadata).
* :func:`bench_scene_runner` / :func:`bench_headless_render` -- run a scene and
  measure it (CPU-only via :class:`~simvx.core.testing.SceneRunner`, or a full
  headless Vulkan render; the latter lazy-imports ``simvx.graphics``).
* :class:`MachineInfo` -- host/CPU/GPU/interpreter snapshot stamped onto records.
* :class:`HistoryStore` -- append-only per-machine history + pinned baselines.
* :class:`PerfRecorder` -- records results, compares against the machine's own
  baseline, and (outside report-only mode) fails when a metric regresses.

These utilities measure **speed only**: correctness lives in the normal suites.
Results are stored locally under the perf directory (:func:`perf_dir`) and are
never committed -- absolute numbers are machine-specific, so regressions are
judged against this machine's own recorded baseline, not a hardcoded threshold.
"""

from __future__ import annotations

import json
import os
import platform
import subprocess
import sys
import time
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from .diagnostics import FrameTimer, NodeCounter
from .scene_runner import SceneRunner

# Public names exported by ``from ... import *``. Kept sorted (classes first,
# then functions). ``DEFAULT_TOLERANCE`` below is intentionally not listed.
__all__ = [
    "BenchmarkResult",
    "HistoryStore",
    "MachineInfo",
    "PerfRecord",
    "PerfRecorder",
    "bench_headless_render",
    "bench_scene_runner",
    "compare",
    "make_perf_recorder",
    "perf_dir",
    "perf_pytest_addoption",
    "record_key",
]

# Default regression tolerance: a metric must move more than this fraction
# against the baseline to count as a regression/improvement (else "ok").
# 0.20 means the current/baseline ratio must leave the band [0.80, 1.20].
DEFAULT_TOLERANCE = 0.20


# ============================================================================
# Result record
# ============================================================================



[docs]
@dataclass
class BenchmarkResult:
    """A single benchmark measurement.

    :meth:`metric_value` supplies the headline number that baseline comparison
    uses. With the defaults (``metric="frame_ms"``, ``value=None``) that is the
    mean frame time in milliseconds, where lower is better. A limit-style
    benchmark (e.g. "max sprites at 60 FPS") sets ``metric``, ``value`` and
    ``lower_is_better=False`` so that a higher number counts as better.
    """

    name: str
    count: int = 0
    total_ms: float = 0.0
    frames: int = 0
    # Wall-clock duration of each measured frame (ms); feeds the percentiles.
    samples_ms: list[float] = field(default_factory=list)
    # GPU time per frame (ms) taken from app telemetry; None when not rendering.
    gpu_ms: float | None = None
    backend: str = "default"
    metric: str = "frame_ms"
    # Headline value for metrics that are not a frame time (e.g. an object count).
    value: float | None = None
    lower_is_better: bool = True
    extra: dict[str, float] = field(default_factory=dict)

    @property
    def avg_frame_ms(self) -> float:
        """Mean frame time in ms, or 0.0 when no frames were measured."""
        if not self.frames:
            return 0.0
        return self.total_ms / self.frames

    @property
    def fps(self) -> float:
        """Frames per second implied by the mean frame time (0.0 if unknown)."""
        frame_ms = self.avg_frame_ms
        if frame_ms > 0:
            return 1000.0 / frame_ms
        return 0.0

    @property
    def per_object_us(self) -> float:
        """Microseconds per object per frame (0.0 when ``count`` is zero)."""
        if not self.count:
            return 0.0
        return self.avg_frame_ms * 1000.0 / self.count

    def _percentile(self, q: float) -> float:
        """Return the sample at fraction ``q`` of the sorted samples.

        The index is ``int(len * q)`` clamped to the last element; no
        interpolation is performed. Returns 0.0 when there are no samples.
        """
        if not self.samples_ms:
            return 0.0
        ranked = sorted(self.samples_ms)
        last = len(ranked) - 1
        return ranked[min(last, int(len(ranked) * q))]

    @property
    def p50_ms(self) -> float:
        """Median-rank frame time in ms."""
        return self._percentile(0.50)

    @property
    def p95_ms(self) -> float:
        """95th-percentile-rank frame time in ms."""
        return self._percentile(0.95)

    @property
    def p99_ms(self) -> float:
        """99th-percentile-rank frame time in ms."""
        return self._percentile(0.99)

    @property
    def min_ms(self) -> float:
        """Fastest sampled frame in ms (0.0 without samples)."""
        if not self.samples_ms:
            return 0.0
        return min(self.samples_ms)

    @property
    def max_ms(self) -> float:
        """Slowest sampled frame in ms (0.0 without samples)."""
        if not self.samples_ms:
            return 0.0
        return max(self.samples_ms)

    def metric_value(self) -> float:
        """Return the number compared against the baseline.

        An explicit ``value`` wins; otherwise the mean frame time is used.
        """
        if self.value is None:
            return self.avg_frame_ms
        return self.value

    def report(self) -> str:
        """Render a multi-line, human-readable summary of this result."""
        # The backend tag is only shown when it differs from the default.
        suffix = "" if self.backend == "default" else f" [{self.backend}]"
        out = [f"  {self.name}{suffix} (n={self.count:,}):"]
        if self.metric != "frame_ms":
            out.append(f"    {self.metric}: {self.metric_value():,.2f}")
        else:
            out.append(f"    avg frame: {self.avg_frame_ms:.2f} ms  ({self.fps:.0f} FPS)")
            if self.samples_ms:
                out.append(f"    p50/p95/p99: {self.p50_ms:.2f} / {self.p95_ms:.2f} / {self.p99_ms:.2f} ms")
            if self.count:
                out.append(f"    per object: {self.per_object_us:.2f} µs")
        if self.gpu_ms is not None:
            out.append(f"    gpu: {self.gpu_ms:.2f} ms/frame")
        out.extend(f"    {key}: {val:,.2f}" for key, val in self.extra.items())
        return "\n".join(out)




# ============================================================================
# Measurement helpers
# ============================================================================



[docs]
def bench_scene_runner(root, frames: int = 120, draw: bool = False, *, backend: str = "default") -> BenchmarkResult:
    """Time ``frames`` single-frame advances of ``root`` under :class:`SceneRunner`.

    Args:
        root: Scene root handed to ``SceneRunner.load``.
        frames: Number of timed frames.
        draw: Forwarded to ``SceneRunner.advance_frames`` for every frame.
        backend: Label stored on the result.

    Returns:
        A :class:`BenchmarkResult` named after ``type(root)``. Its object
        count is ``root._bench_count`` when that is truthy, otherwise
        ``NodeCounter.total`` of the runner's root.
    """
    scene_runner = SceneRunner(screen_size=(1280, 720))
    scene_runner.load(root)
    # Two untimed frames first, so ready() work settles before measuring.
    scene_runner.advance_frames(2, draw=draw)

    frame_timer = FrameTimer()
    frame_ms: list[float] = []
    for _ in range(frames):
        frame_timer.begin_frame()
        scene_runner.advance_frames(1, draw=draw)
        frame_timer.end_frame()
        # Latest duration recorded by the timer, scaled by 1000 into ms
        # (presumably FrameTimer stores seconds -- confirm in diagnostics).
        frame_ms.append(frame_timer._times[-1] * 1000.0)

    declared = getattr(root, "_bench_count", 0)
    object_count = declared if declared else NodeCounter.total(scene_runner.root)
    return BenchmarkResult(
        name=type(root).__name__,
        count=object_count,
        total_ms=sum(frame_ms),
        frames=frames,
        samples_ms=frame_ms,
        backend=backend,
    )




[docs]
def bench_headless_render(
    scene_or_cls,
    frames: int = 60,
    width: int = 1280,
    height: int = 720,
    *,
    warmup: int = 0,
    telemetry_keys: tuple[str, ...] = (),
    backend: str = "default",
    **kwargs,
) -> BenchmarkResult:
    """Benchmark a scene rendered headlessly through ``simvx.graphics.App``.

    ``simvx.graphics`` is imported inside the function so that importing this
    module does not pull in the renderer.

    Args:
        scene_or_cls: A scene instance, or a class instantiated with ``kwargs``.
        frames: Number of frame intervals to measure (``frames + 1`` frames
            are requested from the app so that ``frames`` deltas result).
        width: Headless surface width passed to ``App``.
        height: Headless surface height passed to ``App``.
        warmup: Leading intervals to discard. Applied only when more than
            ``warmup + 1`` intervals were collected; otherwise nothing is dropped.
        telemetry_keys: Entries of ``app.last_telemetry`` copied (as floats)
            into ``result.extra``; keys missing from the telemetry are skipped.
        backend: Label stored on the result.
        **kwargs: Constructor arguments used when ``scene_or_cls`` is a class.

    Returns:
        A :class:`BenchmarkResult` whose ``frames`` is the number of intervals
        actually kept and whose ``gpu_ms`` is the sum of the telemetry's
        ``gpu_phase_times`` values (``None`` when that mapping is absent/empty).
    """
    from simvx.graphics import App

    app = App(title="bench", width=width, height=height, visible=False)
    if isinstance(scene_or_cls, type):
        scene = scene_or_cls(**kwargs)
    else:
        scene = scene_or_cls

    stamps: list[float] = []

    def _stamp(_idx, _t) -> None:
        # One timestamp per on_frame callback; arguments are not needed.
        stamps.append(time.perf_counter())

    app.run_headless(scene, frames=frames + 1, on_frame=_stamp, capture_frames=[])

    # Consecutive timestamp differences, converted from seconds to ms.
    frame_ms = [(later - earlier) * 1000.0 for earlier, later in zip(stamps, stamps[1:])]
    if warmup and len(frame_ms) > warmup + 1:
        frame_ms = frame_ms[warmup:]

    # First truthy of: the scene's declared count, then common kwargs names.
    candidates = (
        getattr(scene, "_bench_count", 0),
        kwargs.get("n"),
        kwargs.get("count"),
        kwargs.get("amount"),
    )
    object_count = next((candidate for candidate in candidates if candidate), 0)

    telemetry = getattr(app, "last_telemetry", {}) or {}
    phase_times = telemetry.get("gpu_phase_times") or {}
    if phase_times:
        gpu_ms = float(sum(phase_times.values()))
    else:
        gpu_ms = None
    extra = {key: float(telemetry[key]) for key in telemetry_keys if key in telemetry}

    return BenchmarkResult(
        name=type(scene).__name__,
        count=int(object_count),
        total_ms=sum(frame_ms),
        frames=len(frame_ms),
        samples_ms=frame_ms,
        gpu_ms=gpu_ms,
        backend=backend,
        extra=extra,
    )



# ============================================================================
# Machine metadata
# ============================================================================



[docs]
@dataclass
class MachineInfo:
    """Host + interpreter + GPU snapshot stamped onto every record."""

    host: str
    os: str
    cpu: str
    cpu_count: int
    ram_gb: float
    python: str
    free_threaded: bool
    # Not filled in by capture(); stays "unknown" unless the caller sets it.
    gpu: str = "unknown"
    git_commit: str = "unknown"

    @classmethod
    def capture(cls) -> MachineInfo:
        """Snapshot the current host and interpreter.

        Every probe is best-effort: a value that cannot be determined falls
        back to a placeholder (``"unknown-host"``, ``"unknown-cpu"``, ``0``,
        ``0.0``) instead of raising.
        """
        # sys._is_gil_enabled only exists on newer interpreters; without it
        # the build is treated as a regular (GIL) build.
        gil_probe = getattr(sys, "_is_gil_enabled", None)
        free_threaded = bool(gil_probe is not None and not gil_probe())
        try:
            # Page size * physical page count = bytes; reported in decimal GB.
            ram_gb = round(os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") / 1e9, 1)
        except (ValueError, AttributeError, OSError):
            # os.sysconf is missing on some platforms (AttributeError) and
            # rejects unknown/unsupported names (ValueError/OSError).
            # Parenthesised so the clause also parses before Python 3.14.
            ram_gb = 0.0
        return cls(
            host=platform.node() or "unknown-host",
            os=platform.platform(),
            cpu=platform.processor() or platform.machine() or "unknown-cpu",
            cpu_count=os.cpu_count() or 0,
            ram_gb=ram_gb,
            # A trailing "t" marks a free-threaded interpreter.
            python=platform.python_version() + ("t" if free_threaded else ""),
            free_threaded=free_threaded,
            git_commit=_git_short_commit(),
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a plain, JSON-serialisable dict."""
        return {
            "host": self.host,
            "os": self.os,
            "cpu": self.cpu,
            "cpu_count": self.cpu_count,
            "ram_gb": self.ram_gb,
            "python": self.python,
            "free_threaded": self.free_threaded,
            "gpu": self.gpu,
            "git_commit": self.git_commit,
        }




def _git_short_commit() -> str:
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if out.returncode == 0:
            return out.stdout.strip() or "unknown"
    except OSError, subprocess.SubprocessError:
        pass
    return "unknown"


def _repo_root() -> Path:
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if out.returncode == 0 and out.stdout.strip():
            return Path(out.stdout.strip())
    except OSError, subprocess.SubprocessError:
        pass
    return Path.cwd()



[docs]
def perf_dir() -> Path:
    """Return the directory where perf history and baselines are stored.

    A non-empty ``$SIMVX_PERF_DIR`` takes precedence; otherwise the result is
    ``.perf`` under the repository root.
    """
    override = os.environ.get("SIMVX_PERF_DIR")
    if override:
        return Path(override)
    return _repo_root() / ".perf"



# ============================================================================
# History + baselines
# ============================================================================



[docs]
def record_key(suite: str, name: str, backend: str, count: int, metric: str) -> str:
    """Build the ``::``-separated identity of a measured point.

    The five parts appear in argument order, so the same suite, name, backend,
    count and metric always map to the same key across runs.
    """
    parts = (suite, name, backend, count, metric)
    return "::".join(map(str, parts))




[docs]
@dataclass
class PerfRecord:
    """A flattened, JSON-serialisable benchmark record (one JSONL line)."""

    ts: str
    suite: str
    name: str
    backend: str
    count: int
    metric: str
    value: float
    lower_is_better: bool
    avg_frame_ms: float
    fps: float
    p95_ms: float
    p99_ms: float
    gpu_ms: float | None
    extra: dict[str, float]
    machine: dict[str, Any]


[docs]
    @property
    def key(self) -> str:
        return record_key(self.suite, self.name, self.backend, self.count, self.metric)



[docs]
    @classmethod
    def from_result(cls, suite: str, result: BenchmarkResult, machine: MachineInfo) -> PerfRecord:
        return cls(
            ts=datetime.now(UTC).isoformat(timespec="seconds"),
            suite=suite,
            name=result.name,
            backend=result.backend,
            count=result.count,
            metric=result.metric,
            value=result.metric_value(),
            lower_is_better=result.lower_is_better,
            avg_frame_ms=result.avg_frame_ms,
            fps=result.fps,
            p95_ms=result.p95_ms,
            p99_ms=result.p99_ms,
            gpu_ms=result.gpu_ms,
            extra=dict(result.extra),
            machine=machine.to_dict(),
        )



[docs]
    def to_dict(self) -> dict[str, Any]:
        return {
            "ts": self.ts,
            "suite": self.suite,
            "name": self.name,
            "backend": self.backend,
            "count": self.count,
            "metric": self.metric,
            "value": self.value,
            "lower_is_better": self.lower_is_better,
            "avg_frame_ms": self.avg_frame_ms,
            "fps": self.fps,
            "p95_ms": self.p95_ms,
            "p99_ms": self.p99_ms,
            "gpu_ms": self.gpu_ms,
            "extra": self.extra,
            "machine": self.machine,
        }





[docs]
class HistoryStore:
    """Append-only per-machine benchmark history with pinned baselines.

    Layout under :func:`perf_dir`::

        history/<host>/<suite>.jsonl   one appended line per measured point
        baseline/<host>.json           {record_key: record_dict} pinned baseline
    """

    def __init__(self, root: Path | None = None, machine: MachineInfo | None = None) -> None:
        """Bind the store to a directory and a machine.

        Args:
            root: Storage directory; defaults to :func:`perf_dir`.
            machine: Machine snapshot whose ``host`` names the per-machine
                files; defaults to :meth:`MachineInfo.capture`.
        """
        self.root = root or perf_dir()
        self.machine = machine or MachineInfo.capture()

    def _history_path(self, suite: str) -> Path:
        """Path of the JSONL history file for ``suite`` on this machine."""
        return self.root / "history" / self.machine.host / f"{suite}.jsonl"

    def _baseline_path(self) -> Path:
        """Path of this machine's pinned-baseline JSON file."""
        return self.root / "baseline" / f"{self.machine.host}.json"

    def append(self, record: PerfRecord) -> None:
        """Append ``record`` as one JSON line to its suite's history file."""
        path = self._history_path(record.suite)
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as fh:
            fh.write(json.dumps(record.to_dict()) + "\n")

    def baselines(self) -> dict[str, dict[str, Any]]:
        """Load the pinned baselines as ``{record_key: record_dict}``.

        A missing, unreadable or corrupt baseline file is treated as "no
        baseline" and yields ``{}`` instead of raising, so a damaged file
        never breaks a perf run.
        """
        path = self._baseline_path()
        try:
            loaded = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError covers a missing/unreadable file. ValueError covers both
            # json.JSONDecodeError and the UnicodeDecodeError raised for
            # bytes that are not valid UTF-8.
            return {}
        # Valid JSON that is not an object (e.g. a list) is equally unusable.
        return loaded if isinstance(loaded, dict) else {}

    def baseline(self, key: str) -> dict[str, Any] | None:
        """Return the pinned record for ``key``, or ``None`` if there is none."""
        return self.baselines().get(key)

    def update_baselines(self, records: dict[str, dict[str, Any]]) -> None:
        """Merge ``records`` (key -> record dict) into the pinned baseline file.

        Existing keys are overwritten, other keys are kept, and the file is
        rewritten with sorted keys.
        """
        current = self.baselines()
        current.update(records)
        path = self._baseline_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(current, indent=2, sort_keys=True), encoding="utf-8")




# ============================================================================
# Comparison + recorder
# ============================================================================



[docs]
def compare(current: float, baseline: float, lower_is_better: bool, tol: float = DEFAULT_TOLERANCE) -> str:
    """Classify ``current`` relative to ``baseline``.

    Returns:
        ``"new"`` when ``baseline`` is zero or negative (nothing to compare
        against); ``"regressed"`` or ``"improved"`` when the
        ``current / baseline`` ratio leaves the ``1 ± tol`` band in the bad
        or good direction respectively; ``"ok"`` otherwise.
    """
    if baseline <= 0:
        return "new"

    ratio = current / baseline
    above_band = ratio > 1 + tol
    below_band = ratio < 1 - tol

    # Which side of the band is "bad" depends on the metric's direction.
    if lower_is_better:
        worse, better = above_band, below_band
    else:
        worse, better = below_band, above_band

    if worse:
        return "regressed"
    if better:
        return "improved"
    return "ok"




[docs]
class PerfRecorder:
    """Records benchmark results, compares to the machine baseline, gates on drift.

    A perf test takes the ``perf_recorder`` fixture and calls :meth:`record`
    once per measured point. The fixture (defined outside this module) is
    expected to call :meth:`finish` on teardown: that pins the baseline when
    ``update_baseline`` is set (``--update-perf-baseline``), and otherwise
    raises if any metric regressed beyond tolerance, unless ``report_only``
    is set (``--perf-report-only``).

    A point with no pinned baseline is reported as ``"new"`` and passes. The
    baseline file itself is only written by an ``update_baseline`` run.
    """

    def __init__(
        self,
        suite: str,
        *,
        store: HistoryStore | None = None,
        tol: float = DEFAULT_TOLERANCE,
        update_baseline: bool = False,
        report_only: bool = False,
        emit: Callable[[str], None] = print,
    ) -> None:
        """Create a recorder for one suite.

        Args:
            suite: Suite name; part of every record key and the history file name.
            store: History/baseline storage; a default :class:`HistoryStore`
                is created when omitted.
            tol: Default tolerance fraction used by :meth:`record`.
            update_baseline: Pin this run's points as the baseline in :meth:`finish`.
            report_only: Never raise from :meth:`finish`, only report.
            emit: Sink for human-readable report lines (defaults to ``print``).
        """
        self.suite = suite
        self.store = store or HistoryStore()
        self.tol = tol
        self.update_baseline = update_baseline
        self.report_only = report_only
        self._emit = emit
        # Every point recorded this run, keyed by record key; written out
        # only when update_baseline is set.
        self._pending_baseline: dict[str, dict[str, Any]] = {}
        # (record key, current value, baseline value) per regressed point.
        self.regressions: list[tuple[str, float, float]] = []

    def record(self, result: BenchmarkResult, *, tol: float | None = None) -> str:
        """Store one result, compare it to the baseline, and report the verdict.

        Args:
            result: The measured point.
            tol: Per-point tolerance override. ``None`` uses the recorder's
                default; ``0.0`` is honoured as "no slack at all".

        Returns:
            ``"seeded"`` (update-baseline run), ``"new"`` (no usable
            baseline), ``"ok"``, ``"improved"`` or ``"regressed"``.
        """
        rec = PerfRecord.from_result(self.suite, result, self.store.machine)
        self.store.append(rec)
        self._pending_baseline[rec.key] = rec.to_dict()

        # Test against None, not truthiness: ``tol or self.tol`` would throw
        # away an explicit zero tolerance.
        effective_tol = self.tol if tol is None else tol

        # When re-pinning, the old baseline is deliberately ignored.
        base = None if self.update_baseline else self.store.baseline(rec.key)
        if base is None:
            verdict = "seeded" if self.update_baseline else "new"
        else:
            # A baseline entry without "value" compares as "new" (the same
            # tolerance _delta_str applies) instead of raising KeyError.
            verdict = compare(rec.value, base.get("value", 0), rec.lower_is_better, effective_tol)

        self._emit(f"{result.report()}\n    baseline: {verdict}" + _delta_str(rec, base))
        if verdict == "regressed":
            self.regressions.append((rec.key, rec.value, base["value"] if base else 0.0))
        return verdict

    def finish(self) -> None:
        """Pin the baseline or fail on regressions, depending on the mode.

        Raises:
            AssertionError: When at least one point regressed and the
                recorder is neither updating the baseline nor report-only.
        """
        if self.update_baseline and self._pending_baseline:
            self.store.update_baselines(self._pending_baseline)
            self._emit(f"perf: pinned {len(self._pending_baseline)} baseline point(s) for {self.store.machine.host}")
            return
        if self.regressions and not self.report_only:
            detail = ", ".join(
                f"{key} {cur:,.2f} vs baseline {base:,.2f} ({cur / base:.2f}×)"
                for key, cur, base in self.regressions
                if base  # guards the division below
            )
            raise AssertionError(f"performance regressed beyond {self.tol:.0%}: {detail}")




def _delta_str(rec: PerfRecord, base: dict[str, Any] | None) -> str:
    if not base or base.get("value", 0) <= 0:
        return ""
    return f" ({rec.value / base['value']:.2f}× baseline)"



[docs]
def make_perf_recorder(
    suite: str,
    *,
    update_baseline: bool = False,
    report_only: bool = False,
    tol: float = DEFAULT_TOLERANCE,
) -> PerfRecorder:
    """Build a :class:`PerfRecorder` for ``suite`` with the default store.

    This is the factory the ``perf_recorder`` fixture uses; the three options
    are forwarded to the recorder unchanged.
    """
    options = {
        "update_baseline": update_baseline,
        "report_only": report_only,
        "tol": tol,
    }
    return PerfRecorder(suite, **options)




[docs]
def perf_pytest_addoption(parser) -> None:
    """Add the shared perf command-line options to a pytest parser.

    Every package's conftest calls this so the same flags exist everywhere.
    Each option is registered independently, and a ``ValueError`` from a
    duplicate registration (e.g. several conftests under one root) is ignored.
    """
    group = parser.getgroup("perf", "performance regression benchmarks")
    flag = {"action": "store_true", "default": False}
    option_specs = (
        ("--update-perf-baseline", {**flag, "help": "Pin this run's perf results as the machine baseline"}),
        ("--perf-report-only", {**flag, "help": "Record + report perf drift but never fail the suite"}),
        (
            "--perf-tolerance",
            {
                "type": float,
                "default": DEFAULT_TOLERANCE,
                "help": f"Regression tolerance fraction (default {DEFAULT_TOLERANCE})",
            },
        ),
    )
    for option_name, option_kwargs in option_specs:
        try:
            group.addoption(option_name, **option_kwargs)
        except ValueError:
            # Already registered by another conftest; keep the first one.
            continue