# Source code for simvx.graphics.gpu_gate

"""GPU capability gating for the test suite: a single 3-state policy.

The engine's tests fall into three buckets when a Vulkan GPU is involved, and
collapsing "could not run" into "passed" is the one outcome we never allow (a
green-but-empty GPU lane silently hides regressions). The three states:

* **AVAILABLE** -- a usable Vulkan device is present; the test runs.
* **ABSENT** -- no ``vulkan`` binding, or no Vulkan-capable device. By default the
  test **skips** (so contributors and CI hosts without a GPU stay green), unless
  ``--require-gpu`` is passed -- then absence is a **failure**, because that flag
  means the caller (the GPU CI lane, ``tools/run_benchmarks.py``) asked to
  exercise the GPU and an empty run must not look green.
* **BROKEN** -- a device is present but initialisation raises. This is a real
  defect, not an absence, so it is **always a failure**, regardless of
  ``--require-gpu``.

A test that needs a GPU is identified by the ``vulkan`` marker. The graphics
``conftest`` auto-applies that marker to any test requesting a GPU fixture
(:data:`GPU_FIXTURES`), so "uses a GPU fixture" and "is a GPU test" are the same
thing and nothing has to be remembered per test. The runtime guard
(:func:`ungated_gpu_error_hint`) catches the residual case -- a test that builds
its own device without gating -- by recognising the absent-device error.

This module holds the pure decision logic so it is unit-testable and
monkeypatchable without a GPU; the ``conftest`` hooks are thin wrappers over it.
"""

from __future__ import annotations

import enum

# Public API of this module: the names exported by ``from <module> import *``.
# Underscore-prefixed helpers (``_detect``, ``_cached``, ``_ABSENT_MARKERS``) are
# deliberately left out.
__all__ = [
    "GPU_FIXTURES",
    "VULKAN_MARKER",
    "GpuStatus",
    "Decision",
    "probe_gpu",
    "reset_cache",
    "needs_gpu",
    "decide",
    "apply_gate",
    "ungated_gpu_error_hint",
]

# Fixture names that imply a real Vulkan device: a test requesting any one of
# them is treated as a GPU test.
GPU_FIXTURES = frozenset(
    (
        "headless_app",
        "capture",
        "require_vulkan",
    )
)

# Name of the pytest marker that identifies a GPU test.
VULKAN_MARKER = "vulkan"

# Substrings of the RuntimeError messages raised by ``select_physical_device``
# (gpu/device.py) that mean "no usable GPU". A broken/misconfigured driver raises
# something else, so it is not listed here. Must be kept in sync with that module.
_ABSENT_MARKERS = (
    "No Vulkan-capable GPU found",
    "No suitable GPU with graphics+present queues found",
)


class GpuStatus(enum.Enum):
    """Result of probing the host for a usable Vulkan device."""

    # A usable Vulkan device is present; GPU tests run.
    AVAILABLE = "available"
    # No ``vulkan`` binding, or no Vulkan-capable device.
    ABSENT = "absent"
    # A device is present but initialisation raised.
    BROKEN = "broken"
class Decision(enum.Enum):
    """What the gate tells the test runner to do with one test."""

    # Let the test execute.
    RUN = "run"
    # Skip the test (GPU absent and not required).
    SKIP = "skip"
    # Fail the test (GPU broken, or absent while required).
    FAIL = "fail"
# Memoised ``(status, reason)`` from the GPU probe. ``None`` means "not probed
# yet"; ``probe_gpu`` fills it on first use and ``reset_cache`` clears it again.
_cached: tuple[GpuStatus, str] | None = None
def reset_cache() -> None:
    """Discard the memoised probe result (test helper).

    After this call the next ``probe_gpu()`` runs detection again instead of
    returning the previously stored result.
    """
    global _cached
    _cached = None
def probe_gpu() -> tuple[GpuStatus, str]:
    """Return the GPU status, running detection at most once per process.

    Returns:
        ``(status, human_reason)``. The first call stores the result of
        ``_detect()`` in the module-level cache; later calls return that stored
        value until ``reset_cache()`` clears it.
    """
    global _cached
    if _cached is not None:
        return _cached
    _cached = _detect()
    return _cached
def _detect() -> tuple[GpuStatus, str]:
    """Probe the host and classify the GPU as available, absent or broken.

    Returns:
        ``(status, reason)``. ``reason`` is a human-readable explanation and is
        the empty string when the status is ``AVAILABLE``.

    Classification:
        * importing ``vulkan`` fails -> ``ABSENT``
        * headless rendering raises ``RuntimeError`` containing one of
          ``_ABSENT_MARKERS`` -> ``ABSENT``
        * headless rendering raises anything else -> ``BROKEN``
        * one frame renders without raising -> ``AVAILABLE``
    """
    try:
        import vulkan  # noqa: F401
    except ImportError:
        return (GpuStatus.ABSENT, "the 'vulkan' Python binding is not installed")
    except Exception as e:
        # The import raised something other than ImportError, so "not installed"
        # would be a misleading diagnosis (presumably the binding is present but
        # could not load the native Vulkan loader -- confirm against the binding
        # in use). The status stays ABSENT; only the reason is made accurate so
        # the reader is not sent off to reinstall a package that is already there.
        return (
            GpuStatus.ABSENT,
            f"the 'vulkan' Python binding could not be imported ({type(e).__name__}: {e})",
        )

    try:
        # The Vulkan device is created inside run_headless(), NOT the App
        # constructor, so the probe must actually render a frame to learn whether
        # a usable device comes up (or how it fails).
        from simvx.core import Node
        from simvx.graphics import App

        App(width=64, height=64, title="gpu-probe", visible=False).run_headless(Node(name="gpu-probe"), frames=1)
    except RuntimeError as e:
        msg = str(e)
        # Only the known "no device" messages count as absence; every other
        # RuntimeError means a device was found but failed to initialise.
        if any(m in msg for m in _ABSENT_MARKERS):
            return (GpuStatus.ABSENT, "no Vulkan-capable GPU is present")
        return (GpuStatus.BROKEN, f"Vulkan device initialisation failed: {msg}")
    except Exception as e:  # any non-RuntimeError during init = broken environment
        return (GpuStatus.BROKEN, f"Vulkan device initialisation raised {type(e).__name__}: {e}")
    return (GpuStatus.AVAILABLE, "")
def needs_gpu(fixturenames) -> bool:
    """Report whether a test requests at least one GPU fixture.

    Args:
        fixturenames: Iterable of the fixture names the test requests.

    Returns:
        ``True`` when any name is in ``GPU_FIXTURES`` (so the test should be
        ``vulkan``-marked), ``False`` otherwise.
    """
    requested_gpu_fixtures = GPU_FIXTURES.intersection(fixturenames)
    return len(requested_gpu_fixtures) > 0
def decide(status: GpuStatus, reason: str, require_gpu: bool) -> tuple[Decision, str]:
    """Turn a probe result plus the ``--require-gpu`` flag into a gate decision.

    Args:
        status: Probe outcome.
        reason: Human-readable explanation of ``status``; embedded in the message.
        require_gpu: Whether ``--require-gpu`` was passed.

    Returns:
        ``(decision, message)``. The message is empty for ``Decision.RUN``.
    """
    if status is GpuStatus.AVAILABLE:
        return (Decision.RUN, "")

    if status is GpuStatus.BROKEN:
        # A broken device fails regardless of --require-gpu.
        broken_message = (
            f"Vulkan GPU is BROKEN ({reason}). A present-but-failing device is a defect, "
            "not an absence, so this is a failure rather than a skip."
        )
        return (Decision.FAIL, broken_message)

    # Every remaining status is handled as ABSENT.
    if not require_gpu:
        skip_message = f"{reason}; skipping GPU test. Pass --require-gpu to make a missing GPU a failure instead."
        return (Decision.SKIP, skip_message)

    required_message = (
        f"--require-gpu was set but {reason}. This lane must run on a GPU, so absence is a "
        "failure here: it stops an empty GPU run from reporting a false-green pass."
    )
    return (Decision.FAIL, required_message)
def apply_gate(is_vulkan_marked: bool, require_gpu: bool) -> tuple[Decision, str]:
    """Compute the run/skip/fail decision for a single test.

    The GPU is probed only when the test is ``vulkan``-marked; a test without
    the marker always runs. This is the exact decision the ``conftest``
    ``pytest_runtest_setup`` hook applies; the hook only maps the result onto
    ``pytest.skip`` / ``pytest.fail``.

    Args:
        is_vulkan_marked: Whether the test carries the ``vulkan`` marker.
        require_gpu: Whether ``--require-gpu`` was passed.

    Returns:
        ``(decision, message)`` as produced by ``decide``, or
        ``(Decision.RUN, "")`` for a non-GPU test.
    """
    if is_vulkan_marked:
        gpu_status, gpu_reason = probe_gpu()
        return decide(gpu_status, gpu_reason, require_gpu)
    return (Decision.RUN, "")
def ungated_gpu_error_hint(report_text: str, is_vulkan_marked: bool) -> str | None:
    """Produce a gating hint for an un-marked test that failed for lack of a GPU.

    Tests that use a GPU fixture are auto-marked; this is the safety net for a
    test that builds its own device without gating. On a GPU-less host such a
    test errors with the absent-device message instead of skipping, which reads
    as a real bug, so the hint tells the author how to gate it.

    Args:
        report_text: Text of the failure report to scan.
        is_vulkan_marked: Whether the test carries the ``vulkan`` marker.

    Returns:
        The hint text when the test is not marked and ``report_text`` contains
        one of ``_ABSENT_MARKERS``; ``None`` for a correctly-gated test or an
        unrelated failure.
    """
    if is_vulkan_marked:
        return None

    for absent_message in _ABSENT_MARKERS:
        if absent_message in report_text:
            return (
                "This test failed with a missing-Vulkan-GPU error but is not gated. Use a GPU fixture "
                "(headless_app / capture / require_vulkan, which auto-marks it 'vulkan') or add "
                "@pytest.mark.vulkan, so it SKIPS on a GPU-less host (or FAILS under --require-gpu) "
                "instead of erroring."
            )
    return None