Source code for simvx.graphics.gpu_gate

"""GPU capability gating for the test suite: a single 3-state policy.

The engine's tests fall into three buckets when a Vulkan GPU is involved, and
collapsing "could not run" into "passed" is the one outcome we never allow (a
green-but-empty GPU lane silently hides regressions). The three states:

* **AVAILABLE** -- a usable Vulkan device is present; the test runs.
* **ABSENT** -- no ``vulkan`` binding, or no Vulkan-capable device. By default the
  test **skips** (so contributors and CI hosts without a GPU stay green), unless
  ``--require-gpu`` is passed -- then absence is a **failure**, because that flag
  means the caller (the GPU CI lane, ``tools/run_benchmarks.py``) asked to
  exercise the GPU and an empty run must not look green.
* **BROKEN** -- a device is present but initialisation raises. This is a real
  defect, not an absence, so it is **always a failure**, regardless of
  ``--require-gpu``.

A test that needs a GPU is identified by the ``vulkan`` marker. The graphics
``conftest`` auto-applies that marker to any test requesting a GPU fixture
(:data:`GPU_FIXTURES`), so "uses a GPU fixture" and "is a GPU test" are the same
thing and nothing has to be remembered per test. The runtime guard
(:func:`ungated_gpu_error_hint`) catches the residual case -- a test that builds
its own device without gating -- by recognising the absent-device error.

This module holds the pure decision logic so it is unit-testable and
monkeypatchable without a GPU; the ``conftest`` hooks are thin wrappers over it.
"""

from __future__ import annotations

import enum

# Public API of this module: the two constants, the two enums, and the probe /
# decision helpers. Names with a leading underscore (``_detect``, ``_cached``,
# ``_ABSENT_MARKERS``) are intentionally left out.
__all__ = [
    "GPU_FIXTURES",
    "VULKAN_MARKER",
    "GpuStatus",
    "Decision",
    "probe_gpu",
    "reset_cache",
    "needs_gpu",
    "decide",
    "apply_gate",
    "ungated_gpu_error_hint",
]

# Fixture names that imply a real Vulkan device: a test requesting any one of
# them is treated as a GPU test.
GPU_FIXTURES = frozenset(("headless_app", "capture", "require_vulkan"))

# Name of the marker that identifies a GPU test.
VULKAN_MARKER = "vulkan"

# Substrings of the RuntimeError messages raised by ``select_physical_device``
# (gpu/device.py) that mean "no usable GPU". A broken or misconfigured driver
# raises something else, which is how absence is told apart from breakage.
# Must be kept in sync with that module.
_ABSENT_MARKERS = (
    "No Vulkan-capable GPU found",
    "No suitable GPU with graphics+present queues found",
)



[docs]
@enum.unique
class GpuStatus(enum.Enum):
    """Outcome of probing the host for a Vulkan GPU."""

    # A usable Vulkan device is present.
    AVAILABLE = "available"
    # No ``vulkan`` binding, or no Vulkan-capable device.
    ABSENT = "absent"
    # A device is present but initialising it raised.
    BROKEN = "broken"




[docs]
@enum.unique
class Decision(enum.Enum):
    """What the gate does with a single test."""

    # Let the test execute.
    RUN = "run"
    # Skip the test.
    SKIP = "skip"
    # Fail the test.
    FAIL = "fail"



# Memoised ``(status, reason)`` from the first ``probe_gpu()`` call in this
# process. ``None`` means no probe has run yet (or ``reset_cache()`` cleared it).
_cached: tuple[GpuStatus, str] | None = None



[docs]
def reset_cache() -> None:
    """Forget the cached probe result (test helper).

    Sets the module-level ``_cached`` back to ``None`` so the next
    :func:`probe_gpu` call runs detection again instead of returning the
    memoised value.
    """
    global _cached
    _cached = None




[docs]
def probe_gpu() -> tuple[GpuStatus, str]:
    """Return the GPU status for this process as ``(status, human_reason)``.

    Detection runs at most once: the first call delegates to ``_detect()`` and
    stores the result in the module-level cache; later calls return that stored
    value until :func:`reset_cache` clears it.
    """
    global _cached
    if _cached is not None:
        return _cached
    # First call (or first after a reset): run detection and remember it.
    _cached = _detect()
    return _cached



def _detect() -> tuple[GpuStatus, str]:
    """Run the actual GPU probe and classify the result.

    Returns ``(status, human_reason)``:

    * ``ABSENT`` when importing ``vulkan`` fails, or when the headless run raises
      a ``RuntimeError`` whose message contains one of ``_ABSENT_MARKERS``.
    * ``BROKEN`` when the headless run raises anything else.
    * ``AVAILABLE`` (with an empty reason) when the headless run completes.
    """
    # Step 1: without an importable binding there is nothing to probe.
    try:
        import vulkan  # noqa: F401
    except Exception:
        return (GpuStatus.ABSENT, "the 'vulkan' Python binding is not installed")

    # Step 2: the Vulkan device is created inside run_headless(), NOT in the App
    # constructor, so a frame has to be rendered to find out whether a usable
    # device comes up (or how it fails).
    try:
        from simvx.core import Node
        from simvx.graphics import App

        App(width=64, height=64, title="gpu-probe", visible=False).run_headless(
            Node(name="gpu-probe"),
            frames=1,
        )
    except RuntimeError as err:
        text = str(err)
        # A recognised "no device" message is an absence, not a defect.
        for marker in _ABSENT_MARKERS:
            if marker in text:
                return (GpuStatus.ABSENT, "no Vulkan-capable GPU is present")
        return (GpuStatus.BROKEN, f"Vulkan device initialisation failed: {text}")
    except Exception as err:  # any non-RuntimeError during init = broken environment
        return (GpuStatus.BROKEN, f"Vulkan device initialisation raised {type(err).__name__}: {err}")
    else:
        return (GpuStatus.AVAILABLE, "")



[docs]
def needs_gpu(fixturenames) -> bool:
    """Report whether a test's fixture list includes any GPU fixture.

    ``fixturenames`` is an iterable of fixture names; the result is ``True`` when
    at least one of them is in :data:`GPU_FIXTURES`, meaning the test should
    carry the ``vulkan`` marker.
    """
    requested_gpu_fixtures = GPU_FIXTURES & frozenset(fixturenames)
    return len(requested_gpu_fixtures) != 0




[docs]
def decide(status: GpuStatus, reason: str, require_gpu: bool) -> tuple[Decision, str]:
    """Turn a probe result plus the ``--require-gpu`` flag into a gate decision.

    Returns ``(decision, message)``:

    * ``AVAILABLE`` -> ``RUN`` with an empty message.
    * ``BROKEN`` -> ``FAIL``, whatever ``require_gpu`` is.
    * anything else (i.e. ``ABSENT``) -> ``FAIL`` when ``require_gpu`` is truthy,
      otherwise ``SKIP``.

    ``reason`` is interpolated into the message so the report says why.
    """
    if status is GpuStatus.AVAILABLE:
        return (Decision.RUN, "")

    if status is GpuStatus.BROKEN:
        verdict = Decision.FAIL
        message = (
            f"Vulkan GPU is BROKEN ({reason}). A present-but-failing device is a defect, "
            "not an absence, so this is a failure rather than a skip."
        )
    elif require_gpu:
        # Absent, but the caller demanded a GPU: refuse to look green.
        verdict = Decision.FAIL
        message = (
            f"--require-gpu was set but {reason}. This lane must run on a GPU, so absence is a "
            "failure here: it stops an empty GPU run from reporting a false-green pass."
        )
    else:
        # Absent and not required: skip quietly, with a pointer to the flag.
        verdict = Decision.SKIP
        message = f"{reason}; skipping GPU test. Pass --require-gpu to make a missing GPU a failure instead."
    return (verdict, message)




[docs]
def apply_gate(is_vulkan_marked: bool, require_gpu: bool) -> tuple[Decision, str]:
    """Compute the run/skip/fail decision for a single test.

    A test that is not ``vulkan``-marked always gets ``(Decision.RUN, "")`` and
    the GPU is never probed for it. A marked test triggers :func:`probe_gpu`
    (cached after the first call) and the result is passed through
    :func:`decide` together with ``require_gpu``.

    Per the module's design, the ``conftest`` ``pytest_runtest_setup`` hook
    applies exactly this decision and only maps it onto ``pytest.skip`` /
    ``pytest.fail``.
    """
    if is_vulkan_marked:
        gpu_status, gpu_reason = probe_gpu()
        return decide(gpu_status, gpu_reason, require_gpu)
    return (Decision.RUN, "")




[docs]
def ungated_gpu_error_hint(report_text: str, is_vulkan_marked: bool) -> str | None:
    """Suggest how to gate a test that failed only because no GPU is present.

    Tests that use a GPU fixture are marked automatically; this covers the
    leftover case of a test that creates its own device without any gating. On a
    host without a GPU such a test errors with the absent-device message rather
    than skipping, which looks like a genuine bug. When ``report_text`` contains
    one of the absent-device messages and the test is not ``vulkan``-marked, the
    returned string tells the author how to gate it.

    Returns ``None`` when the test is already marked or the failure text does not
    mention a missing GPU.
    """
    if is_vulkan_marked:
        # Correctly gated tests never need the hint.
        return None
    for marker in _ABSENT_MARKERS:
        if marker in report_text:
            return (
                "This test failed with a missing-Vulkan-GPU error but is not gated. Use a GPU fixture "
                "(headless_app / capture / require_vulkan, which auto-marks it 'vulkan') or add "
                "@pytest.mark.vulkan, so it SKIPS on a GPU-less host (or FAILS under --require-gpu) "
                "instead of erroring."
            )
    return None