Source code for simvx.graphics.gpu.multi_device

"""Explicit multi-adapter (multi-GPU) foundation: D8 workload-split offload.

This is the **foundation** wave of design decision D8. It builds the gated
plumbing for an explicit-multi-adapter renderer (one independent ``VkDevice``
per physical GPU) where whole SubViewport / offscreen scene-render-units (SRUs)
are offloaded to secondary GPUs and composited on the primary (GPU 0). It is
deliberately **off by default**: a single-GPU box, and any multi-GPU box that
does not opt in, runs today's single-device path byte-identical (no extra
device, no transfer work, no behaviour change). The active multi-device path is
verified on a 4x Arc Pro B70 rig, not on this single-GPU dev box.

Three coherent, unit-testable pieces live here:

1. :class:`MultiDeviceManager` enumerates physical devices and, only when
   ``physical_device_count > 1`` **and** the caller opted in, creates an
   independent logical device per physical GPU (reusing
   :func:`~simvx.graphics.gpu.device.create_logical_device`). When the count is
   1 or the opt-in is off it holds exactly the existing single (primary) device
   and is a transparent passthrough.
2. :func:`assign_srus` is the pure device-assignment policy (no Vulkan): given
   the ordered SRUs and a device count it decides which render on GPU 0 vs which
   offload to GPU 1+. TAA-safe by construction (each SRU keeps its temporal
   history on its one assigned device, so there is no cross-device reprojection).
3. :class:`CrossDeviceTransfer` selects how a finished offscreen colour image
   moves from a secondary device to the primary for compositing. The
   staging-copy path (secondary -> host-visible staging -> primary) is the
   guaranteed floor and is the only path implemented end-to-end; the dma-buf /
   ``VK_KHR_external_memory_fd`` zero-copy path is the rig optimisation, gated
   behind a capability and currently raising a clear, actionable error.

REALITY CHECK (honest scope). Rendering an SRU on a *second* ``VkDevice``
requires that device to own its full set of rendering resources: its own
pipelines, descriptor pools, transform/material SSBOs, and mesh+texture
residency. The current :class:`~simvx.graphics.renderer.forward.Renderer` is
built around exactly one device. Duplicating it per device is a large,
GPU-bound refactor that cannot be functionally verified on this single-GPU box.
So this module stops at a clean seam: the device manager, the assignment
policy, and the transfer interface are real and tested; the per-device renderer
construction + the actual offload-record-and-composite loop are the documented
rig-side completion (see :meth:`MultiDeviceManager.attach_renderer` /
:attr:`DeviceSlot.renderer`).
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import vulkan as vk

from .device import QueueFamilies, create_logical_device

if TYPE_CHECKING:
    from .capabilities import RenderCapabilities

# Module-level logger shared by every definition in this file.
log = logging.getLogger(__name__)

# Public API of the module. ``select_transfer_method`` is a public, documented
# function of this file (the offload coordinator's docs name it as one of the
# pieces it glues together), so it is exported alongside the rest.
__all__ = [
    "DeviceSlot",
    "MultiDeviceManager",
    "SRUAssignment",
    "assign_srus",
    "CrossDeviceTransfer",
    "TransferMethod",
    "select_transfer_method",
    "OffloadRoute",
    "SRUOffloadCoordinator",
]


# --- Device topology ---------------------------------------------------------



[docs]
@dataclass
class DeviceSlot:
    """One physical+logical GPU participating in the multi-device renderer.

    ``index`` 0 is always the primary (compositing) GPU; 1+ are secondaries that
    render offloaded SRUs. On a single-GPU run there is exactly one slot
    (``index == 0``) wrapping the engine's existing device, and nothing else is
    created.

    ``renderer`` is the per-device :class:`~simvx.graphics.renderer.forward.Renderer`
    duplicate. It is populated for the primary slot from the engine's existing
    renderer and, on the rig, for each secondary by
    :meth:`MultiDeviceManager.attach_renderer`. It stays ``None`` for secondaries
    until that rig-side per-device renderer construction lands (see the module
    docstring): the foundation never fabricates a broken renderer to look
    complete.
    """

    # Position in the manager's slot list; 0 is the primary GPU.
    index: int
    # Physical-device handle this slot wraps.
    physical_device: Any
    # Queue-family selection used when the logical device was created.
    queue_families: QueueFamilies
    # Logical-device handle; ``MultiDeviceManager.destroy`` resets it to ``None``
    # on secondaries once the device has been destroyed.
    device: Any = None
    # Queue handles, in the order ``create_logical_device`` returns them.
    graphics_queue: Any = None
    present_queue: Any = None
    compute_queue: Any = None
    transfer_queue: Any = None
    # Diagnostic device name (filled from ``_device_name`` by the manager).
    name: str = ""
    # Per-device renderer; see the class docstring for when it is populated.
    renderer: Any = None


[docs]
    @property
    def is_primary(self) -> bool:
        return self.index == 0





[docs]
class MultiDeviceManager:
    """Owns the per-physical-GPU logical devices for the D8 offload renderer.

    Construct with the primary device's already-created handles (the engine's
    existing single device, so the primary slot is never re-created) plus the
    enumerated physical devices and the opt-in flag. When ``enabled`` is true and
    more than one physical device is present, a secondary :class:`DeviceSlot` is
    created per additional physical GPU with its own independent ``VkDevice`` via
    :func:`create_logical_device`. Otherwise the manager holds exactly the single
    primary slot and :attr:`multi_gpu` is ``False`` (today's path, unchanged).

    The manager does NOT own the primary device's lifetime (the engine created
    and destroys it); :meth:`destroy` only tears down the *secondary* devices it
    created itself. Code that allocates GPU resources on a secondary device can
    register a callback with :meth:`register_teardown`; :meth:`destroy` invokes
    those callbacks before it destroys the secondary devices.
    """

    def __init__(
        self,
        *,
        primary_physical_device: Any,
        primary_queue_families: QueueFamilies,
        primary_device: Any,
        primary_graphics_queue: Any,
        primary_present_queue: Any,
        physical_devices: list[Any],
        enabled: bool,
        capabilities: RenderCapabilities | None = None,
        primary_compute_queue: Any = None,
        primary_transfer_queue: Any = None,
        find_queue_families: Any = None,
    ) -> None:
        """Wrap the engine's device as slot 0 and, when opted in, add secondaries.

        Args:
            primary_physical_device: Physical device behind the engine's device.
            primary_queue_families: Queue families of the primary device.
            primary_device: The engine's existing logical device (never re-created).
            primary_graphics_queue: Graphics queue of the primary device.
            primary_present_queue: Present queue of the primary device.
            physical_devices: Every enumerated physical device (copied).
            enabled: Caller opt-in for the multi-device path.
            capabilities: Optional capability snapshot consulted when creating
                secondary devices.
            primary_compute_queue: Optional compute queue of the primary device.
            primary_transfer_queue: Optional transfer queue of the primary device.
            find_queue_families: Optional resolver ``physical_device ->
                QueueFamilies | None`` used for secondary devices.
        """
        self._capabilities = capabilities
        self._physical_devices = list(physical_devices)
        # Stays False unless a secondary logical device is created with the
        # external-memory-fd extension requested (see ``_create_secondaries``).
        self.external_memory_fd_enabled = False
        # Callbacks that free per-secondary GPU resources; ``destroy`` runs them
        # before the secondary devices themselves are destroyed.
        self._teardown_hooks: list[Any] = []
        # Slot 0 always wraps the engine-owned device; this manager neither
        # creates nor destroys that VkDevice.
        self._slots: list[DeviceSlot] = [
            DeviceSlot(
                index=0,
                physical_device=primary_physical_device,
                queue_families=primary_queue_families,
                device=primary_device,
                graphics_queue=primary_graphics_queue,
                present_queue=primary_present_queue,
                compute_queue=primary_compute_queue,
                transfer_queue=primary_transfer_queue,
                name=_device_name(primary_physical_device),
            )
        ]
        gpu_count = len(self._physical_devices)
        opted_in = bool(enabled)
        # Both conditions are required; otherwise this is a single-slot passthrough.
        self._multi_gpu = opted_in and gpu_count > 1
        if self._multi_gpu:
            self._create_secondaries(primary_physical_device, find_queue_families)
            return
        if enabled:
            # Opted in, but there is at most one physical device to use.
            log.info(
                "multi-GPU requested but only %d physical device(s): single-device path",
                gpu_count,
            )

    def _create_secondaries(self, primary_pd: Any, find_queue_families: Any) -> None:
        """Create one independent logical device per *additional* physical GPU.

        ``find_queue_families`` resolves a :class:`QueueFamilies` for a secondary
        physical device. A secondary renders offscreen (it never presents), so it
        does not need a present-capable family; the injected resolver lets the
        rig wire surface-less graphics-queue selection without this module
        importing the surface path. When omitted (unit tests, and any device for
        which it returns ``None``) the secondary is skipped with a warning rather
        than guessing a family.

        Args:
            primary_pd: Physical device already wrapped by slot 0; it is skipped.
            find_queue_families: Resolver ``physical_device -> QueueFamilies | None``
                or ``None``.
        """
        # Request the dma-buf prerequisite on every secondary only when the
        # capability snapshot reports the extension available; otherwise the
        # staging-copy floor is kept. The public flag flips to True once at least
        # one secondary has actually been created with it requested.
        want_ext_mem = bool(getattr(self._capabilities, "external_memory_fd", False))
        for pd in self._physical_devices:
            # Skip the primary. Identity alone is not enough: the same GPU handle
            # may reach us wrapped in a different Python object (for example from
            # a second enumeration), in which case only ``==`` recognises it and
            # keeps GPU 0 from receiving a second, "secondary" logical device.
            if pd is primary_pd or pd == primary_pd:
                continue
            qf = find_queue_families(pd) if find_queue_families is not None else None
            if qf is None:
                log.warning("multi-GPU: no graphics queue family resolved for %s, skipping", _device_name(pd))
                continue
            device, gq, pq, cq, tq = create_logical_device(pd, qf, external_memory_fd=want_ext_mem)
            if want_ext_mem:
                self.external_memory_fd_enabled = True
            # Query the diagnostic name once and reuse it for the slot and the log.
            name = _device_name(pd)
            slot = DeviceSlot(
                index=len(self._slots),
                physical_device=pd,
                queue_families=qf,
                device=device,
                graphics_queue=gq,
                present_queue=pq,
                compute_queue=cq,
                transfer_queue=tq,
                name=name,
            )
            self._slots.append(slot)
            log.info("multi-GPU: created secondary logical device %d (%s)", slot.index, name)


[docs]
    @property
    def multi_gpu(self) -> bool:
        """True only when an opted-in multi-device renderer is active (>= 2 slots)."""
        return self._multi_gpu and len(self._slots) > 1



[docs]
    @property
    def device_count(self) -> int:
        """Number of logical devices the manager owns (1 on the single-GPU path)."""
        return len(self._slots)



[docs]
    @property
    def primary(self) -> DeviceSlot:
        return self._slots[0]



[docs]
    @property
    def secondaries(self) -> list[DeviceSlot]:
        return self._slots[1:]



[docs]
    @property
    def slots(self) -> list[DeviceSlot]:
        return list(self._slots)



[docs]
    def slot(self, index: int) -> DeviceSlot:
        return self._slots[index]



[docs]
    def attach_renderer(self, index: int, renderer: Any) -> None:
        """Bind a per-device :class:`Renderer` to slot ``index`` (rig-side).

        The primary renderer is the engine's existing one. Each secondary needs
        its OWN renderer (its device's pipelines / descriptor pools / SSBOs /
        residency); constructing that is the documented rig-side completion. This
        setter is the seam where the rig hands the constructed per-device
        renderer back to the manager so the offload loop can record into it.
        """
        self._slots[index].renderer = renderer



[docs]
    def register_teardown(self, hook: Any) -> None:
        """Register a callback to free per-secondary GPU resources before device destroy.

        The offload coordinator registers its :meth:`SRUOffloadCoordinator.destroy`
        here so its secondary facades / targets / staging buffers are freed while the
        secondary ``VkDevice``\\ s are still alive (they own those resources). Invoked
        first by :meth:`destroy`.
        """
        self._teardown_hooks.append(hook)



[docs]
    def destroy(self) -> None:
        """Destroy only the SECONDARY logical devices this manager created.

        The primary device is owned by the engine and left untouched. Safe to
        call on the single-GPU path (no secondaries => no-op). Registered teardown
        hooks run FIRST so per-secondary GPU resources are freed before the devices.
        """
        for hook in self._teardown_hooks:
            hook()
        self._teardown_hooks.clear()
        for slot in self._slots[1:]:
            if slot.device is not None:
                vk.vkDeviceWaitIdle(slot.device)
                vk.vkDestroyDevice(slot.device, None)
                slot.device = None
        del self._slots[1:]
        self._multi_gpu = False




def _barrier_image(
    cmd: Any,
    image: Any,
    old_layout: int,
    new_layout: int,
    src_access: int,
    dst_access: int,
    src_stage: int,
    dst_stage: int,
) -> None:
    """Record one colour-image layout transition into ``cmd`` without submitting.

    Used by the cross-device staging copy. The barrier goes into a command
    buffer the caller owns, so a copy and the two transitions around it can share
    a single submit. It covers the colour aspect of mip level 0, array layer 0,
    and performs no queue-family ownership transfer.

    Args:
        cmd: Command buffer being recorded.
        image: Image whose layout changes.
        old_layout: Layout the image is in before the barrier.
        new_layout: Layout the image is in after the barrier.
        src_access: Access mask that must complete before the barrier.
        dst_access: Access mask that waits on the barrier.
        src_stage: Pipeline stage(s) the barrier waits for.
        dst_stage: Pipeline stage(s) blocked by the barrier.
    """
    colour_range = vk.VkImageSubresourceRange(
        aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
        baseMipLevel=0,
        levelCount=1,
        baseArrayLayer=0,
        layerCount=1,
    )
    layout_barrier = vk.VkImageMemoryBarrier(
        srcAccessMask=src_access,
        dstAccessMask=dst_access,
        oldLayout=old_layout,
        newLayout=new_layout,
        srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        image=image,
        subresourceRange=colour_range,
    )
    image_barriers = [layout_barrier]
    vk.vkCmdPipelineBarrier(
        cmd,
        src_stage,
        dst_stage,
        0,  # dependency flags
        0, None,  # no global memory barriers
        0, None,  # no buffer barriers
        len(image_barriers), image_barriers,
    )


def _device_name(physical_device: Any) -> str:
    try:
        props = vk.vkGetPhysicalDeviceProperties(physical_device)
        name = props.deviceName
        return name if isinstance(name, str) else name.decode("utf-8")
    except Exception:  # noqa: BLE001 (name is diagnostic only)
        return "<unknown>"


# --- Device-assignment policy (pure logic, no Vulkan) ------------------------



[docs]
@dataclass(frozen=True, slots=True)
class SRUAssignment:
    """Which device renders one SRU and whether its result must be transferred.

    ``device_index`` 0 means the SRU renders on the primary and is already
    resident for compositing (``needs_transfer`` is ``False``). A non-zero index
    means the SRU is offloaded to that secondary and its finished colour image
    must be moved to the primary before compositing (``needs_transfer`` is
    ``True``).
    """

    # Stable identifier of the SRU this decision applies to.
    sru_id: int
    # Index of the device that renders the SRU (0 = primary).
    device_index: int


[docs]
    @property
    def needs_transfer(self) -> bool:
        return self.device_index != 0




def _sru_cost(sru: Any) -> int:
    """Heuristic offload cost of one SRU: its instance + skinned-instance count.

    Pure and Vulkan-free. The heaviest independent SRUs are the ones worth
    shipping to a secondary GPU; instance count is the cheapest faithful proxy
    we already have on the snapshotted plan. SRUs with no countable instances
    fall back to cost 1 so ordering is still deterministic.
    """
    inst = getattr(sru, "instances", None)
    skinned = getattr(sru, "skinned_instances", None)
    n = (len(inst) if inst is not None else 0) + (len(skinned) if skinned is not None else 0)
    return n if n > 0 else 1



[docs]
def assign_srus(
    srus: list[Any],
    device_count: int,
    *,
    sru_id: Any = None,
    cost: Any = None,
) -> list[SRUAssignment]:
    """Decide which device renders each SRU (pure, unit-testable, no Vulkan).

    Policy (TAA-safe by construction: an SRU is assigned to exactly one device,
    so its temporal history never crosses devices):

    - ``device_count <= 1``: EVERY SRU stays on GPU 0. This is the single-GPU /
      unopted path and produces the same per-SRU work as today (no transfer is
      ever flagged).
    - ``device_count >= 2``: the main scene is implicitly GPU 0 (it is not an
      SRU and is not in this list). The ``device_count - 1`` heaviest SRUs are
      offloaded, one per secondary: the heaviest goes to device 1, the next to
      device 2, and so on. Every other SRU stays on GPU 0, which therefore keeps
      the cheap tail it composites locally.

    Equal-cost SRUs are ranked by their position in ``srus`` (earlier first).
    The decision therefore depends only on the input order and the costs, never
    on object addresses, so the same input always yields the same assignment.

    The returned list preserves the INPUT order of ``srus`` (the P1
    producer-before-consumer topological order), so a consumer SRU still follows
    the producer it samples; only the device choice is decided here.

    Args:
        srus: Ordered SRU plans (``SubViewportSRU`` or any object exposing the
            cost inputs). Order is preserved in the result.
        device_count: Number of devices available (``MultiDeviceManager.device_count``).
        sru_id: Optional accessor ``sru -> int`` for the stable id; defaults to
            reading ``sru.sru_id``.
        cost: Optional accessor ``sru -> int`` overriding the default instance-count
            heuristic (handy for tests).

    Returns:
        One :class:`SRUAssignment` per input SRU, in input order.
    """
    get_id = sru_id if sru_id is not None else (lambda s: s.sru_id)
    get_cost = cost if cost is not None else _sru_cost

    if device_count <= 1 or not srus:
        return [SRUAssignment(sru_id=get_id(s), device_index=0) for s in srus]

    n_secondary = device_count - 1
    # Rank heaviest-first. ``sorted`` is stable (also with ``reverse=True``), so
    # equal-cost SRUs keep their input order instead of being ordered by id(),
    # which is a memory address and differs from run to run.
    by_cost = sorted(srus, key=get_cost, reverse=True)
    # Offload the top ``n_secondary`` SRUs, one per secondary (device indices
    # 1..n). Keyed by object identity so the result can be emitted in the
    # caller's input order below.
    offload: dict[int, int] = {
        id(sru): 1 + slot_offset for slot_offset, sru in enumerate(by_cost[:n_secondary])
    }
    return [
        SRUAssignment(sru_id=get_id(s), device_index=offload.get(id(s), 0))
        for s in srus
    ]



# --- Cross-device transfer interface -----------------------------------------



[docs]
class TransferMethod:
    """String constants naming the available cross-device image-transfer strategies."""

    NONE = "none"
    """No transfer: the SRU already lives on the primary device, so compositing
    samples it directly."""
    STAGING_COPY = "staging_copy"
    """Secondary image -> host-visible staging buffer -> primary image. This is
    the guaranteed floor and works on any pair of devices (the colour
    RenderTarget already carries TRANSFER_SRC|TRANSFER_DST)."""
    DMABUF = "dmabuf"
    """Zero-copy dma-buf import/export via VK_KHR_external_memory_fd. The rig
    optimisation: it needs the external-memory extensions enabled on both
    devices, is gated, and currently raises until the rig path is implemented."""



# The dma-buf zero-copy transfer (CrossDeviceTransfer.run_dmabuf) is not
# implemented yet: it raises NotImplementedError. select_transfer_method must
# therefore never return DMABUF, whatever the caller prefers and whatever the
# enabled-extension capability reports, because doing so would later raise with
# no working path. Flip to True only once the dma-buf counterpart of
# run_staging_copy is wired and rig-verified.
_DMABUF_IMPLEMENTED = False



[docs]
def select_transfer_method(
    assignment: SRUAssignment,
    capabilities: RenderCapabilities | None,
    *,
    prefer_dmabuf: bool = True,
) -> str:
    """Choose how one SRU's result reaches the primary device (pure, no Vulkan).

    - An SRU that needs no transfer (it renders on the primary) yields
      :data:`TransferMethod.NONE`.
    - An offloaded SRU yields :data:`TransferMethod.DMABUF` only when all three
      hold: the dma-buf path is implemented (``_DMABUF_IMPLEMENTED``), the
      caller prefers it, and ``capabilities.external_memory_fd_enabled`` is
      truthy. In every other case it yields the always-available
      :data:`TransferMethod.STAGING_COPY`.

    The capability consulted is the *enabled* flag, not the probed-available
    one: a device that merely supports the extension but was created without it
    cannot export or import fds. A missing attribute, or ``capabilities`` being
    ``None``, counts as not enabled.

    Args:
        assignment: The device decision for the SRU.
        capabilities: Capability snapshot, or ``None``.
        prefer_dmabuf: Whether the caller would like the zero-copy path.

    Returns:
        One of the :class:`TransferMethod` string constants.
    """
    if assignment.needs_transfer:
        ext_enabled = bool(getattr(capabilities, "external_memory_fd_enabled", False))
        # While the dma-buf path is unimplemented the module switch keeps this
        # False, so staging-copy stays the choice even with the extension enabled.
        dmabuf_usable = _DMABUF_IMPLEMENTED and prefer_dmabuf and ext_enabled
        return TransferMethod.DMABUF if dmabuf_usable else TransferMethod.STAGING_COPY
    return TransferMethod.NONE




[docs]
@dataclass
class CrossDeviceTransfer:
    """Moves a finished offscreen colour image from a secondary to the primary.

    Holds the source (secondary) and destination (primary) :class:`DeviceSlot`
    plus the chosen :class:`TransferMethod`. ``src_done`` / ``dst_ready`` are the
    seam for explicit cross-device synchronisation handles (a per-device
    fence/timeline) to be populated by the rig wiring. The staging-copy code in
    this class does not read them: per its in-code comments it relies on the
    submit-and-wait performed by ``end_single_time_commands`` for each half of
    the copy instead.

    Implemented:
      :meth:`run_staging_copy` runs the staging-copy sequence (secondary GPU
      copy-to-buffer -> host memmove between the two staging buffers -> primary
      copy-to-image), recorded by ``_record_staging_copy``. The CPU-roundtrip
      floor: correct on any device pair, slower than dma-buf. It needs both
      devices' command pools plus one host-visible staging buffer per device,
      which :meth:`ensure_staging` allocates and caches in :attr:`extra`. It is
      exercised end-to-end only on the multi-GPU rig, not on a single-GPU box.

    Gated (rig):
      :meth:`run_dmabuf` raises a clear, actionable error until
      VK_KHR_external_memory_fd is enabled and the import/export is wired on the
      rig.
    """

    # Secondary slot the image is read from.
    src: DeviceSlot
    # Primary slot the image is written to.
    dst: DeviceSlot
    # One of the ``TransferMethod`` string constants; ``run`` dispatches on it.
    method: str
    # Cross-device sync handles (per-device fence/timeline). Defined as the seam;
    # populated by the rig wiring. The primary composite waits on ``dst_ready``.
    src_done: Any = None
    dst_ready: Any = None
    # Sizing of the transferred colour image (set when the SRU target is known).
    width: int = 0
    height: int = 0
    # Bytes per pixel of the colour format being moved. The SubViewport offscreen
    # target is ``R16G16B16A16_SFLOAT`` (8 bytes per pixel); a generic RT may
    # differ. The host roundtrip is format-agnostic: it moves raw bytes, so only
    # the byte count of one row + the whole image matter, not the channel layout.
    bytes_per_pixel: int = 8
    # Scratch storage; ``ensure_staging`` caches the staging buffers under the
    # ``"staging"`` key and ``destroy`` removes them again.
    extra: dict = field(default_factory=dict)


[docs]
    def run(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Carry out the transfer chosen by :attr:`method`.

        :data:`TransferMethod.NONE` does nothing (the SRU is already on the
        primary). The staging-copy and dma-buf methods are forwarded to
        :meth:`run_staging_copy` and :meth:`run_dmabuf` with the same images.

        Raises:
            ValueError: If :attr:`method` is not a known transfer method.
        """
        selected = self.method
        if selected == TransferMethod.NONE:
            return
        if selected == TransferMethod.STAGING_COPY:
            handler = self.run_staging_copy
        elif selected == TransferMethod.DMABUF:
            handler = self.run_dmabuf
        else:
            raise ValueError(f"unknown cross-device transfer method {self.method!r}")
        handler(src_image=src_image, dst_image=dst_image)



[docs]
    def run_staging_copy(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Staging-copy floor: secondary image -> host staging -> primary image.

        Correct on any device pair, but slower than dma-buf because the pixels
        make a CPU roundtrip. ``src_image`` is the finished SRU colour image on
        the SECONDARY device and ``dst_image`` the primary-device image the main
        scene samples; both are expected in ``SHADER_READ_ONLY_OPTIMAL`` and are
        returned to that layout.

        This method validates its inputs and then hands over to
        ``_record_staging_copy``, which performs three steps in order:

        1. SOURCE (secondary) device, one-shot command buffer: transition
           ``src_image`` to ``TRANSFER_SRC_OPTIMAL``, ``vkCmdCopyImageToBuffer``
           into the source staging buffer (tightly packed), transition back to
           ``SHADER_READ_ONLY_OPTIMAL``, then submit on ``src.graphics_queue``.
           The submit helper waits for completion, so the host read in step 2
           only starts once the GPU copy has finished.
        2. HOST: map both staging buffers, ``memmove`` the image bytes from the
           secondary's buffer to the primary's, unmap both.
        3. DESTINATION (primary) device, one-shot command buffer: transition
           ``dst_image`` to ``TRANSFER_DST_OPTIMAL``, ``vkCmdCopyBufferToImage``
           from the destination staging buffer, transition back to
           ``SHADER_READ_ONLY_OPTIMAL``, then submit on ``dst.graphics_queue``
           and wait, so the composite pass that samples ``dst_image`` runs after
           the copy has landed.

        The staging buffers and both command pools must have been prepared with
        :meth:`ensure_staging` beforehand.

        Raises:
            ValueError: If :attr:`method` is not the staging-copy method, or if
                either image is ``None``.
            RuntimeError: If :meth:`ensure_staging` has not been called yet.
        """
        chosen = self.method
        if chosen != TransferMethod.STAGING_COPY:
            raise ValueError(f"run_staging_copy called for method {self.method!r}")
        images_missing = src_image is None or dst_image is None
        if images_missing:
            raise ValueError("run_staging_copy needs both src_image (secondary) and dst_image (primary)")
        staging = self.extra.get("staging")
        if staging is not None:
            self._record_staging_copy(src_image, dst_image, staging)
            return
        raise RuntimeError(
            "run_staging_copy: per-device staging buffers not allocated; call "
            "ensure_staging(src_pool, dst_pool) first (the offload coordinator does)."
        )



[docs]
    def ensure_staging(self, src_command_pool: Any, dst_command_pool: Any) -> dict:
        """Allocate (once) the per-device host-visible staging buffers + cache pools.

        Builds one host-visible|host-coherent buffer on each device, sized
        ``width * height * bytes_per_pixel`` (the raw colour image bytes). Cached on
        :attr:`extra` ``['staging']`` so a steady-state per-frame transfer reuses
        them; reallocated only when the size changes (a SubViewport resize). Returns
        the staging dict. Rig-side GPU allocation; never reached on this box.
        """
        from .memory import create_buffer

        size = max(1, self.width * self.height * self.bytes_per_pixel)
        staging = self.extra.get("staging")
        if staging is not None and staging.get("size") == size:
            staging["src_pool"] = src_command_pool
            staging["dst_pool"] = dst_command_pool
            return staging
        if staging is not None:
            self._destroy_staging(staging)
        src_buf, src_mem = create_buffer(
            self.src.device, self.src.physical_device, size,
            vk.VK_BUFFER_USAGE_TRANSFER_DST_BIT,
            vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        )
        dst_buf, dst_mem = create_buffer(
            self.dst.device, self.dst.physical_device, size,
            vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
            vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        )
        staging = {
            "size": size,
            "src_buf": src_buf, "src_mem": src_mem,
            "dst_buf": dst_buf, "dst_mem": dst_mem,
            "src_pool": src_command_pool, "dst_pool": dst_command_pool,
        }
        self.extra["staging"] = staging
        return staging


    def _record_staging_copy(self, src_image: Any, dst_image: Any, staging: dict) -> None:
        """Record + run the two single-submit copies + the host roundtrip (see sequence)."""
        from .memory import begin_single_time_commands, end_single_time_commands

        w, h, bpp = self.width, self.height, self.bytes_per_pixel
        row_bytes = w * bpp
        size = staging["size"]

        # --- 1. SOURCE (secondary): image -> host staging buffer -----------------
        cmd = begin_single_time_commands(self.src.device, staging["src_pool"])
        _barrier_image(
            cmd, src_image,
            vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
            vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_READ_BIT,
            vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        )
        region = vk.VkBufferImageCopy(
            bufferOffset=0, bufferRowLength=0, bufferImageHeight=0,
            imageSubresource=vk.VkImageSubresourceLayers(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0, baseArrayLayer=0, layerCount=1,
            ),
            imageOffset=vk.VkOffset3D(x=0, y=0, z=0),
            imageExtent=vk.VkExtent3D(width=w, height=h, depth=1),
        )
        vk.vkCmdCopyImageToBuffer(
            cmd, src_image, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, staging["src_buf"], 1, [region],
        )
        _barrier_image(
            cmd, src_image,
            vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            vk.VK_ACCESS_TRANSFER_READ_BIT, vk.VK_ACCESS_SHADER_READ_BIT,
            vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        )
        # end_single_time_commands submits + waits idle on the secondary queue: that
        # is the src_done fence-equivalent (the host read below cannot start until
        # the secondary GPU copy has fully completed).
        end_single_time_commands(self.src.device, self.src.graphics_queue, staging["src_pool"], cmd)

        # --- 2. HOST: secondary staging -> primary staging -----------------------
        ffi = vk.ffi
        src_ptr = vk.vkMapMemory(self.src.device, staging["src_mem"], 0, size, 0)
        dst_ptr = vk.vkMapMemory(self.dst.device, staging["dst_mem"], 0, size, 0)
        ffi.memmove(dst_ptr, src_ptr, h * row_bytes)
        vk.vkUnmapMemory(self.src.device, staging["src_mem"])
        vk.vkUnmapMemory(self.dst.device, staging["dst_mem"])

        # --- 3. DESTINATION (primary): host staging -> primary image -------------
        cmd = begin_single_time_commands(self.dst.device, staging["dst_pool"])
        _barrier_image(
            cmd, dst_image,
            vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_WRITE_BIT,
            vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        )
        vk.vkCmdCopyBufferToImage(
            cmd, staging["dst_buf"], dst_image, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, [region],
        )
        _barrier_image(
            cmd, dst_image,
            vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            vk.VK_ACCESS_TRANSFER_WRITE_BIT, vk.VK_ACCESS_SHADER_READ_BIT,
            vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        )
        # dst_ready fence-equivalent: waits idle on the primary queue, so the main
        # composite pass that samples dst_image runs strictly after the copy lands.
        end_single_time_commands(self.dst.device, self.dst.graphics_queue, staging["dst_pool"], cmd)

    def _destroy_staging(self, staging: dict) -> None:
        """Destroy the staging buffer and free its memory on each device, source first."""
        per_device = (
            (self.src, "src_buf", "src_mem"),
            (self.dst, "dst_buf", "dst_mem"),
        )
        for slot, buffer_key, memory_key in per_device:
            vk.vkDestroyBuffer(slot.device, staging[buffer_key], None)
            vk.vkFreeMemory(slot.device, staging[memory_key], None)


[docs]
    def destroy(self) -> None:
        """Free the per-device staging buffers this transfer allocated (rig-side)."""
        staging = self.extra.pop("staging", None)
        if staging is not None:
            self._destroy_staging(staging)



[docs]
    def run_dmabuf(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Zero-copy dma-buf transfer (VK_KHR_external_memory_fd). Rig optimisation.

        Export the secondary colour image's memory as an opaque fd / dma-buf,
        import it on the primary as an external image, and composite directly with
        a single cross-device semaphore wait (no host roundtrip). Requires the
        external-memory-fd extension enabled on BOTH devices at init.

        Gated: raises until the rig path is implemented and the
        ``external_memory_fd`` capability is enabled.
        """
        raise NotImplementedError(
            "dma-buf zero-copy transfer (VK_KHR_external_memory_fd) is the rig "
            "optimisation: enable the external-memory-fd extension on both devices "
            "at init, export the secondary colour image memory as an fd, import it "
            "on the primary, and composite with a cross-device semaphore. Falls back "
            "to TransferMethod.STAGING_COPY when the capability is off (this box). "
            "Verified on the 4x Arc Pro B70 rig."
        )




# --- SRU offload routing (the SubViewport offload seam) ----------------------



[docs]
@dataclass(frozen=True, slots=True)
class OffloadRoute:
    """The per-SRU offload decision the recording path consults.

    Computed once per frame by :class:`SRUOffloadCoordinator.plan` from the
    ordered SRU list. ``device_index`` 0 means "render this SRU on the primary,
    exactly as today" (``offloaded`` is ``False``, ``transfer`` is
    :data:`TransferMethod.NONE`); a non-zero index means the SRU renders on that
    secondary's renderer and its colour image is moved to the primary via
    :attr:`transfer` before the main pass samples it.
    """

    sru_id: int
    device_index: int
    transfer: str


[docs]
    @property
    def offloaded(self) -> bool:
        """True when this SRU renders on a secondary device (not GPU 0)."""
        return self.device_index != 0





[docs]
class SRUOffloadCoordinator:
    """Decides + drives where each SubViewport SRU renders across the devices.

    Glue between the (already-tested) :func:`assign_srus` policy, the
    :func:`select_transfer_method` selector, and the :class:`MultiDeviceManager`
    device topology. The recording path (``SceneAdapter.render_sru_from_plan`` /
    ``render_to_target``) consults a coordinator, when one is present, to decide
    per SRU whether to take today's primary-device path or route the SRU to a
    secondary device and transfer the result back.

    Constructed only when the manager is actively multi-GPU (opted in AND >= 2
    devices). On the single-GPU / unopted path the engine builds **no**
    coordinator, so the recording path's ``coordinator is None`` branch is taken
    and the frame is byte-identical to today.

    The decision logic (``plan`` / ``route_for``) is pure and unit-tested with no
    Vulkan. :meth:`render_offloaded` is the seam where a secondary-assigned SRU
    would be recorded on its device's renderer and transferred back; that needs a
    per-device renderer (``DeviceSlot.renderer``), which is ``None`` on every box
    without the rig-side per-device-renderer construction, so it raises a clear,
    capability-gated error rather than emitting unverifiable cross-device code.
    """

    def __init__(self, manager: MultiDeviceManager, capabilities: RenderCapabilities | None = None,
                 *, prefer_dmabuf: bool = True, content_scale: tuple[float, float] = (1.0, 1.0),
                 secondary_renderer_factory: Any = None) -> None:
        self._manager = manager
        self._capabilities = capabilities
        self._prefer_dmabuf = prefer_dmabuf
        self._content_scale = content_scale
        self._routes: dict[int, OffloadRoute] = {}
        # Rig-injected builder: ``factory(facade) -> Renderer`` constructs a
        # ``Renderer(facade)`` and runs its GPU ``setup()`` on the secondary device.
        # The engine injects it (on the rig) so this module never imports the heavy
        # renderer / triggers a GPU build it cannot verify on this single-GPU box.
        # ``None`` => no secondary renderer can be built and :meth:`render_offloaded`
        # raises the clear rig-completion error (the path here today).
        self._secondary_renderer_factory = secondary_renderer_factory
        # Per-secondary-device lazily-built render state, keyed by device_index:
        #   {index: {"facade": SecondaryRenderContext, "residency": SecondaryResidency,
        #            "target": RenderTarget, "transfer": CrossDeviceTransfer}}
        # Built once, reused every frame; torn down in :meth:`destroy`.
        self._secondary: dict[int, dict] = {}
        # Free per-secondary GPU resources before the manager destroys the secondary
        # devices (they own those resources). No-op on this box (nothing is built).
        manager.register_teardown(self.destroy)


[docs]
    @property
    def active(self) -> bool:
        """True only when the backing manager is an opted-in multi-device renderer."""
        return self._manager.multi_gpu



[docs]
    def plan(self, srus: list[Any], *, sru_id: Any = None, cost: Any = None) -> list[OffloadRoute]:
        """Build this frame's routes from the ordered SRU list and cache them by id.

        The result holds one :class:`OffloadRoute` per assignment, in the order
        :func:`assign_srus` returns them (INPUT order, so the P1
        producer-before-consumer topological order is preserved). An inactive
        coordinator (single-GPU / unopted) plans against a device count of 1, so
        every route stays on GPU 0 with :data:`TransferMethod.NONE` and the caller
        takes today's path.

        ``sru_id`` / ``cost`` are passed straight to :func:`assign_srus`. They let a
        caller whose items are not ``SubViewportSRU`` (e.g. the synchronous path's
        live nodes) supply the id and offload-cost accessors.
        """
        if self.active:
            device_count = self._manager.device_count
        else:
            device_count = 1
        assignments = assign_srus(srus, device_count, sru_id=sru_id, cost=cost)

        # Replace the previous frame's cache, then fill it as routes are produced.
        by_id: dict[int, OffloadRoute] = {}
        self._routes = by_id
        ordered: list[OffloadRoute] = []
        for assignment in assignments:
            chosen = select_transfer_method(assignment, self._capabilities, prefer_dmabuf=self._prefer_dmabuf)
            route = OffloadRoute(
                sru_id=assignment.sru_id,
                device_index=assignment.device_index,
                transfer=chosen,
            )
            ordered.append(route)
            by_id[assignment.sru_id] = route
        return ordered



[docs]
    def route_for(self, sru_id: int) -> OffloadRoute | None:
        """The cached route for ``sru_id`` from the last :meth:`plan`, or ``None``.

        ``None`` means "no decision recorded" (the SRU was not in the planned
        list); callers treat that as "render on the primary", today's path.
        """
        return self._routes.get(sru_id)



[docs]
    def transfer_for(self, route: OffloadRoute, *, width: int = 0, height: int = 0) -> CrossDeviceTransfer:
        """Create the :class:`CrossDeviceTransfer` that serves an offloaded ``route``.

        The source is the slot at ``route.device_index``, the destination is the
        primary slot, and the method is ``route.transfer``. ``width`` and
        ``height`` are passed through unchanged. Keeping the pairing here makes the
        rig wiring a single seam.
        """
        manager = self._manager
        source_slot = manager.slot(route.device_index)
        primary_slot = manager.primary
        return CrossDeviceTransfer(
            src=source_slot, dst=primary_slot, method=route.transfer, width=width, height=height,
        )



[docs]
    def render_offloaded(self, route: OffloadRoute) -> Any:
        """Return the per-device renderer for an offloaded SRU, building it if needed.

        Resolution order:

        1. An explicitly attached renderer (``MultiDeviceManager.attach_renderer``)
           is returned as-is (the rig may pre-build + bind one; also the unit-test
           seam).
        2. Otherwise, if a :attr:`_secondary_renderer_factory` was injected (rig),
           lazily build the secondary :class:`SecondaryRenderContext` facade, run the
           factory to construct + ``setup()`` a ``Renderer(facade)`` on the secondary
           device, bind it to the slot, and return it.
        3. With neither (this single-GPU box: no factory is ever injected because no
           secondary device exists), raise the clear, capability-gated rig-completion
           error. NEVER reached on the single-GPU / unopted path: that path has no
           coordinator at all, so the seam's ``coordinator is None`` branch keeps it
           byte-identical.
        """
        slot = self._manager.slot(route.device_index)
        if slot.renderer is not None:
            return slot.renderer
        ctx = self._ensure_secondary(route.device_index)
        if ctx is None or ctx["facade"].renderer is None:
            raise NotImplementedError(
                f"Multi-GPU SubViewport offload to secondary device {route.device_index} "
                f"({slot.name!r}) needs that device's own renderer (pipelines / descriptor "
                "pools / transform+material SSBOs / mesh+texture residency). Inject a "
                "secondary_renderer_factory (the engine does on the rig) or bind one via "
                "MultiDeviceManager.attach_renderer(index, renderer); see multi_device.py "
                "module docstring + design decision D8. Verified on the 4x Arc Pro B70 rig."
            )
        return ctx["facade"].renderer


    def _ensure_secondary(self, device_index: int) -> dict | None:
        """Lazily build + cache the secondary render context for ``device_index``.

        Builds (once) the :class:`SecondaryRenderContext` facade and, when a
        :attr:`_secondary_renderer_factory` is present, a ``Renderer(facade)`` set up
        on the secondary device + a :class:`SecondaryResidency` helper. Returns the
        cached dict, or ``None`` when no factory is available (no secondary renderer
        can exist; the caller raises the rig-completion error). GPU work happens only
        inside the injected factory, so this stays import-safe and is never reached on
        the single-GPU path.
        """
        cached = self._secondary.get(device_index)
        if cached is not None:
            return cached
        if self._secondary_renderer_factory is None:
            return None
        from .secondary_engine import SecondaryRenderContext, SecondaryResidency

        slot = self._manager.slot(device_index)
        facade = SecondaryRenderContext(
            slot, capabilities=self._capabilities, content_scale=self._content_scale,
        )
        facade.ensure_command_pool()
        # The render pass MUST exist before the factory builds Renderer(facade).setup():
        # pipelines compile against facade.render_pass (it is format-compatible with the
        # per-SRU RenderTarget; pipelines use dynamic viewport/scissor so extent is free).
        facade.ensure_offscreen_render_pass()
        renderer = self._secondary_renderer_factory(facade)
        facade.attach_renderer(renderer)
        cached = {"facade": facade, "residency": SecondaryResidency(facade), "target": None, "transfer": None}
        self._secondary[device_index] = cached
        return cached


[docs]
    def render_sru_offloaded(self, sru: Any, primary_dst_image: Any) -> bool:
        """Render one SRU on its secondary device and transfer it to the primary image.

        End-to-end multi-GPU SubViewport offload for one SRU (the
        :class:`~simvx.graphics.renderer.render_packet.SubViewportSRU` plan):

        1. Resolve the SRU's route (must be offloaded; else returns ``False`` so the
           caller takes the primary path).
        2. Ensure the secondary render context (facade + ``Renderer`` + residency).
        3. Mirror the SRU's mesh geometry (and, when textured-residency is wired, its
           sampled textures) onto the secondary device via :class:`SecondaryResidency`.
        4. Size / create the secondary offscreen :class:`RenderTarget` for this SRU.
        5. Record + submit the SRU's draws into that target on the secondary device
           (one submit on the secondary graphics queue; the offscreen RenderTarget
           leaves the colour image in ``SHADER_READ_ONLY_OPTIMAL``).
        6. Run the :class:`CrossDeviceTransfer` (staging-copy floor) to move the
           secondary colour image into ``primary_dst_image`` (the SubViewport's
           primary-device bindless image the main scene samples).

        Returns ``True`` when the SRU was handled on a secondary, ``False`` when it
        was not offloaded (caller renders it on the primary as today). The actual
        per-device GPU record + submit (step 5) is delegated to the injected secondary
        renderer; this method owns the orchestration + the cross-device transfer.
        Exercised on the 4x Arc Pro B70 rig; never reached on the single-GPU path.
        """
        sru_id = getattr(sru, "sru_id", None)
        route = self.route_for(sru_id) if sru_id is not None else None
        if route is None or not route.offloaded:
            return False
        ctx = self._ensure_secondary(route.device_index)
        if ctx is None or ctx["facade"].renderer is None:
            # No secondary renderer -> defer to render_offloaded's clear raise so the
            # caller surfaces the precise rig-completion message rather than silently
            # dropping the SRU.
            self.render_offloaded(route)
            return False

        facade = ctx["facade"]
        residency = ctx["residency"]
        width, height = int(sru.width), int(sru.height)

        # 3. Mirror geometry the SRU draws onto the secondary device. Vertex-colour
        # SRUs need only meshes; textured-SRU residency mirrors sampled textures too
        # (FLAGGED: needs source pixels, see SecondaryResidency.ensure_textures). The
        # source CPU arrays come from the primary registry's retained geometry (the
        # engine enables retention when multi-GPU is active); a VkBuffer cannot cross
        # devices, only the arrays can.
        residency.ensure_meshes(self._sru_mesh_payloads(sru, self._primary_mesh_registry()))

        # 4. Secondary offscreen target sized to the SRU (recreate on resize).
        target = ctx["target"]
        if target is None or (target.width, target.height) != (width, height):
            from ..renderer.render_target import RenderTarget

            if target is not None:
                target.destroy()
            target = RenderTarget(
                facade.ctx.device, facade.ctx.physical_device, width, height,
                colour_format=vk.VK_FORMAT_R16G16B16A16_SFLOAT,
                use_depth=True, samplable_depth=True,
                queue=facade.ctx.graphics_queue, command_pool=facade.ctx.command_pool,
            )
            ctx["target"] = target
            ctx["transfer"] = None  # size changed -> staging must be re-sized

        # 5. Record + submit the SRU on the secondary renderer into ``target``. The
        # injected secondary-renderer factory returns an object exposing
        # ``render_sru_offscreen(sru, target)`` (duck-typed): on the rig it wraps a
        # ``Renderer(facade)`` and mirrors the primary ``render_sru_from_plan`` slice
        # model on its OWN device (reserve a transform-SSBO slice, upload the SRU
        # transforms, record ``render_scene_content`` into ``target``, submit on the
        # secondary graphics queue + wait). The core shared ``Renderer`` is NOT made
        # to grow this method, so the single-GPU path stays byte-identical.
        recorder = facade.renderer
        # Bind residency (for primary->secondary MeshHandle remap) + mirror the
        # primary material/light SSBO contents so the offloaded draws shade
        # identically (set once per build is insufficient: materials change per
        # frame, so mirror every frame from the primary renderer).
        if hasattr(recorder, "set_residency"):
            recorder.set_residency(residency)
        primary_renderer = self._manager.primary.renderer
        if primary_renderer is not None and hasattr(recorder, "set_materials"):
            recorder.set_materials(getattr(primary_renderer, "_materials", None))
            recorder.set_lights(getattr(primary_renderer, "_lights", None))
        recorder.render_sru_offscreen(sru, target)

        # 6. Transfer the secondary colour image into the primary bindless image.
        transfer = ctx["transfer"]
        if transfer is None:
            transfer = self.transfer_for(route, width=width, height=height)
            transfer.bytes_per_pixel = 8  # R16G16B16A16_SFLOAT
            ctx["transfer"] = transfer
        transfer.ensure_staging(facade.ctx.command_pool, self._primary_pool())
        transfer.run(src_image=target.colour_image, dst_image=primary_dst_image)
        return True


    def _primary_pool(self) -> Any:
        """The primary device's command pool (for the destination half of the copy)."""
        primary = self._manager.primary
        renderer = primary.renderer
        # The primary renderer's engine exposes ctx.command_pool; resolve via the
        # attached renderer's engine when present, else the slot has no pool handle
        # and the rig wiring supplies it. Kept as a single resolution seam.
        eng = getattr(renderer, "_engine", None)
        ctx = getattr(eng, "ctx", None)
        if ctx is not None:
            return ctx.command_pool
        raise RuntimeError(
            "primary command pool unavailable for cross-device transfer; attach the "
            "primary renderer to slot 0 (MultiDeviceManager.attach_renderer(0, renderer))."
        )

    def _primary_mesh_registry(self) -> Any:
        """The primary renderer's :class:`MeshRegistry` (source of retained geometry).

        Resolved off the primary slot's attached renderer engine
        (``renderer._engine.mesh_registry``). The engine enables geometry retention
        on that registry when multi-GPU is active, so :meth:`get_geometry` returns the
        CPU arrays an offloaded SRU's meshes need to be mirrored onto a secondary
        device. ``None`` when no primary renderer is attached (the residency mirror
        then skips with a warning rather than indexing the wrong device's buffers).
        """
        renderer = self._manager.primary.renderer
        eng = getattr(renderer, "_engine", None)
        return getattr(eng, "mesh_registry", None)

    @staticmethod
    def _sru_mesh_payloads(sru: Any, primary_registry: Any) -> list[tuple[int, Any, Any]]:
        """Extract ``(residency_key, vertices, indices)`` payloads from an SRU's instances.

        ``residency_key`` is ``id(mh)`` (the per-instance handle identity the secondary
        recorder remaps against in :meth:`SecondarySRURenderer._remap_instances`), and
        the CPU ``vertices``/``indices`` are fetched from the primary
        :class:`~simvx.graphics.renderer.mesh_registry.MeshRegistry` by the handle's
        registry ``id`` (the device-independent source arrays the registry retained
        when multi-GPU is active). A handle whose geometry is not retained (no
        ``primary_registry`` or a registry without retention) is skipped, and the
        recorder drops that instance with a warning rather than indexing the wrong
        device's buffers. Dedup by ``id(mh)`` so a shared mesh mirrors once per device.
        """
        if primary_registry is None or not hasattr(primary_registry, "get_geometry"):
            return []
        payloads: list[tuple[int, Any, Any]] = []
        seen: set[int] = set()
        for entry in list(getattr(sru, "instances", [])) + list(getattr(sru, "skinned_instances", [])):
            mh = entry[0]
            key = id(mh)
            if key in seen:
                continue
            seen.add(key)
            geom = primary_registry.get_geometry(getattr(mh, "id", None))
            if geom is not None:
                payloads.append((key, geom[0], geom[1]))
        return payloads


[docs]
    def destroy(self) -> None:
        """Tear down all lazily-built secondary render contexts (facade + target + transfer)."""
        for ctx in self._secondary.values():
            transfer = ctx.get("transfer")
            if transfer is not None:
                transfer.destroy()
            target = ctx.get("target")
            if target is not None:
                target.destroy()
            facade = ctx.get("facade")
            if facade is not None:
                facade.destroy()
        self._secondary.clear()