Source code for simvx.graphics.gpu.multi_device

"""Explicit multi-adapter (multi-GPU) foundation: D8 workload-split offload.

This is the **foundation** wave of design decision D8. It builds the gated
plumbing for an explicit-multi-adapter renderer (one independent ``VkDevice``
per physical GPU) where whole SubViewport / offscreen scene-render-units (SRUs)
are offloaded to secondary GPUs and composited on the primary (GPU 0). It is
deliberately **off by default**: a single-GPU box, and any multi-GPU box that
does not opt in, runs today's single-device path byte-identical (no extra
device, no transfer work, no behaviour change). The active multi-device path is
verified on a 4x Arc Pro B70 rig, not on this single-GPU dev box.

Three coherent, unit-testable pieces live here:

1. :class:`MultiDeviceManager` enumerates physical devices and, only when
   ``physical_device_count > 1`` **and** the caller opted in, creates an
   independent logical device per physical GPU (reusing
   :func:`~simvx.graphics.gpu.device.create_logical_device`). When the count is
   1 or the opt-in is off it holds exactly the existing single (primary) device
   and is a transparent passthrough.
2. :func:`assign_srus` is the pure device-assignment policy (no Vulkan): given
   the ordered SRUs and a device count it decides which render on GPU 0 vs which
   offload to GPU 1+. TAA-safe by construction (each SRU keeps its temporal
   history on its one assigned device, so there is no cross-device reprojection).
3. :class:`CrossDeviceTransfer` selects how a finished offscreen colour image
   moves from a secondary device to the primary for compositing. The
   staging-copy path (secondary -> host-visible staging -> primary) is the
   guaranteed floor and is the only path implemented end-to-end; the dma-buf /
   ``VK_KHR_external_memory_fd`` zero-copy path is the rig optimisation, gated
   behind a capability and currently raising a clear, actionable error.

REALITY CHECK (honest scope). Rendering an SRU on a *second* ``VkDevice``
requires that device to own its full set of rendering resources: its own
pipelines, descriptor pools, transform/material SSBOs, and mesh+texture
residency. The current :class:`~simvx.graphics.renderer.forward.Renderer` is
built around exactly one device. Duplicating it per device is a large,
GPU-bound refactor that cannot be functionally verified on this single-GPU box.
So this module stops at a clean seam: the device manager, the assignment
policy, and the transfer interface are real and tested; the per-device renderer
construction + the actual offload-record-and-composite loop are the documented
rig-side completion (see :meth:`MultiDeviceManager.attach_renderer` /
:attr:`DeviceSlot.renderer`).
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

import vulkan as vk

from .device import QueueFamilies, create_logical_device

if TYPE_CHECKING:
    from .capabilities import RenderCapabilities

log = logging.getLogger(__name__)

# Public API of this module. ``select_transfer_method`` is exported together
# with the ``TransferMethod`` constants it returns so a star-import exposes the
# complete transfer-selection surface.
__all__ = [
    "DeviceSlot",
    "MultiDeviceManager",
    "SRUAssignment",
    "assign_srus",
    "CrossDeviceTransfer",
    "TransferMethod",
    "select_transfer_method",
    "OffloadRoute",
    "SRUOffloadCoordinator",
]


# --- Device topology ---------------------------------------------------------


@dataclass
class DeviceSlot:
    """A physical GPU paired with its logical device inside the multi-device renderer.

    The slot with ``index == 0`` is the primary (compositing) GPU; slots 1 and
    up are secondaries that render offloaded SRUs. A single-GPU run has exactly
    one slot, index 0, wrapping the engine's existing device.

    ``renderer`` holds the per-device
    :class:`~simvx.graphics.renderer.forward.Renderer`. It defaults to ``None``
    and is assigned through :meth:`MultiDeviceManager.attach_renderer`; a
    secondary keeps ``None`` until a per-device renderer is handed in, so no
    placeholder renderer is ever fabricated here.
    """

    # Position in the manager's slot list; 0 marks the primary GPU.
    index: int
    # Vulkan physical-device handle and the queue families resolved for it.
    physical_device: Any
    queue_families: QueueFamilies
    # Logical device and its queues; all optional until the device is created.
    device: Any = None
    graphics_queue: Any = None
    present_queue: Any = None
    compute_queue: Any = None
    transfer_queue: Any = None
    # Human-readable device name (diagnostics only).
    name: str = ""
    # Per-device renderer, attached later (see class docstring).
    renderer: Any = None

    @property
    def is_primary(self) -> bool:
        """Whether this slot is the primary (compositing) GPU, i.e. slot 0."""
        return self.index == 0
class MultiDeviceManager:
    """Owns the per-physical-GPU logical devices for the D8 offload renderer.

    Construct with the primary device's already-created handles (the engine's
    existing single device, so the primary slot is never re-created) plus the
    enumerated physical devices and the opt-in flag.

    When ``enabled`` is true and more than one physical device is present, a
    secondary :class:`DeviceSlot` is created per additional physical GPU with
    its own independent ``VkDevice`` via :func:`create_logical_device`.
    Otherwise the manager holds exactly the single primary slot and
    :attr:`multi_gpu` is ``False``.

    The manager does NOT own the primary device's lifetime; :meth:`destroy`
    only tears down the *secondary* devices it created itself.
    """

    def __init__(
        self,
        *,
        primary_physical_device: Any,
        primary_queue_families: QueueFamilies,
        primary_device: Any,
        primary_graphics_queue: Any,
        primary_present_queue: Any,
        physical_devices: list[Any],
        enabled: bool,
        capabilities: RenderCapabilities | None = None,
        primary_compute_queue: Any = None,
        primary_transfer_queue: Any = None,
        find_queue_families: Any = None,
    ) -> None:
        """Wrap the primary device and, when opted in, create the secondaries.

        Args:
            primary_physical_device: Physical-device handle of the primary GPU.
            primary_queue_families: Queue families of the primary device.
            primary_device: The engine's existing logical device (slot 0).
            primary_graphics_queue: Graphics queue of the primary device.
            primary_present_queue: Present queue of the primary device.
            physical_devices: Every enumerated physical device (copied).
            enabled: Caller opt-in for the multi-device path.
            capabilities: Optional capability snapshot; its
                ``external_memory_fd`` attribute is read when creating
                secondaries.
            primary_compute_queue: Optional compute queue of the primary.
            primary_transfer_queue: Optional transfer queue of the primary.
            find_queue_families: Optional callable ``physical_device ->
                QueueFamilies | None`` used to resolve families for each
                secondary.
        """
        self._capabilities = capabilities
        self._physical_devices = list(physical_devices)
        # True once a secondary logical device has been created with the
        # external-memory-fd option requested (see _create_secondaries). Stays
        # False when no secondary is created.
        self.external_memory_fd_enabled = False
        # Teardown callbacks for per-secondary-device GPU resources. They MUST
        # run before the secondary VkDevices are destroyed; destroy() invokes
        # them first.
        self._teardown_hooks: list[Any] = []

        # The primary slot wraps the engine's existing device. It is slot 0 and
        # the manager never creates or destroys its VkDevice.
        primary = DeviceSlot(
            index=0,
            physical_device=primary_physical_device,
            queue_families=primary_queue_families,
            device=primary_device,
            graphics_queue=primary_graphics_queue,
            present_queue=primary_present_queue,
            compute_queue=primary_compute_queue,
            transfer_queue=primary_transfer_queue,
            name=_device_name(primary_physical_device),
        )
        self._slots: list[DeviceSlot] = [primary]

        # Multi-GPU is active only when the caller opted in AND there is more
        # than one physical device. Either condition false => single-slot
        # passthrough.
        self._multi_gpu = bool(enabled) and len(self._physical_devices) > 1
        if self._multi_gpu:
            self._create_secondaries(primary_physical_device, find_queue_families)
        elif enabled:
            # Opted in, but at most one physical device was enumerated.
            log.info(
                "multi-GPU requested but only %d physical device(s): single-device path",
                len(self._physical_devices),
            )

    def _create_secondaries(self, primary_pd: Any, find_queue_families: Any) -> None:
        """Create one independent logical device per *additional* physical GPU.

        ``find_queue_families`` resolves a :class:`QueueFamilies` for a
        secondary physical device. When it is omitted, or returns ``None`` for
        a device, that device is skipped with a warning rather than guessing a
        family.
        """
        # Request the external-memory-fd option on every secondary only when
        # the capability snapshot reports it; otherwise keep the staging-copy
        # floor. The public flag flips to True only after a secondary was
        # actually created with the option requested.
        want_ext_mem = bool(getattr(self._capabilities, "external_memory_fd", False))
        for pd in self._physical_devices:
            # Recognise the primary by identity OR by handle equality: two
            # wrapper objects for the same underlying handle (for example from
            # separate enumeration calls) are distinct Python objects, and an
            # identity-only test would then create a second logical device on
            # the primary GPU.
            if pd is primary_pd or pd == primary_pd:
                continue
            qf = find_queue_families(pd) if find_queue_families is not None else None
            if qf is None:
                log.warning("multi-GPU: no graphics queue family resolved for %s, skipping", _device_name(pd))
                continue
            device, gq, pq, cq, tq = create_logical_device(pd, qf, external_memory_fd=want_ext_mem)
            if want_ext_mem:
                self.external_memory_fd_enabled = True
            self._slots.append(
                DeviceSlot(
                    index=len(self._slots),
                    physical_device=pd,
                    queue_families=qf,
                    device=device,
                    graphics_queue=gq,
                    present_queue=pq,
                    compute_queue=cq,
                    transfer_queue=tq,
                    name=_device_name(pd),
                )
            )
            log.info("multi-GPU: created secondary logical device %d (%s)", len(self._slots) - 1, _device_name(pd))

    @property
    def multi_gpu(self) -> bool:
        """True only when an opted-in multi-device renderer is active (>= 2 slots)."""
        return self._multi_gpu and len(self._slots) > 1

    @property
    def device_count(self) -> int:
        """Number of device slots the manager holds (1 on the single-GPU path)."""
        return len(self._slots)

    @property
    def primary(self) -> DeviceSlot:
        """The primary (slot 0) device."""
        return self._slots[0]

    @property
    def secondaries(self) -> list[DeviceSlot]:
        """A new list of every slot after the primary (empty on the single-GPU path)."""
        return self._slots[1:]

    @property
    def slots(self) -> list[DeviceSlot]:
        """A shallow copy of all slots, primary first."""
        return list(self._slots)

    def slot(self, index: int) -> DeviceSlot:
        """Return the slot at ``index`` (raises ``IndexError`` when out of range)."""
        return self._slots[index]

    def attach_renderer(self, index: int, renderer: Any) -> None:
        """Bind a per-device renderer to slot ``index``.

        This setter is the seam where the constructed per-device renderer is
        handed back to the manager; it only stores the object on the slot.
        """
        self._slots[index].renderer = renderer

    def register_teardown(self, hook: Any) -> None:
        """Register a zero-argument callback that :meth:`destroy` runs first.

        Intended for freeing per-secondary GPU resources while the secondary
        logical devices are still alive.
        """
        self._teardown_hooks.append(hook)

    def destroy(self) -> None:
        """Destroy only the SECONDARY logical devices this manager created.

        The primary device is left untouched. Safe to call on the single-GPU
        path (no secondaries => nothing is destroyed). Registered teardown
        hooks run FIRST, in registration order, so per-secondary GPU resources
        are freed before the devices.

        A hook's exception propagates to the caller. Each hook is removed from
        the registry before it is invoked, so calling :meth:`destroy` again
        after a failure resumes with the hooks that have not run yet instead of
        re-running (and double-freeing through) the ones that already did.
        """
        while self._teardown_hooks:
            hook = self._teardown_hooks.pop(0)
            hook()
        for slot in self._slots[1:]:
            if slot.device is not None:
                # Let in-flight work finish before the device goes away.
                vk.vkDeviceWaitIdle(slot.device)
                vk.vkDestroyDevice(slot.device, None)
                slot.device = None
        del self._slots[1:]
        self._multi_gpu = False
def _barrier_image( cmd: Any, image: Any, old_layout: int, new_layout: int, src_access: int, dst_access: int, src_stage: int, dst_stage: int, ) -> None: """Record a single colour-image layout barrier into ``cmd`` (no submit). Inline barrier for the cross-device staging copy: unlike :func:`~simvx.graphics.gpu.memory.transition_image_layout` it records into a caller-owned command buffer rather than submitting its own one-shot, so the secondary->buffer and buffer->primary copies each batch their two barriers + the copy into ONE submit. Colour aspect, single mip / array layer. """ barrier = vk.VkImageMemoryBarrier( srcAccessMask=src_access, dstAccessMask=dst_access, oldLayout=old_layout, newLayout=new_layout, srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED, dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED, image=image, subresourceRange=vk.VkImageSubresourceRange( aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, baseMipLevel=0, levelCount=1, baseArrayLayer=0, layerCount=1, ), ) vk.vkCmdPipelineBarrier(cmd, src_stage, dst_stage, 0, 0, None, 0, None, 1, [barrier]) def _device_name(physical_device: Any) -> str: try: props = vk.vkGetPhysicalDeviceProperties(physical_device) name = props.deviceName return name if isinstance(name, str) else name.decode("utf-8") except Exception: # noqa: BLE001 (name is diagnostic only) return "<unknown>" # --- Device-assignment policy (pure logic, no Vulkan) ------------------------
[docs] @dataclass(frozen=True, slots=True) class SRUAssignment: """Which device renders one SRU and whether its result must be transferred. ``device_index`` 0 means the SRU renders on the primary and is already resident for compositing (``needs_transfer`` is ``False``). A non-zero index means the SRU is offloaded to that secondary and its finished colour image must be moved to the primary before compositing (``needs_transfer`` is ``True``). """ sru_id: int device_index: int
[docs] @property def needs_transfer(self) -> bool: return self.device_index != 0
def _sru_cost(sru: Any) -> int: """Heuristic offload cost of one SRU: its instance + skinned-instance count. Pure and Vulkan-free. The heaviest independent SRUs are the ones worth shipping to a secondary GPU; instance count is the cheapest faithful proxy we already have on the snapshotted plan. SRUs with no countable instances fall back to cost 1 so ordering is still deterministic. """ inst = getattr(sru, "instances", None) skinned = getattr(sru, "skinned_instances", None) n = (len(inst) if inst is not None else 0) + (len(skinned) if skinned is not None else 0) return n if n > 0 else 1
def assign_srus(
    srus: list[Any],
    device_count: int,
    *,
    sru_id: Any = None,
    cost: Any = None,
) -> list[SRUAssignment]:
    """Decide which device renders each SRU (pure, unit-testable, no Vulkan).

    Each SRU is assigned to exactly one device, so its temporal history never
    has to cross devices.

    Policy:

    - ``device_count <= 1`` (or an empty ``srus``): every SRU stays on GPU 0
      and no transfer is ever flagged.
    - ``device_count >= 2``: the ``device_count - 1`` most expensive SRUs are
      offloaded, one per secondary (the heaviest to device 1, the next to
      device 2, and so on). Every other SRU stays on GPU 0. SRUs with equal
      cost are ranked in their input order, so the outcome depends only on
      the inputs and never on object addresses.

    The returned list preserves the INPUT order of ``srus``; only the device
    choice is decided here.

    Args:
        srus: Ordered SRU plans. Order is preserved in the result.
        device_count: Number of devices available
            (``MultiDeviceManager.device_count``).
        sru_id: Optional accessor ``sru -> int`` for the stable id; defaults
            to reading ``sru.sru_id``.
        cost: Optional accessor ``sru -> int`` replacing the default
            instance-count heuristic.

    Returns:
        One :class:`SRUAssignment` per input SRU, in input order.
    """
    get_id = sru_id if sru_id is not None else (lambda s: s.sru_id)
    get_cost = cost if cost is not None else _sru_cost

    if device_count <= 1 or not srus:
        return [SRUAssignment(sru_id=get_id(s), device_index=0) for s in srus]

    n_secondary = device_count - 1

    # Rank heaviest-first. sorted() is stable, so equal-cost SRUs keep their
    # input order; tie-breaking on id() would instead depend on where the
    # objects happen to live in memory and could move an SRU between devices
    # from one call to the next.
    by_cost = sorted(srus, key=lambda s: -get_cost(s))

    # The top ``n_secondary`` SRUs go to secondaries 1..n, one each; the map is
    # keyed by object identity so the result can be emitted in input order.
    offload: dict[int, int] = {
        id(sru): 1 + slot_offset for slot_offset, sru in enumerate(by_cost[:n_secondary])
    }

    return [SRUAssignment(sru_id=get_id(s), device_index=offload.get(id(s), 0)) for s in srus]
# --- Cross-device transfer interface -----------------------------------------
class TransferMethod:
    """String constants naming the available cross-device image-transfer strategies."""

    #: The SRU already lives on the primary device, so compositing samples it
    #: directly and nothing is transferred.
    NONE = "none"

    #: Secondary image -> host-visible staging buffer -> primary image. The
    #: guaranteed floor, intended to work on any pair of devices.
    STAGING_COPY = "staging_copy"

    #: Zero-copy via VK_KHR_external_memory_fd (dma-buf import/export). Needs
    #: the external-memory extensions enabled on both devices; gated until the
    #: rig path is implemented.
    DMABUF = "dmabuf"
# Master switch for the dma-buf zero-copy path. While it is False,
# select_transfer_method never returns DMABUF, whatever the caller prefers and
# whatever the capability snapshot says, because that path is not implemented
# and selecting it would only lead to an error later. Set to True only once the
# dma-buf counterpart of run_staging_copy is wired and verified on the rig.
_DMABUF_IMPLEMENTED = False


def select_transfer_method(
    assignment: SRUAssignment,
    capabilities: RenderCapabilities | None,
    *,
    prefer_dmabuf: bool = True,
) -> str:
    """Choose the transfer strategy for one SRU assignment (pure, no Vulkan).

    - An assignment that needs no transfer yields :data:`TransferMethod.NONE`.
    - Otherwise :data:`TransferMethod.DMABUF` is returned only when ALL of the
      following hold: the module-level ``_DMABUF_IMPLEMENTED`` switch is on,
      ``prefer_dmabuf`` is true, and ``capabilities`` exposes a truthy
      ``external_memory_fd_enabled`` attribute. In every other case the result
      is :data:`TransferMethod.STAGING_COPY`.

    The capability read is ``external_memory_fd_enabled`` (the extension being
    *enabled* at device creation), not merely probed as available. A missing
    attribute, or ``capabilities`` being ``None``, counts as not enabled.
    """
    if assignment.needs_transfer:
        ext_enabled = bool(getattr(capabilities, "external_memory_fd_enabled", False))
        # All three gates must agree before the zero-copy path is picked.
        dmabuf_ok = _DMABUF_IMPLEMENTED and prefer_dmabuf and ext_enabled
        return TransferMethod.DMABUF if dmabuf_ok else TransferMethod.STAGING_COPY
    return TransferMethod.NONE
[docs] @dataclass class CrossDeviceTransfer: """Moves a finished offscreen colour image from a secondary to the primary. Holds the source (secondary) and destination (primary) :class:`DeviceSlot` plus the chosen :class:`TransferMethod`. Cross-device synchronisation is explicit: each device signals a fence/timeline when its half of the copy is done, and the primary's composite waits on the import being complete. The sync handles are carried on this object so the rig wiring is a single seam. Implemented: :meth:`run_staging_copy` defines the staging-copy sequence (secondary GPU copy-to-buffer -> host-visible staging -> primary copy-to-image). The CPU-roundtrip floor: correct on any device pair, slower than dma-buf. The actual ``vkCmd*`` recording is the rig-side completion because it needs both devices' live command pools + a host-visible staging allocation per device, which cannot be exercised on this single-GPU box; the method documents and gates that precisely rather than emitting unverifiable copy code. Gated (rig): :meth:`run_dmabuf` raises a clear, actionable error until VK_KHR_external_memory_fd is enabled and the import/export is wired on the rig. """ src: DeviceSlot dst: DeviceSlot method: str # Cross-device sync handles (per-device fence/timeline). Defined as the seam; # populated by the rig wiring. The primary composite waits on ``dst_ready``. src_done: Any = None dst_ready: Any = None # Sizing of the transferred colour image (set when the SRU target is known). width: int = 0 height: int = 0 # Bytes per pixel of the colour format being moved. The SubViewport offscreen # target is ``R16G16B16A16_SFLOAT`` (8 bpp); a generic RT may differ. The host # roundtrip is format-agnostic: it moves raw bytes, so only the byte count of # one row + the whole image matter, not the channel layout. bytes_per_pixel: int = 8 extra: dict = field(default_factory=dict)
def run(self, *, src_image: Any = None, dst_image: Any = None) -> None:
    """Carry out the transfer named by :attr:`method`.

    :data:`TransferMethod.NONE` returns immediately without doing anything.
    :data:`TransferMethod.STAGING_COPY` and :data:`TransferMethod.DMABUF`
    forward both images to :meth:`run_staging_copy` and :meth:`run_dmabuf`
    respectively.

    Raises:
        ValueError: If :attr:`method` is none of the known constants.
    """
    method = self.method
    if method == TransferMethod.NONE:
        return
    if method == TransferMethod.STAGING_COPY:
        handler = self.run_staging_copy
    elif method == TransferMethod.DMABUF:
        handler = self.run_dmabuf
    else:
        raise ValueError(f"unknown cross-device transfer method {self.method!r}")
    handler(src_image=src_image, dst_image=dst_image)
def run_staging_copy(self, *, src_image: Any = None, dst_image: Any = None) -> None:
    """Staging-copy floor: secondary image -> host staging -> primary image.

    ``src_image`` is the finished SRU colour image on the SECONDARY device and
    ``dst_image`` the PRIMARY-device image that receives the pixels. Both are
    expected in ``SHADER_READ_ONLY_OPTIMAL`` and are returned to that layout.

    This method validates its preconditions and then hands over to
    :meth:`_record_staging_copy`, which performs three steps in order:

    1. On the secondary device: transition ``src_image`` to
       ``TRANSFER_SRC_OPTIMAL``, copy it into the secondary staging buffer,
       transition it back, then submit and wait for completion.
    2. On the host: map both staging allocations and copy the image bytes from
       the secondary staging buffer into the primary one.
    3. On the primary device: transition ``dst_image`` to
       ``TRANSFER_DST_OPTIMAL``, copy the primary staging buffer into it,
       transition it back, then submit and wait for completion.

    The staging buffers and command pools must already be cached under
    ``extra["staging"]`` by :meth:`ensure_staging`.

    Raises:
        ValueError: If :attr:`method` is not
            :data:`TransferMethod.STAGING_COPY`, or either image is ``None``.
        RuntimeError: If no staging buffers have been allocated yet.
    """
    if self.method != TransferMethod.STAGING_COPY:
        raise ValueError(f"run_staging_copy called for method {self.method!r}")

    image_missing = src_image is None or dst_image is None
    if image_missing:
        raise ValueError("run_staging_copy needs both src_image (secondary) and dst_image (primary)")

    staging = self.extra.get("staging")
    if staging is not None:
        self._record_staging_copy(src_image, dst_image, staging)
        return
    raise RuntimeError(
        "run_staging_copy: per-device staging buffers not allocated; call "
        "ensure_staging(src_pool, dst_pool) first (the offload coordinator does)."
    )
[docs] def ensure_staging(self, src_command_pool: Any, dst_command_pool: Any) -> dict: """Allocate (once) the per-device host-visible staging buffers + cache pools. Builds one host-visible|host-coherent buffer on each device, sized ``width * height * bytes_per_pixel`` (the raw colour image bytes). Cached on :attr:`extra` ``['staging']`` so a steady-state per-frame transfer reuses them; reallocated only when the size changes (a SubViewport resize). Returns the staging dict. Rig-side GPU allocation; never reached on this box. """ from .memory import create_buffer size = max(1, self.width * self.height * self.bytes_per_pixel) staging = self.extra.get("staging") if staging is not None and staging.get("size") == size: staging["src_pool"] = src_command_pool staging["dst_pool"] = dst_command_pool return staging if staging is not None: self._destroy_staging(staging) src_buf, src_mem = create_buffer( self.src.device, self.src.physical_device, size, vk.VK_BUFFER_USAGE_TRANSFER_DST_BIT, vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ) dst_buf, dst_mem = create_buffer( self.dst.device, self.dst.physical_device, size, vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT, vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ) staging = { "size": size, "src_buf": src_buf, "src_mem": src_mem, "dst_buf": dst_buf, "dst_mem": dst_mem, "src_pool": src_command_pool, "dst_pool": dst_command_pool, } self.extra["staging"] = staging return staging
def _record_staging_copy(self, src_image: Any, dst_image: Any, staging: dict) -> None:
    """Record + run the two single-submit copies + the host roundtrip.

    Sequence:

    1. Source (secondary) device: transition ``src_image`` to
       ``TRANSFER_SRC_OPTIMAL``, copy it into ``staging['src_buf']``, transition
       it back to ``SHADER_READ_ONLY_OPTIMAL``, submit and wait.
    2. Host: map both staging allocations and ``memmove`` the pixel bytes from
       the secondary staging memory to the primary staging memory.
    3. Destination (primary) device: transition ``dst_image`` to
       ``TRANSFER_DST_OPTIMAL``, copy ``staging['dst_buf']`` into it, transition
       it back to ``SHADER_READ_ONLY_OPTIMAL``, submit and wait.

    Args:
        src_image: colour image on the secondary device (expected in
            ``SHADER_READ_ONLY_OPTIMAL`` on entry).
        dst_image: colour image on the primary device (expected in
            ``SHADER_READ_ONLY_OPTIMAL`` on entry).
        staging: the dict built by :meth:`ensure_staging`.
    """
    from .memory import begin_single_time_commands, end_single_time_commands

    w, h, bpp = self.width, self.height, self.bytes_per_pixel
    row_bytes = w * bpp
    size = staging["size"]

    # --- 1. SOURCE (secondary): image -> host staging buffer -----------------
    cmd = begin_single_time_commands(self.src.device, staging["src_pool"])
    _barrier_image(
        cmd,
        src_image,
        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        vk.VK_ACCESS_SHADER_READ_BIT,
        vk.VK_ACCESS_TRANSFER_READ_BIT,
        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
    )
    # Tightly packed full-image copy region; reused for the upload in step 3.
    region = vk.VkBufferImageCopy(
        bufferOffset=0,
        bufferRowLength=0,
        bufferImageHeight=0,
        imageSubresource=vk.VkImageSubresourceLayers(
            aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
            mipLevel=0,
            baseArrayLayer=0,
            layerCount=1,
        ),
        imageOffset=vk.VkOffset3D(x=0, y=0, z=0),
        imageExtent=vk.VkExtent3D(width=w, height=h, depth=1),
    )
    vk.vkCmdCopyImageToBuffer(
        cmd,
        src_image,
        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        staging["src_buf"],
        1,
        [region],
    )
    _barrier_image(
        cmd,
        src_image,
        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        vk.VK_ACCESS_TRANSFER_READ_BIT,
        vk.VK_ACCESS_SHADER_READ_BIT,
        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
    )
    # end_single_time_commands submits + waits idle on the secondary queue: that
    # is the src_done fence-equivalent (the host read below cannot start until
    # the secondary GPU copy has fully completed).
    end_single_time_commands(self.src.device, self.src.graphics_queue, staging["src_pool"], cmd)

    # --- 2. HOST: secondary staging -> primary staging -----------------------
    # try/finally guarantees each successful map is paired with an unmap even
    # if the second map or the memmove raises; otherwise the allocation would
    # stay mapped and the next transfer would map already-mapped memory.
    ffi = vk.ffi
    src_ptr = vk.vkMapMemory(self.src.device, staging["src_mem"], 0, size, 0)
    try:
        dst_ptr = vk.vkMapMemory(self.dst.device, staging["dst_mem"], 0, size, 0)
        try:
            ffi.memmove(dst_ptr, src_ptr, h * row_bytes)
        finally:
            vk.vkUnmapMemory(self.dst.device, staging["dst_mem"])
    finally:
        vk.vkUnmapMemory(self.src.device, staging["src_mem"])

    # --- 3. DESTINATION (primary): host staging -> primary image -------------
    cmd = begin_single_time_commands(self.dst.device, staging["dst_pool"])
    _barrier_image(
        cmd,
        dst_image,
        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        vk.VK_ACCESS_SHADER_READ_BIT,
        vk.VK_ACCESS_TRANSFER_WRITE_BIT,
        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
    )
    vk.vkCmdCopyBufferToImage(
        cmd,
        staging["dst_buf"],
        dst_image,
        vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        1,
        [region],
    )
    _barrier_image(
        cmd,
        dst_image,
        vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        vk.VK_ACCESS_TRANSFER_WRITE_BIT,
        vk.VK_ACCESS_SHADER_READ_BIT,
        vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
    )
    # dst_ready fence-equivalent: waits idle on the primary queue, so the main
    # composite pass that samples dst_image runs strictly after the copy lands.
    end_single_time_commands(self.dst.device, self.dst.graphics_queue, staging["dst_pool"], cmd)


def _destroy_staging(self, staging: dict) -> None:
    """Destroy both staging buffers and free their memory on their own devices."""
    vk.vkDestroyBuffer(self.src.device, staging["src_buf"], None)
    vk.vkFreeMemory(self.src.device, staging["src_mem"], None)
    vk.vkDestroyBuffer(self.dst.device, staging["dst_buf"], None)
    vk.vkFreeMemory(self.dst.device, staging["dst_mem"], None)
def destroy(self) -> None:
    """Release the cached per-device staging buffers, if any were allocated (rig-side).

    The ``'staging'`` entry is removed from :attr:`extra` first, so calling
    this again is a no-op.
    """
    if "staging" not in self.extra:
        return
    cached = self.extra.pop("staging")
    if cached is None:
        return
    self._destroy_staging(cached)
def run_dmabuf(self, *, src_image: Any = None, dst_image: Any = None) -> None:
    """Zero-copy dma-buf transfer (VK_KHR_external_memory_fd). Rig optimisation.

    Intended flow: export the secondary colour image's memory as an opaque
    fd / dma-buf, import it on the primary as an external image, and composite
    directly with a single cross-device semaphore wait (no host roundtrip).
    This requires the external-memory-fd extension enabled on BOTH devices at
    init.

    Gated: always raises until the rig path is implemented and the
    ``external_memory_fd`` capability is enabled.

    Raises:
        NotImplementedError: unconditionally; the arguments are not inspected.
    """
    message = (
        "dma-buf zero-copy transfer (VK_KHR_external_memory_fd) is the rig "
        "optimisation: enable the external-memory-fd extension on both devices "
        "at init, export the secondary colour image memory as an fd, import it "
        "on the primary, and composite with a cross-device semaphore. Falls back "
        "to TransferMethod.STAGING_COPY when the capability is off (this box). "
        "Verified on the 4x Arc Pro B70 rig."
    )
    raise NotImplementedError(message)
# --- SRU offload routing (the SubViewport offload seam) ----------------------
[docs] @dataclass(frozen=True, slots=True) class OffloadRoute: """The per-SRU offload decision the recording path consults. Computed once per frame by :class:`SRUOffloadCoordinator.plan` from the ordered SRU list. ``device_index`` 0 means "render this SRU on the primary, exactly as today" (``offloaded`` is ``False``, ``transfer`` is :data:`TransferMethod.NONE`); a non-zero index means the SRU renders on that secondary's renderer and its colour image is moved to the primary via :attr:`transfer` before the main pass samples it. """ sru_id: int device_index: int transfer: str
[docs] @property def offloaded(self) -> bool: """True when this SRU renders on a secondary device (not GPU 0).""" return self.device_index != 0
class SRUOffloadCoordinator:
    """Decides + drives where each SubViewport SRU renders across the devices.

    Glue between the (already-tested) :func:`assign_srus` policy, the
    :func:`select_transfer_method` selector, and the :class:`MultiDeviceManager`
    device topology. The recording path (``SceneAdapter.render_sru_from_plan`` /
    ``render_to_target``) consults a coordinator, when one is present, to decide
    per SRU whether to take today's primary-device path or route the SRU to a
    secondary device and transfer the result back.

    Constructed only when the manager is actively multi-GPU (opted in AND >= 2
    devices). On the single-GPU / unopted path the engine builds **no**
    coordinator, so the recording path's ``coordinator is None`` branch is taken
    and the frame is byte-identical to today.

    The decision logic (``plan`` / ``route_for``) is pure and unit-tested with no
    Vulkan. :meth:`render_offloaded` is the seam where a secondary-assigned SRU
    would be recorded on its device's renderer and transferred back; that needs a
    per-device renderer (``DeviceSlot.renderer``), which is ``None`` on every box
    without the rig-side per-device-renderer construction, so it raises a clear,
    capability-gated error rather than emitting unverifiable cross-device code.
    """

    def __init__(
        self,
        manager: MultiDeviceManager,
        capabilities: RenderCapabilities | None = None,
        *,
        prefer_dmabuf: bool = True,
        content_scale: tuple[float, float] = (1.0, 1.0),
        secondary_renderer_factory: Any = None,
    ) -> None:
        """Bind the coordinator to ``manager`` and register its teardown hook.

        Args:
            manager: the device topology; its ``register_teardown`` is called
                with :meth:`destroy`.
            capabilities: forwarded to :func:`select_transfer_method` and to the
                secondary render context.
            prefer_dmabuf: forwarded to :func:`select_transfer_method`.
            content_scale: forwarded to the secondary render context.
            secondary_renderer_factory: optional ``factory(facade) -> renderer``
                used to build a renderer on a secondary device.
        """
        self._manager = manager
        self._capabilities = capabilities
        self._prefer_dmabuf = prefer_dmabuf
        self._content_scale = content_scale
        self._routes: dict[int, OffloadRoute] = {}
        # Rig-injected builder: ``factory(facade) -> Renderer`` constructs a
        # ``Renderer(facade)`` and runs its GPU ``setup()`` on the secondary device.
        # The engine injects it (on the rig) so this module never imports the heavy
        # renderer / triggers a GPU build it cannot verify on this single-GPU box.
        # ``None`` => no secondary renderer can be built and :meth:`render_offloaded`
        # raises the clear rig-completion error (the path here today).
        self._secondary_renderer_factory = secondary_renderer_factory
        # Per-secondary-device lazily-built render state, keyed by device_index:
        #   {index: {"facade": SecondaryRenderContext, "residency": SecondaryResidency,
        #            "target": RenderTarget, "transfer": CrossDeviceTransfer}}
        # Built once, reused every frame; torn down in :meth:`destroy`.
        self._secondary: dict[int, dict] = {}
        # Free per-secondary GPU resources before the manager destroys the secondary
        # devices (they own those resources). No-op on this box (nothing is built).
        manager.register_teardown(self.destroy)

    @property
    def active(self) -> bool:
        """True only when the backing manager is an opted-in multi-device renderer."""
        return self._manager.multi_gpu

    def plan(self, srus: list[Any], *, sru_id: Any = None, cost: Any = None) -> list[OffloadRoute]:
        """Compute + cache this frame's per-SRU routes from the ordered SRU list.

        Returns one :class:`OffloadRoute` per SRU in INPUT order (the P1
        producer-before-consumer topological order is preserved). When the
        coordinator is inactive (single-GPU / unopted) the device count passed
        to the policy is 1, so every route stays on GPU 0 and the caller takes
        today's path.

        ``sru_id`` / ``cost`` are forwarded to :func:`assign_srus` so a caller
        whose ordered items are not ``SubViewportSRU`` (e.g. the synchronous
        path's live nodes) can supply the id + offload-cost accessors.
        """
        count = self._manager.device_count if self.active else 1
        assignments = assign_srus(srus, count, sru_id=sru_id, cost=cost)
        routes: list[OffloadRoute] = []
        # Replace (not extend) the cache: routes are valid for one frame only.
        self._routes = {}
        for a in assignments:
            method = select_transfer_method(a, self._capabilities, prefer_dmabuf=self._prefer_dmabuf)
            route = OffloadRoute(sru_id=a.sru_id, device_index=a.device_index, transfer=method)
            routes.append(route)
            self._routes[a.sru_id] = route
        return routes

    def route_for(self, sru_id: int) -> OffloadRoute | None:
        """The cached route for ``sru_id`` from the last :meth:`plan`, or ``None``.

        ``None`` means "no decision recorded" (the SRU was not in the planned
        list); callers treat that as "render on the primary", today's path.
        """
        return self._routes.get(sru_id)

    def transfer_for(self, route: OffloadRoute, *, width: int = 0, height: int = 0) -> CrossDeviceTransfer:
        """Build the :class:`CrossDeviceTransfer` for an offloaded ``route``.

        Pairs the secondary (``route.device_index``) and primary (0) device slots
        with the chosen transfer method so the rig wiring is a single seam.
        """
        return CrossDeviceTransfer(
            src=self._manager.slot(route.device_index),
            dst=self._manager.primary,
            method=route.transfer,
            width=width,
            height=height,
        )

    def render_offloaded(self, route: OffloadRoute) -> Any:
        """Return the per-device renderer for an offloaded SRU, building it if needed.

        Resolution order:

        1. An explicitly attached renderer (``MultiDeviceManager.attach_renderer``)
           is returned as-is (the rig may pre-build + bind one; also the unit-test
           seam).
        2. Otherwise, if a :attr:`_secondary_renderer_factory` was injected (rig),
           lazily build the secondary :class:`SecondaryRenderContext` facade, run
           the factory to construct + ``setup()`` a ``Renderer(facade)`` on the
           secondary device, bind it to the slot, and return it.
        3. With neither (this single-GPU box: no factory is ever injected because
           no secondary device exists), raise the clear, capability-gated
           rig-completion error.

        NEVER reached on the single-GPU / unopted path: that path has no
        coordinator at all, so the seam's ``coordinator is None`` branch keeps it
        byte-identical.

        Raises:
            NotImplementedError: no renderer is attached and none can be built.
        """
        slot = self._manager.slot(route.device_index)
        if slot.renderer is not None:
            return slot.renderer
        ctx = self._ensure_secondary(route.device_index)
        if ctx is None or ctx["facade"].renderer is None:
            raise NotImplementedError(
                f"Multi-GPU SubViewport offload to secondary device {route.device_index} "
                f"({slot.name!r}) needs that device's own renderer (pipelines / descriptor "
                "pools / transform+material SSBOs / mesh+texture residency). Inject a "
                "secondary_renderer_factory (the engine does on the rig) or bind one via "
                "MultiDeviceManager.attach_renderer(index, renderer); see multi_device.py "
                "module docstring + design decision D8. Verified on the 4x Arc Pro B70 rig."
            )
        return ctx["facade"].renderer

    def _ensure_secondary(self, device_index: int) -> dict | None:
        """Lazily build + cache the secondary render context for ``device_index``.

        Builds (once) the :class:`SecondaryRenderContext` facade and, when a
        :attr:`_secondary_renderer_factory` is present, a ``Renderer(facade)`` set
        up on the secondary device + a :class:`SecondaryResidency` helper. Returns
        the cached dict, or ``None`` when no factory is available (no secondary
        renderer can exist; the caller raises the rig-completion error). GPU work
        happens only inside the injected factory, so this stays import-safe and is
        never reached on the single-GPU path.
        """
        cached = self._secondary.get(device_index)
        if cached is not None:
            return cached
        if self._secondary_renderer_factory is None:
            return None

        from .secondary_engine import SecondaryRenderContext, SecondaryResidency

        slot = self._manager.slot(device_index)
        facade = SecondaryRenderContext(
            slot,
            capabilities=self._capabilities,
            content_scale=self._content_scale,
        )
        facade.ensure_command_pool()
        # The render pass MUST exist before the factory builds Renderer(facade).setup():
        # pipelines compile against facade.render_pass (it is format-compatible with the
        # per-SRU RenderTarget; pipelines use dynamic viewport/scissor so extent is free).
        facade.ensure_offscreen_render_pass()
        renderer = self._secondary_renderer_factory(facade)
        facade.attach_renderer(renderer)
        cached = {
            "facade": facade,
            "residency": SecondaryResidency(facade),
            "target": None,
            "transfer": None,
        }
        self._secondary[device_index] = cached
        return cached

    def _ensure_target(self, ctx: dict, width: int, height: int) -> Any:
        """Return the secondary offscreen target for ``ctx``, (re)building on resize.

        When the cached target already matches ``width`` x ``height`` it is
        returned untouched. Otherwise the old target AND the cached transfer
        (whose staging buffers were sized for the old extent) are destroyed
        before a new target is created; the caller rebuilds the transfer lazily.
        """
        target = ctx["target"]
        if target is not None and (target.width, target.height) == (width, height):
            return target

        from ..renderer.render_target import RenderTarget

        # Release the stale transfer instead of just dropping the reference: it
        # owns staging buffers on BOTH devices and nothing else would free them.
        # Safe to free here because each staging copy waits for its queue to go
        # idle before returning.
        stale_transfer = ctx["transfer"]
        ctx["transfer"] = None
        if stale_transfer is not None:
            stale_transfer.destroy()
        if target is not None:
            # Clear the slot first so a failing rebuild below cannot leave a
            # destroyed target cached (it would be destroyed again in destroy()).
            ctx["target"] = None
            target.destroy()

        facade = ctx["facade"]
        target = RenderTarget(
            facade.ctx.device,
            facade.ctx.physical_device,
            width,
            height,
            colour_format=vk.VK_FORMAT_R16G16B16A16_SFLOAT,
            use_depth=True,
            samplable_depth=True,
            queue=facade.ctx.graphics_queue,
            command_pool=facade.ctx.command_pool,
        )
        ctx["target"] = target
        return target

    def render_sru_offloaded(self, sru: Any, primary_dst_image: Any) -> bool:
        """Render one SRU on its secondary device and transfer it to the primary image.

        End-to-end multi-GPU SubViewport offload for one SRU (the
        :class:`~simvx.graphics.renderer.render_packet.SubViewportSRU` plan):

        1. Resolve the SRU's route (must be offloaded; else returns ``False`` so
           the caller takes the primary path).
        2. Ensure the secondary render context (facade + ``Renderer`` + residency).
        3. Mirror the SRU's mesh geometry (and, when textured-residency is wired,
           its sampled textures) onto the secondary device via
           :class:`SecondaryResidency`.
        4. Size / create the secondary offscreen :class:`RenderTarget` for this SRU.
        5. Record + submit the SRU's draws into that target on the secondary
           device (one submit on the secondary graphics queue; the offscreen
           RenderTarget leaves the colour image in ``SHADER_READ_ONLY_OPTIMAL``).
        6. Run the :class:`CrossDeviceTransfer` (staging-copy floor) to move the
           secondary colour image into ``primary_dst_image`` (the SubViewport's
           primary-device bindless image the main scene samples).

        Returns ``True`` when the SRU was handled on a secondary, ``False`` when
        it was not offloaded (caller renders it on the primary as today). The
        actual per-device GPU record + submit (step 5) is delegated to the
        injected secondary renderer; this method owns the orchestration + the
        cross-device transfer. Exercised on the 4x Arc Pro B70 rig; never reached
        on the single-GPU path.

        Raises:
            NotImplementedError: the SRU is routed to a secondary device but no
                secondary renderer exists (raised via :meth:`render_offloaded`).
            RuntimeError: the primary command pool cannot be resolved.
        """
        sru_id = getattr(sru, "sru_id", None)
        route = self.route_for(sru_id) if sru_id is not None else None
        if route is None or not route.offloaded:
            return False

        ctx = self._ensure_secondary(route.device_index)
        if ctx is None or ctx["facade"].renderer is None:
            # No secondary renderer -> defer to render_offloaded's clear raise so the
            # caller surfaces the precise rig-completion message rather than silently
            # dropping the SRU.
            self.render_offloaded(route)
            return False

        facade = ctx["facade"]
        residency = ctx["residency"]
        width, height = int(sru.width), int(sru.height)

        # 3. Mirror geometry the SRU draws onto the secondary device. Vertex-colour
        # SRUs need only meshes; textured-SRU residency mirrors sampled textures too
        # (FLAGGED: needs source pixels, see SecondaryResidency.ensure_textures). The
        # source CPU arrays come from the primary registry's retained geometry (the
        # engine enables retention when multi-GPU is active); a VkBuffer cannot cross
        # devices, only the arrays can.
        residency.ensure_meshes(self._sru_mesh_payloads(sru, self._primary_mesh_registry()))

        # 4. Secondary offscreen target sized to the SRU (recreate on resize; a
        # resize also releases the old transfer so staging is re-sized below).
        target = self._ensure_target(ctx, width, height)

        # 5. Record + submit the SRU on the secondary renderer into ``target``. The
        # injected secondary-renderer factory returns an object exposing
        # ``render_sru_offscreen(sru, target)`` (duck-typed): on the rig it wraps a
        # ``Renderer(facade)`` and mirrors the primary ``render_sru_from_plan`` slice
        # model on its OWN device (reserve a transform-SSBO slice, upload the SRU
        # transforms, record ``render_scene_content`` into ``target``, submit on the
        # secondary graphics queue + wait). The core shared ``Renderer`` is NOT made
        # to grow this method, so the single-GPU path stays byte-identical.
        recorder = facade.renderer
        # Bind residency (for primary->secondary MeshHandle remap) + mirror the
        # primary material/light SSBO contents so the offloaded draws shade
        # identically (set once per build is insufficient: materials change per
        # frame, so mirror every frame from the primary renderer).
        if hasattr(recorder, "set_residency"):
            recorder.set_residency(residency)
        primary_renderer = self._manager.primary.renderer
        if primary_renderer is not None:
            # Each optional hook is probed on its own: the recorder is duck-typed,
            # so having ``set_materials`` does not imply having ``set_lights``.
            if hasattr(recorder, "set_materials"):
                recorder.set_materials(getattr(primary_renderer, "_materials", None))
            if hasattr(recorder, "set_lights"):
                recorder.set_lights(getattr(primary_renderer, "_lights", None))
        recorder.render_sru_offscreen(sru, target)

        # 6. Transfer the secondary colour image into the primary bindless image.
        transfer = ctx["transfer"]
        if transfer is None:
            transfer = self.transfer_for(route, width=width, height=height)
            transfer.bytes_per_pixel = 8  # R16G16B16A16_SFLOAT
            ctx["transfer"] = transfer
        transfer.ensure_staging(facade.ctx.command_pool, self._primary_pool())
        transfer.run(src_image=target.colour_image, dst_image=primary_dst_image)
        return True

    def _primary_pool(self) -> Any:
        """The primary device's command pool (for the destination half of the copy).

        Raises:
            RuntimeError: no primary renderer with an ``_engine.ctx`` is attached.
        """
        primary = self._manager.primary
        renderer = primary.renderer
        # The primary renderer's engine exposes ctx.command_pool; resolve via the
        # attached renderer's engine when present, else the slot has no pool handle
        # and the rig wiring supplies it. Kept as a single resolution seam.
        eng = getattr(renderer, "_engine", None)
        ctx = getattr(eng, "ctx", None)
        if ctx is not None:
            return ctx.command_pool
        raise RuntimeError(
            "primary command pool unavailable for cross-device transfer; attach the "
            "primary renderer to slot 0 (MultiDeviceManager.attach_renderer(0, renderer))."
        )

    def _primary_mesh_registry(self) -> Any:
        """The primary renderer's :class:`MeshRegistry` (source of retained geometry).

        Resolved off the primary slot's attached renderer engine
        (``renderer._engine.mesh_registry``). The engine enables geometry
        retention on that registry when multi-GPU is active, so
        :meth:`get_geometry` returns the CPU arrays an offloaded SRU's meshes need
        to be mirrored onto a secondary device. ``None`` when no primary renderer
        is attached (the payload extraction then yields nothing rather than
        indexing the wrong device's buffers).
        """
        renderer = self._manager.primary.renderer
        eng = getattr(renderer, "_engine", None)
        return getattr(eng, "mesh_registry", None)

    @staticmethod
    def _sru_mesh_payloads(sru: Any, primary_registry: Any) -> list[tuple[int, Any, Any]]:
        """Extract ``(residency_key, vertices, indices)`` payloads from an SRU's instances.

        ``residency_key`` is ``id(mh)`` (the per-instance handle identity the
        secondary recorder remaps against in
        :meth:`SecondarySRURenderer._remap_instances`), and the CPU
        ``vertices``/``indices`` are fetched from the primary
        :class:`~simvx.graphics.renderer.mesh_registry.MeshRegistry` by the
        handle's registry ``id`` (the device-independent source arrays the
        registry retained when multi-GPU is active). A handle whose geometry is
        not retained (``get_geometry`` returns ``None``) is skipped; with no
        registry, or one lacking ``get_geometry``, the result is empty. Dedup by
        ``id(mh)`` so a shared mesh mirrors once per device.
        """
        if primary_registry is None or not hasattr(primary_registry, "get_geometry"):
            return []
        payloads: list[tuple[int, Any, Any]] = []
        seen: set[int] = set()
        # Static instances first, then skinned ones; the mesh handle is the first
        # element of each instance entry.
        for attr in ("instances", "skinned_instances"):
            for entry in getattr(sru, attr, ()):
                mh = entry[0]
                key = id(mh)
                if key in seen:
                    continue
                seen.add(key)
                geom = primary_registry.get_geometry(getattr(mh, "id", None))
                if geom is not None:
                    payloads.append((key, geom[0], geom[1]))
        return payloads

    def destroy(self) -> None:
        """Tear down all lazily-built secondary render contexts (facade + target + transfer)."""
        for ctx in self._secondary.values():
            # Transfer and target live on the facade's device, so free them first.
            transfer = ctx.get("transfer")
            if transfer is not None:
                transfer.destroy()
            target = ctx.get("target")
            if target is not None:
                target.destroy()
            facade = ctx.get("facade")
            if facade is not None:
                facade.destroy()
        self._secondary.clear()