"""Explicit multi-adapter (multi-GPU) foundation: D8 workload-split offload.
This is the **foundation** wave of design decision D8. It builds the gated
plumbing for an explicit-multi-adapter renderer (one independent ``VkDevice``
per physical GPU) where whole SubViewport / offscreen scene-render-units (SRUs)
are offloaded to secondary GPUs and composited on the primary (GPU 0). It is
deliberately **off by default**: a single-GPU box, and any multi-GPU box that
does not opt in, runs today's single-device path byte-identical (no extra
device, no transfer work, no behaviour change). The active multi-device path is
verified on a 4x Arc Pro B70 rig, not on this single-GPU dev box.
Three coherent, unit-testable pieces live here:
1. :class:`MultiDeviceManager` enumerates physical devices and, only when
``physical_device_count > 1`` **and** the caller opted in, creates an
independent logical device per physical GPU (reusing
:func:`~simvx.graphics.gpu.device.create_logical_device`). When the count is
1 or the opt-in is off it holds exactly the existing single (primary) device
and is a transparent passthrough.
2. :func:`assign_srus` is the pure device-assignment policy (no Vulkan): given
the ordered SRUs and a device count it decides which render on GPU 0 vs which
offload to GPU 1+. TAA-safe by construction (each SRU keeps its temporal
history on its one assigned device, so there is no cross-device reprojection).
3. :class:`CrossDeviceTransfer` selects how a finished offscreen colour image
moves from a secondary device to the primary for compositing. The
staging-copy path (secondary -> host-visible staging -> primary) is the
guaranteed floor and is the only path implemented end-to-end; the dma-buf /
``VK_KHR_external_memory_fd`` zero-copy path is the rig optimisation, gated
behind a capability and currently raising a clear, actionable error.
REALITY CHECK (honest scope). Rendering an SRU on a *second* ``VkDevice``
requires that device to own its full set of rendering resources: its own
pipelines, descriptor pools, transform/material SSBOs, and mesh+texture
residency. The current :class:`~simvx.graphics.renderer.forward.Renderer` is
built around exactly one device. Duplicating it per device is a large,
GPU-bound refactor that cannot be functionally verified on this single-GPU box.
So this module stops at a clean seam: the device manager, the assignment
policy, and the transfer interface are real and tested; the per-device renderer
construction + the actual offload-record-and-composite loop are the documented
rig-side completion (see :meth:`MultiDeviceManager.attach_renderer` /
:attr:`DeviceSlot.renderer`).
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
import vulkan as vk
from .device import QueueFamilies, create_logical_device
if TYPE_CHECKING:
from .capabilities import RenderCapabilities
log = logging.getLogger(__name__)
# Public API of this module, listed in definition order.
# ``select_transfer_method`` is a public, documented function, so it is
# exported alongside the types it operates on.
__all__ = [
    "DeviceSlot",
    "MultiDeviceManager",
    "SRUAssignment",
    "assign_srus",
    "CrossDeviceTransfer",
    "TransferMethod",
    "select_transfer_method",
    "OffloadRoute",
    "SRUOffloadCoordinator",
]
# --- Device topology ---------------------------------------------------------
[docs]
@dataclass
class DeviceSlot:
    """A single GPU (physical device + logical device) in the multi-device renderer.

    Slot ``index`` 0 is always the primary (compositing) GPU; indices 1 and up
    are secondaries that render offloaded SRUs. A single-GPU run has exactly
    one slot (``index == 0``) that wraps the engine's existing device, and
    nothing else is created.

    ``renderer`` is the per-device
    :class:`~simvx.graphics.renderer.forward.Renderer` duplicate. For the
    primary slot it is populated from the engine's existing renderer; on the
    rig each secondary receives one through
    :meth:`MultiDeviceManager.attach_renderer`. Until that rig-side per-device
    renderer construction lands (see the module docstring) it remains ``None``
    for secondaries: the foundation never fabricates a broken renderer just to
    look complete.
    """

    # --- identity (required) -------------------------------------------------
    # Position in the manager's slot list; 0 designates the primary.
    index: int
    # Physical-device handle this slot wraps.
    physical_device: Any
    # Queue-family selection resolved for ``physical_device``.
    queue_families: QueueFamilies

    # --- logical device and its queues (all optional) ------------------------
    device: Any = None
    graphics_queue: Any = None
    present_queue: Any = None
    compute_queue: Any = None
    transfer_queue: Any = None

    # --- diagnostics / rig-side attachments ----------------------------------
    # Human-readable device name, used in log messages only.
    name: str = ""
    # Per-device renderer; ``None`` until one is attached.
    renderer: Any = None

    @property
    def is_primary(self) -> bool:
        """Whether this slot is the primary (compositing) GPU, i.e. ``index == 0``."""
        return self.index == 0
[docs]
class MultiDeviceManager:
    """Owns the per-physical-GPU logical devices for the D8 offload renderer.

    Construct with the primary device's already-created handles (the engine's
    existing single device, so the primary slot is never re-created) plus the
    enumerated physical devices and the opt-in flag. When ``enabled`` is true
    and more than one physical device is present, a secondary
    :class:`DeviceSlot` is created per additional physical GPU with its own
    independent ``VkDevice`` via :func:`create_logical_device`. Otherwise the
    manager holds exactly the single primary slot and :attr:`multi_gpu` is
    ``False`` (today's path, unchanged).

    The manager does NOT own the primary device's lifetime (the engine created
    and destroys it); :meth:`destroy` only tears down the *secondary* devices
    it created itself.
    """

    def __init__(
        self,
        *,
        primary_physical_device: Any,
        primary_queue_families: QueueFamilies,
        primary_device: Any,
        primary_graphics_queue: Any,
        primary_present_queue: Any,
        physical_devices: list[Any],
        enabled: bool,
        capabilities: RenderCapabilities | None = None,
        primary_compute_queue: Any = None,
        primary_transfer_queue: Any = None,
        find_queue_families: Any = None,
    ) -> None:
        """Wrap the engine's primary device and, if opted in, create secondaries.

        Args:
            primary_physical_device: Physical device behind the engine's device.
            primary_queue_families: Queue families resolved for the primary.
            primary_device: The engine's existing logical device (not owned).
            primary_graphics_queue: Graphics queue of ``primary_device``.
            primary_present_queue: Present queue of ``primary_device``.
            physical_devices: Every enumerated physical device (copied).
            enabled: Caller opt-in for the multi-device path.
            capabilities: Optional capability snapshot; only its
                ``external_memory_fd`` attribute is read here.
            primary_compute_queue: Optional compute queue of the primary.
            primary_transfer_queue: Optional transfer queue of the primary.
            find_queue_families: Optional resolver
                ``physical_device -> QueueFamilies | None`` used for
                secondaries (see :meth:`_create_secondaries`).
        """
        self._capabilities = capabilities
        self._physical_devices = list(physical_devices)
        # Whether secondary logical devices were created with
        # VK_KHR_external_memory_fd requested. It is set once at least one
        # secondary has been created while the capability snapshot reported the
        # extension; it stays False when no secondary is created.
        self.external_memory_fd_enabled = False
        # Teardown callbacks for per-secondary-device GPU resources (offload
        # coordinator facades / targets / staging buffers). They MUST run
        # before the secondary VkDevices are destroyed; :meth:`destroy` invokes
        # them first.
        self._teardown_hooks: list[Any] = []

        # The primary slot wraps the engine's existing device. It is slot 0 and
        # the manager never creates or destroys its VkDevice.
        primary = DeviceSlot(
            index=0,
            physical_device=primary_physical_device,
            queue_families=primary_queue_families,
            device=primary_device,
            graphics_queue=primary_graphics_queue,
            present_queue=primary_present_queue,
            compute_queue=primary_compute_queue,
            transfer_queue=primary_transfer_queue,
            name=_device_name(primary_physical_device),
        )
        self._slots: list[DeviceSlot] = [primary]

        # Multi-GPU is active only when the caller opted in AND there is more
        # than one physical device. Either condition false => single-slot
        # passthrough, byte-identical to today.
        self._multi_gpu = bool(enabled) and len(self._physical_devices) > 1
        if self._multi_gpu:
            self._create_secondaries(primary_physical_device, find_queue_families)
        elif enabled:
            # Opted in, but (since ``_multi_gpu`` is false) at most one device.
            log.info(
                "multi-GPU requested but only %d physical device(s): single-device path",
                len(self._physical_devices),
            )

    @staticmethod
    def _same_physical_device(a: Any, b: Any) -> bool:
        """Return True when ``a`` and ``b`` denote the same physical device.

        Identity is tried first. Equality is the fallback because two
        separately obtained wrapper objects around one ``VkPhysicalDevice``
        handle are distinct Python objects, so ``is`` alone would treat the
        primary as an additional GPU. Plain Python objects without a custom
        ``__eq__`` still compare by identity, so behaviour is unchanged for
        them.
        """
        return a is b or bool(a == b)

    def _create_secondaries(self, primary_pd: Any, find_queue_families: Any) -> None:
        """Create one independent logical device per *additional* physical GPU.

        ``find_queue_families`` resolves a :class:`QueueFamilies` for a
        secondary physical device. A secondary renders offscreen (it never
        presents), so it does not need a present-capable family; the injected
        resolver lets the rig wire surface-less graphics-queue selection
        without this module importing the surface path. When the resolver is
        omitted (unit tests), or returns ``None`` for a device, that secondary
        is skipped with a warning rather than guessing a family.
        """
        # Request the dma-buf prerequisite on every secondary only when the
        # capability snapshot reports the extension; otherwise keep the
        # staging-copy floor.
        want_ext_mem = bool(getattr(self._capabilities, "external_memory_fd", False))

        for pd in self._physical_devices:
            # The primary already has its (engine-owned) logical device.
            if self._same_physical_device(pd, primary_pd):
                continue

            pd_name = _device_name(pd)
            qf = find_queue_families(pd) if find_queue_families is not None else None
            if qf is None:
                log.warning("multi-GPU: no graphics queue family resolved for %s, skipping", pd_name)
                continue

            device, gq, pq, cq, tq = create_logical_device(pd, qf, external_memory_fd=want_ext_mem)
            if want_ext_mem:
                self.external_memory_fd_enabled = True

            slot = DeviceSlot(
                index=len(self._slots),
                physical_device=pd,
                queue_families=qf,
                device=device,
                graphics_queue=gq,
                present_queue=pq,
                compute_queue=cq,
                transfer_queue=tq,
                name=pd_name,
            )
            self._slots.append(slot)
            log.info("multi-GPU: created secondary logical device %d (%s)", slot.index, pd_name)

    @property
    def multi_gpu(self) -> bool:
        """True only when an opted-in multi-device renderer is active (>= 2 slots)."""
        return self._multi_gpu and len(self._slots) > 1

    @property
    def device_count(self) -> int:
        """Number of device slots held, the primary included (1 on the single-GPU path)."""
        return len(self._slots)

    @property
    def primary(self) -> DeviceSlot:
        """The primary (slot 0) device slot."""
        return self._slots[0]

    @property
    def secondaries(self) -> list[DeviceSlot]:
        """A new list of the secondary slots (empty on the single-GPU path)."""
        return self._slots[1:]

    @property
    def slots(self) -> list[DeviceSlot]:
        """A shallow copy of all slots, primary first."""
        return list(self._slots)

    def slot(self, index: int) -> DeviceSlot:
        """Return the slot at ``index`` (raises ``IndexError`` when out of range)."""
        return self._slots[index]

    def attach_renderer(self, index: int, renderer: Any) -> None:
        """Bind a per-device :class:`Renderer` to slot ``index`` (rig-side).

        The primary renderer is the engine's existing one. Each secondary needs
        its OWN renderer (its device's pipelines / descriptor pools / SSBOs /
        residency); constructing that is the documented rig-side completion.
        This setter is the seam where the rig hands the constructed per-device
        renderer back to the manager so the offload loop can record into it.
        """
        self._slots[index].renderer = renderer

    def register_teardown(self, hook: Any) -> None:
        """Register a zero-argument callback that :meth:`destroy` runs first.

        The offload coordinator registers its
        :meth:`SRUOffloadCoordinator.destroy` here so its secondary facades /
        targets / staging buffers are freed while the secondary ``VkDevice``
        handles that own those resources are still alive.
        """
        self._teardown_hooks.append(hook)

    def destroy(self) -> None:
        """Destroy only the SECONDARY logical devices this manager created.

        The primary device is owned by the engine and left untouched. Safe to
        call on the single-GPU path (no secondaries => only the hooks run).
        Registered teardown hooks run FIRST, in registration order, so
        per-secondary GPU resources are freed before the devices. Afterwards
        the manager holds only the primary slot and :attr:`multi_gpu` is
        ``False``.
        """
        for hook in self._teardown_hooks:
            hook()
        self._teardown_hooks.clear()

        for slot in self._slots[1:]:
            if slot.device is not None:
                # Let in-flight work drain before the device goes away.
                vk.vkDeviceWaitIdle(slot.device)
                vk.vkDestroyDevice(slot.device, None)
                slot.device = None
        del self._slots[1:]
        self._multi_gpu = False
def _barrier_image(
    cmd: Any,
    image: Any,
    old_layout: int,
    new_layout: int,
    src_access: int,
    dst_access: int,
    src_stage: int,
    dst_stage: int,
) -> None:
    """Record one colour-image layout transition into ``cmd`` without submitting.

    Used by the cross-device staging copy. In contrast to
    :func:`~simvx.graphics.gpu.memory.transition_image_layout`, which submits
    its own one-shot, this records into a command buffer the caller owns, so
    each half of the transfer (secondary->buffer, buffer->primary) can batch
    its two barriers and its copy into ONE submit. The barrier covers the
    colour aspect of a single mip level and a single array layer.
    """
    # The whole colour image: mip 0 only, array layer 0 only.
    colour_range = vk.VkImageSubresourceRange(
        aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
        baseMipLevel=0,
        levelCount=1,
        baseArrayLayer=0,
        layerCount=1,
    )
    layout_change = vk.VkImageMemoryBarrier(
        srcAccessMask=src_access,
        dstAccessMask=dst_access,
        oldLayout=old_layout,
        newLayout=new_layout,
        # No queue-family ownership transfer is involved.
        srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        image=image,
        subresourceRange=colour_range,
    )
    image_barriers = [layout_change]
    # No global or buffer barriers; exactly one image barrier.
    vk.vkCmdPipelineBarrier(
        cmd, src_stage, dst_stage, 0, 0, None, 0, None, len(image_barriers), image_barriers,
    )
def _device_name(physical_device: Any) -> str:
    """Return the device's reported name, or ``"<unknown>"`` on any failure.

    The name is used for diagnostics only, so every error (query failure,
    undecodable bytes, unexpected type) is deliberately reduced to the
    placeholder rather than propagated.
    """
    try:
        raw = vk.vkGetPhysicalDeviceProperties(physical_device).deviceName
        if isinstance(raw, str):
            return raw
        # Anything that is not already text is treated as UTF-8 bytes.
        return raw.decode("utf-8")
    except Exception:  # noqa: BLE001 (name is diagnostic only)
        return "<unknown>"
# --- Device-assignment policy (pure logic, no Vulkan) ------------------------
[docs]
@dataclass(frozen=True, slots=True)
class SRUAssignment:
"""Which device renders one SRU and whether its result must be transferred.
``device_index`` 0 means the SRU renders on the primary and is already
resident for compositing (``needs_transfer`` is ``False``). A non-zero index
means the SRU is offloaded to that secondary and its finished colour image
must be moved to the primary before compositing (``needs_transfer`` is
``True``).
"""
sru_id: int
device_index: int
[docs]
@property
def needs_transfer(self) -> bool:
return self.device_index != 0
def _sru_cost(sru: Any) -> int:
    """Estimate how expensive an SRU is: instances plus skinned instances.

    Pure and Vulkan-free. The heaviest independent SRUs are the ones worth
    sending to a secondary GPU, and the instance count is the cheapest faithful
    proxy already available on the snapshotted plan. An SRU with nothing to
    count still reports cost 1 so that ordering stays well defined.
    """
    # Either attribute may be missing or ``None``; both cases contribute 0.
    collections = [getattr(sru, attr, None) for attr in ("instances", "skinned_instances")]
    total = sum(len(items) for items in collections if items is not None)
    return max(total, 1)
[docs]
def assign_srus(
    srus: list[Any],
    device_count: int,
    *,
    sru_id: Any = None,
    cost: Any = None,
) -> list[SRUAssignment]:
    """Decide which device renders each SRU (pure, unit-testable, no Vulkan).

    Policy (TAA-safe by construction: an SRU is assigned to exactly one device,
    so its temporal history never crosses devices):

    - ``device_count <= 1`` (or no SRUs): EVERY SRU stays on GPU 0. This is the
      single-GPU / unopted path and produces the same per-SRU work as today
      (no transfer is ever flagged).
    - ``device_count >= 2``: the main scene is implicitly GPU 0 (it is not an
      SRU and is not in this list). The ``device_count - 1`` heaviest SRUs are
      offloaded, one per secondary: the heaviest goes to device 1, the next to
      device 2, and so on. Every other SRU stays on GPU 0, so the primary
      (which must also composite) keeps the cheap tail.

    Ties between equal-cost SRUs are broken by INPUT ORDER (earlier wins). The
    choice therefore depends only on the ordered inputs and their costs, never
    on object addresses, so an unchanged SRU list yields the same device for
    each SRU on every call.

    The returned list preserves the input order of ``srus`` (the P1
    producer-before-consumer topological order), so a consumer SRU still
    follows the producer it samples; only the device choice is decided here.

    Args:
        srus: Ordered SRU plans (``SubViewportSRU`` or any object exposing the
            cost inputs). Order is preserved in the result.
        device_count: Number of devices available
            (``MultiDeviceManager.device_count``).
        sru_id: Optional accessor ``sru -> int`` for the stable id; defaults to
            reading ``sru.sru_id``.
        cost: Optional accessor ``sru -> int`` overriding the default
            instance-count heuristic (handy for tests).

    Returns:
        One :class:`SRUAssignment` per input SRU, in input order.
    """
    get_id = sru_id if sru_id is not None else (lambda s: s.sru_id)

    if device_count <= 1 or not srus:
        return [SRUAssignment(sru_id=get_id(s), device_index=0) for s in srus]

    get_cost = cost if cost is not None else _sru_cost
    n_secondary = device_count - 1

    # Heaviest first. ``sorted`` is stable, including with ``reverse=True``, so
    # SRUs of equal cost keep their input order. (Breaking ties on ``id()``
    # would make the result depend on memory addresses.)
    heaviest_first = sorted(srus, key=get_cost, reverse=True)

    # One SRU per secondary: the k-th heaviest renders on device ``1 + k``.
    # Keyed by object identity so the emit pass below can look each SRU up
    # without requiring SRUs to be hashable.
    offload: dict[int, int] = {
        id(sru): 1 + slot_offset
        for slot_offset, sru in enumerate(heaviest_first[:n_secondary])
    }

    # Decide in cost order, EMIT in the caller's order so producer<-consumer
    # topology survives.
    return [
        SRUAssignment(sru_id=get_id(s), device_index=offload.get(id(s), 0))
        for s in srus
    ]
# --- Cross-device transfer interface -----------------------------------------
[docs]
class TransferMethod:
    """Enum-like namespace of the selectable cross-device image-transfer strategies."""

    #: The SRU already lives on the primary device, so compositing samples it
    #: directly and nothing has to be moved.
    NONE = "none"

    #: Secondary image -> host-visible staging buffer -> primary image. The
    #: guaranteed floor: it works on any pair of devices (the colour
    #: RenderTarget already carries TRANSFER_SRC|TRANSFER_DST).
    STAGING_COPY = "staging_copy"

    #: Zero-copy via VK_KHR_external_memory_fd (dma-buf import/export). The rig
    #: optimisation; it requires the external-memory extensions to be enabled
    #: on both devices. Gated, and currently raises until the rig path is
    #: implemented.
    DMABUF = "dmabuf"
# Master switch for the dma-buf zero-copy transfer. CrossDeviceTransfer.run_dmabuf
# is not implemented yet (it raises NotImplementedError), so while this is False
# select_transfer_method never returns TransferMethod.DMABUF, whatever the caller
# prefers and whatever the enabled-extension capability reports; selecting it
# would only lead to a later raise with no working path.
# Set to True only once run_dmabuf is wired and verified on the multi-GPU rig.
_DMABUF_IMPLEMENTED = False
[docs]
def select_transfer_method(
    assignment: SRUAssignment,
    capabilities: RenderCapabilities | None,
    *,
    prefer_dmabuf: bool = True,
) -> str:
    """Choose the transfer strategy for a single SRU assignment (pure, no Vulkan).

    - An SRU that stays on the primary needs nothing moved:
      :data:`TransferMethod.NONE`.
    - An offloaded SRU gets :data:`TransferMethod.DMABUF` only when all three
      hold: the dma-buf path is implemented (module flag
      ``_DMABUF_IMPLEMENTED``), the caller prefers it, and the capability
      snapshot reports the external-memory-fd path *enabled*. In every other
      case it gets the always-available :data:`TransferMethod.STAGING_COPY`.

    The capability gate reads ``external_memory_fd_enabled`` from
    ``capabilities`` (a missing attribute or ``capabilities=None`` counts as
    ``False``). The extension has to be *enabled* at device creation, not just
    probed as available: a probed-but-not-enabled device cannot export/import
    fds, so DMABUF must not be picked for it. On this dev box the single-GPU
    path never enables it, so staging-copy is always the outcome and the
    dma-buf raise is never reached.
    """
    if not assignment.needs_transfer:
        return TransferMethod.NONE

    dmabuf_enabled = bool(getattr(capabilities, "external_memory_fd_enabled", False))
    # Enabling the extension at device creation must NOT activate a transfer
    # path that does not exist yet (run_dmabuf raises), hence the leading
    # implementation flag: until it flips, staging-copy remains the answer.
    use_dmabuf = _DMABUF_IMPLEMENTED and prefer_dmabuf and dmabuf_enabled
    if use_dmabuf:
        return TransferMethod.DMABUF
    return TransferMethod.STAGING_COPY
[docs]
@dataclass
class CrossDeviceTransfer:
    """Moves a finished offscreen colour image from a secondary to the primary.

    Holds the source (secondary) and destination (primary) :class:`DeviceSlot`
    plus the chosen :class:`TransferMethod`. ``src_done`` / ``dst_ready`` are
    the seam for explicit per-device fence/timeline handles, populated by the
    rig wiring; the staging-copy implementation below does not consume them and
    instead relies on each one-shot submit being completed by
    ``end_single_time_commands`` before the next step starts.

    Implemented:
        :meth:`run_staging_copy` performs the staging-copy sequence (secondary
        GPU copy-to-buffer -> host copy between the two staging buffers ->
        primary copy-to-image). This is the CPU-roundtrip floor: correct on any
        device pair, slower than dma-buf. It needs both devices' command pools
        and a host-visible staging buffer per device, supplied through
        :meth:`ensure_staging`, so it is exercised end-to-end only on the
        multi-GPU rig.

    Gated (rig):
        :meth:`run_dmabuf` raises a clear, actionable error until
        VK_KHR_external_memory_fd is enabled and the import/export is wired on
        the rig.
    """

    # Source (secondary) and destination (primary) device slots.
    src: DeviceSlot
    dst: DeviceSlot
    # One of the TransferMethod constants.
    method: str
    # Cross-device sync handles (per-device fence/timeline). Defined as the
    # seam; populated by the rig wiring.
    src_done: Any = None
    dst_ready: Any = None
    # Size in pixels of the transferred colour image (set once the SRU target
    # is known).
    width: int = 0
    height: int = 0
    # Bytes per pixel of the colour format being moved. The SubViewport
    # offscreen target is ``R16G16B16A16_SFLOAT`` (8 bytes per pixel); a generic
    # RT may differ. The host roundtrip is format-agnostic: it moves raw bytes,
    # so only the byte counts matter, not the channel layout.
    bytes_per_pixel: int = 8
    # Scratch storage; ``extra["staging"]`` caches the staging buffers.
    extra: dict = field(default_factory=dict)

    def run(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Execute the selected transfer, dispatching on :attr:`method`.

        :data:`TransferMethod.NONE` is a no-op (single-GPU / primary-resident
        SRU). The other two methods dispatch to their handlers.

        Raises:
            ValueError: If :attr:`method` is not a known transfer method.
        """
        if self.method == TransferMethod.NONE:
            return
        if self.method == TransferMethod.STAGING_COPY:
            self.run_staging_copy(src_image=src_image, dst_image=dst_image)
            return
        if self.method == TransferMethod.DMABUF:
            self.run_dmabuf(src_image=src_image, dst_image=dst_image)
            return
        raise ValueError(f"unknown cross-device transfer method {self.method!r}")

    def run_staging_copy(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Staging-copy floor: secondary image -> host staging -> primary image.

        ``src_image`` is the finished SRU colour image on the SECONDARY device
        (expected in ``SHADER_READ_ONLY_OPTIMAL`` and carrying
        ``TRANSFER_SRC_BIT``). ``dst_image`` is the primary-device image the
        main scene samples for this SubViewport feed (expected in
        ``SHADER_READ_ONLY_OPTIMAL`` and carrying ``TRANSFER_DST_BIT``). The
        caller is expected to have waited for the offscreen SRU render before
        calling, so the colour image is fully written.

        Sequence (both staging buffers are host-visible | host-coherent and are
        built by :meth:`ensure_staging`):

        1. SOURCE (secondary) device, one-shot command buffer:

           a. barrier ``src_image``:
              ``SHADER_READ_ONLY_OPTIMAL -> TRANSFER_SRC_OPTIMAL``.
           b. ``vkCmdCopyImageToBuffer`` ``src_image -> src_staging`` (tightly
              packed: ``bufferRowLength=0``, ``bufferImageHeight=0``).
           c. memory barrier ``TRANSFER_WRITE -> HOST_READ`` (stage
              ``TRANSFER -> HOST``) so the copied bytes are visible to the host
              read in step 2.
           d. barrier ``src_image``:
              ``TRANSFER_SRC_OPTIMAL -> SHADER_READ_ONLY_OPTIMAL`` so the
              secondary can use the image again next frame.
           e. submit on ``src.graphics_queue`` through
              ``end_single_time_commands``; the host step starts only after it
              returns.

        2. HOST: map both staging allocations, ``memmove``
           ``width * height * bytes_per_pixel`` bytes from ``src_staging`` to
           ``dst_staging``, unmap both (also on error). The memory is
           host-coherent, so no flush/invalidate is issued. The host write
           happens-before the primary GPU read because step 3 is submitted only
           afterwards on the same thread.

        3. DESTINATION (primary) device, one-shot command buffer:

           a. barrier ``dst_image``:
              ``SHADER_READ_ONLY_OPTIMAL -> TRANSFER_DST_OPTIMAL``.
           b. ``vkCmdCopyBufferToImage`` ``dst_staging -> dst_image``.
           c. barrier ``dst_image``:
              ``TRANSFER_DST_OPTIMAL -> SHADER_READ_ONLY_OPTIMAL`` so the main
              composite pass can sample it.
           d. submit on ``dst.graphics_queue`` through
              ``end_single_time_commands``.

        Raises:
            ValueError: If :attr:`method` is not STAGING_COPY or an image is
                missing.
            RuntimeError: If the staging buffers were never allocated, or are
                too small for the current image size.
        """
        if self.method != TransferMethod.STAGING_COPY:
            raise ValueError(f"run_staging_copy called for method {self.method!r}")
        if src_image is None or dst_image is None:
            raise ValueError("run_staging_copy needs both src_image (secondary) and dst_image (primary)")
        staging = self.extra.get("staging")
        if staging is None:
            raise RuntimeError(
                "run_staging_copy: per-device staging buffers not allocated; call "
                "ensure_staging(src_pool, dst_pool) first (the offload coordinator does)."
            )
        self._record_staging_copy(src_image, dst_image, staging)

    def ensure_staging(self, src_command_pool: Any, dst_command_pool: Any) -> dict:
        """Allocate (once) the per-device host-visible staging buffers + cache pools.

        Builds one host-visible | host-coherent buffer on each device, sized
        ``max(1, width * height * bytes_per_pixel)`` bytes. The result is cached
        in :attr:`extra` under ``"staging"`` so a steady-state per-frame
        transfer reuses it: when the size is unchanged only the command pools
        are refreshed; when it changed (e.g. a SubViewport resize) the old
        buffers are freed and new ones allocated.

        Returns:
            The staging dict with keys ``size``, ``src_buf``, ``src_mem``,
            ``dst_buf``, ``dst_mem``, ``src_pool`` and ``dst_pool``.
        """
        from .memory import create_buffer

        size = max(1, self.width * self.height * self.bytes_per_pixel)
        staging = self.extra.get("staging")
        if staging is not None and staging.get("size") == size:
            staging["src_pool"] = src_command_pool
            staging["dst_pool"] = dst_command_pool
            return staging
        if staging is not None:
            # Drop the cache entry BEFORE freeing: if the reallocation below
            # fails, ``extra`` must not keep pointing at destroyed buffers
            # (a later destroy() would free them a second time).
            del self.extra["staging"]
            self._destroy_staging(staging)

        host_memory = vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
        # The secondary's buffer is the DESTINATION of the image->buffer copy.
        src_buf, src_mem = create_buffer(
            self.src.device, self.src.physical_device, size,
            vk.VK_BUFFER_USAGE_TRANSFER_DST_BIT,
            host_memory,
        )
        # The primary's buffer is the SOURCE of the buffer->image copy.
        dst_buf, dst_mem = create_buffer(
            self.dst.device, self.dst.physical_device, size,
            vk.VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
            host_memory,
        )
        staging = {
            "size": size,
            "src_buf": src_buf, "src_mem": src_mem,
            "dst_buf": dst_buf, "dst_mem": dst_mem,
            "src_pool": src_command_pool, "dst_pool": dst_command_pool,
        }
        self.extra["staging"] = staging
        return staging

    def _record_staging_copy(self, src_image: Any, dst_image: Any, staging: dict) -> None:
        """Record + run the two single-submit copies and the host roundtrip.

        See :meth:`run_staging_copy` for the full sequence.

        Raises:
            RuntimeError: If the image no longer fits the staging buffers.
        """
        w, h, bpp = self.width, self.height, self.bytes_per_pixel
        nbytes = h * w * bpp
        size = staging["size"]
        # The buffers were sized when ensure_staging() ran. If the image has
        # grown since, both GPU copies and the host memmove would run past the
        # end of the allocations, so refuse before touching any memory.
        if nbytes > size:
            raise RuntimeError(
                f"run_staging_copy: staging buffers hold {size} bytes but the "
                f"{w}x{h} image at {bpp} bytes/pixel needs {nbytes}; call "
                "ensure_staging() again after changing the transfer size."
            )

        from .memory import begin_single_time_commands, end_single_time_commands

        # Tightly packed copy of the whole colour image; shared by both copies.
        region = vk.VkBufferImageCopy(
            bufferOffset=0, bufferRowLength=0, bufferImageHeight=0,
            imageSubresource=vk.VkImageSubresourceLayers(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0, baseArrayLayer=0, layerCount=1,
            ),
            imageOffset=vk.VkOffset3D(x=0, y=0, z=0),
            imageExtent=vk.VkExtent3D(width=w, height=h, depth=1),
        )

        # --- 1. SOURCE (secondary): image -> host staging buffer -------------
        cmd = begin_single_time_commands(self.src.device, staging["src_pool"])
        _barrier_image(
            cmd, src_image,
            vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
            vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_READ_BIT,
            vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        )
        vk.vkCmdCopyImageToBuffer(
            cmd, src_image, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, staging["src_buf"], 1, [region],
        )
        # Waiting for the submit to finish orders execution, but it does not by
        # itself make device writes visible to the host: that needs a memory
        # dependency whose destination is the HOST stage / HOST_READ access.
        host_visibility = vk.VkMemoryBarrier(
            srcAccessMask=vk.VK_ACCESS_TRANSFER_WRITE_BIT,
            dstAccessMask=vk.VK_ACCESS_HOST_READ_BIT,
        )
        vk.vkCmdPipelineBarrier(
            cmd, vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_HOST_BIT,
            0, 1, [host_visibility], 0, None, 0, None,
        )
        _barrier_image(
            cmd, src_image,
            vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            vk.VK_ACCESS_TRANSFER_READ_BIT, vk.VK_ACCESS_SHADER_READ_BIT,
            vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        )
        # end_single_time_commands submits + waits on the secondary queue: the
        # src_done fence-equivalent (the host read below cannot start until the
        # secondary GPU copy has fully completed).
        end_single_time_commands(self.src.device, self.src.graphics_queue, staging["src_pool"], cmd)

        # --- 2. HOST: secondary staging -> primary staging -------------------
        # try/finally so neither allocation is left mapped if a later step fails.
        src_ptr = vk.vkMapMemory(self.src.device, staging["src_mem"], 0, size, 0)
        try:
            dst_ptr = vk.vkMapMemory(self.dst.device, staging["dst_mem"], 0, size, 0)
            try:
                vk.ffi.memmove(dst_ptr, src_ptr, nbytes)
            finally:
                vk.vkUnmapMemory(self.dst.device, staging["dst_mem"])
        finally:
            vk.vkUnmapMemory(self.src.device, staging["src_mem"])

        # --- 3. DESTINATION (primary): host staging -> primary image ---------
        cmd = begin_single_time_commands(self.dst.device, staging["dst_pool"])
        _barrier_image(
            cmd, dst_image,
            vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
            vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_WRITE_BIT,
            vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT,
        )
        vk.vkCmdCopyBufferToImage(
            cmd, staging["dst_buf"], dst_image, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, [region],
        )
        _barrier_image(
            cmd, dst_image,
            vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            vk.VK_ACCESS_TRANSFER_WRITE_BIT, vk.VK_ACCESS_SHADER_READ_BIT,
            vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
        )
        # dst_ready fence-equivalent: waits on the primary queue, so the main
        # composite pass that samples dst_image runs strictly after the copy.
        end_single_time_commands(self.dst.device, self.dst.graphics_queue, staging["dst_pool"], cmd)

    def _destroy_staging(self, staging: dict) -> None:
        """Destroy both staging buffers and free their memory on their own devices."""
        vk.vkDestroyBuffer(self.src.device, staging["src_buf"], None)
        vk.vkFreeMemory(self.src.device, staging["src_mem"], None)
        vk.vkDestroyBuffer(self.dst.device, staging["dst_buf"], None)
        vk.vkFreeMemory(self.dst.device, staging["dst_mem"], None)

    def destroy(self) -> None:
        """Free the per-device staging buffers this transfer allocated (rig-side).

        A no-op when nothing was allocated; safe to call more than once.
        """
        staging = self.extra.pop("staging", None)
        if staging is not None:
            self._destroy_staging(staging)

    def run_dmabuf(self, *, src_image: Any = None, dst_image: Any = None) -> None:
        """Zero-copy dma-buf transfer (VK_KHR_external_memory_fd). Rig optimisation.

        Intended design: export the secondary colour image's memory as an
        opaque fd / dma-buf, import it on the primary as an external image, and
        composite directly with a single cross-device semaphore wait (no host
        roundtrip). Requires the external-memory-fd extension enabled on BOTH
        devices at init.

        Raises:
            NotImplementedError: Always, until the rig path is implemented.
        """
        raise NotImplementedError(
            "dma-buf zero-copy transfer (VK_KHR_external_memory_fd) is the rig "
            "optimisation: enable the external-memory-fd extension on both devices "
            "at init, export the secondary colour image memory as an fd, import it "
            "on the primary, and composite with a cross-device semaphore. Falls back "
            "to TransferMethod.STAGING_COPY when the capability is off (this box). "
            "Verified on the 4x Arc Pro B70 rig."
        )
# --- SRU offload routing (the SubViewport offload seam) ----------------------
[docs]
@dataclass(frozen=True, slots=True)
class OffloadRoute:
"""The per-SRU offload decision the recording path consults.
Computed once per frame by :class:`SRUOffloadCoordinator.plan` from the
ordered SRU list. ``device_index`` 0 means "render this SRU on the primary,
exactly as today" (``offloaded`` is ``False``, ``transfer`` is
:data:`TransferMethod.NONE`); a non-zero index means the SRU renders on that
secondary's renderer and its colour image is moved to the primary via
:attr:`transfer` before the main pass samples it.
"""
sru_id: int
device_index: int
transfer: str
[docs]
@property
def offloaded(self) -> bool:
"""True when this SRU renders on a secondary device (not GPU 0)."""
return self.device_index != 0
[docs]
class SRUOffloadCoordinator:
"""Decides + drives where each SubViewport SRU renders across the devices.
Glue between the (already-tested) :func:`assign_srus` policy, the
:func:`select_transfer_method` selector, and the :class:`MultiDeviceManager`
device topology. The recording path (``SceneAdapter.render_sru_from_plan`` /
``render_to_target``) consults a coordinator, when one is present, to decide
per SRU whether to take today's primary-device path or route the SRU to a
secondary device and transfer the result back.
Constructed only when the manager is actively multi-GPU (opted in AND >= 2
devices). On the single-GPU / unopted path the engine builds **no**
coordinator, so the recording path's ``coordinator is None`` branch is taken
and the frame is byte-identical to today.
The decision logic (``plan`` / ``route_for``) is pure and unit-tested with no
Vulkan. :meth:`render_offloaded` is the seam where a secondary-assigned SRU
would be recorded on its device's renderer and transferred back; that needs a
per-device renderer (``DeviceSlot.renderer``), which is ``None`` on every box
without the rig-side per-device-renderer construction, so it raises a clear,
capability-gated error rather than emitting unverifiable cross-device code.
"""
def __init__(self, manager: MultiDeviceManager, capabilities: RenderCapabilities | None = None,
*, prefer_dmabuf: bool = True, content_scale: tuple[float, float] = (1.0, 1.0),
secondary_renderer_factory: Any = None) -> None:
self._manager = manager
self._capabilities = capabilities
self._prefer_dmabuf = prefer_dmabuf
self._content_scale = content_scale
self._routes: dict[int, OffloadRoute] = {}
# Rig-injected builder: ``factory(facade) -> Renderer`` constructs a
# ``Renderer(facade)`` and runs its GPU ``setup()`` on the secondary device.
# The engine injects it (on the rig) so this module never imports the heavy
# renderer / triggers a GPU build it cannot verify on this single-GPU box.
# ``None`` => no secondary renderer can be built and :meth:`render_offloaded`
# raises the clear rig-completion error (the path here today).
self._secondary_renderer_factory = secondary_renderer_factory
# Per-secondary-device lazily-built render state, keyed by device_index:
# {index: {"facade": SecondaryRenderContext, "residency": SecondaryResidency,
# "target": RenderTarget, "transfer": CrossDeviceTransfer}}
# Built once, reused every frame; torn down in :meth:`destroy`.
self._secondary: dict[int, dict] = {}
# Free per-secondary GPU resources before the manager destroys the secondary
# devices (they own those resources). No-op on this box (nothing is built).
manager.register_teardown(self.destroy)
[docs]
@property
def active(self) -> bool:
"""True only when the backing manager is an opted-in multi-device renderer."""
return self._manager.multi_gpu
[docs]
def plan(self, srus: list[Any], *, sru_id: Any = None, cost: Any = None) -> list[OffloadRoute]:
    """Derive this frame's routes from the ordered SRU list and cache them.

    One :class:`OffloadRoute` is produced per assignment returned by
    :func:`assign_srus`, in the order that function returns them (the P1
    producer-before-consumer order of the input). While the coordinator is
    inactive the device count handed to the policy is 1, so every SRU stays on
    GPU 0 and the caller takes today's path.

    Args:
        srus: Ordered scene-render-units to place.
        sru_id: Optional id accessor forwarded to :func:`assign_srus`, for
            callers whose items are not ``SubViewportSRU`` (e.g. the
            synchronous path's live nodes).
        cost: Optional offload-cost accessor forwarded the same way.

    Returns:
        The routes, which are also retrievable per id via :meth:`route_for`.
    """
    device_count = 1
    if self.active:
        device_count = self._manager.device_count
    placements = assign_srus(srus, device_count, sru_id=sru_id, cost=cost)

    # Replace the previous frame's cache only once the policy has succeeded.
    by_id: dict[int, OffloadRoute] = {}
    self._routes = by_id
    ordered: list[OffloadRoute] = []
    for placement in placements:
        chosen = select_transfer_method(
            placement, self._capabilities, prefer_dmabuf=self._prefer_dmabuf,
        )
        decision = OffloadRoute(
            sru_id=placement.sru_id,
            device_index=placement.device_index,
            transfer=chosen,
        )
        ordered.append(decision)
        by_id[placement.sru_id] = decision
    return ordered
[docs]
def route_for(self, sru_id: int) -> OffloadRoute | None:
"""The cached route for ``sru_id`` from the last :meth:`plan`, or ``None``.
``None`` means "no decision recorded" (the SRU was not in the planned
list); callers treat that as "render on the primary", today's path.
"""
return self._routes.get(sru_id)
[docs]
def transfer_for(self, route: OffloadRoute, *, width: int = 0, height: int = 0) -> CrossDeviceTransfer:
    """Create the :class:`CrossDeviceTransfer` that serves an offloaded ``route``.

    The source is the manager slot at ``route.device_index``, the destination
    is the manager's primary slot, and the method is the one already chosen on
    the route, so the rig wiring has a single construction seam.

    Args:
        route: The routing decision to build a transfer for.
        width: Image width passed through to the transfer.
        height: Image height passed through to the transfer.
    """
    manager = self._manager
    source_slot = manager.slot(route.device_index)
    primary_slot = manager.primary
    return CrossDeviceTransfer(
        src=source_slot,
        dst=primary_slot,
        method=route.transfer,
        width=width,
        height=height,
    )
[docs]
def render_offloaded(self, route: OffloadRoute) -> Any:
"""Return the per-device renderer for an offloaded SRU, building it if needed.
Resolution order:
1. An explicitly attached renderer (``MultiDeviceManager.attach_renderer``)
is returned as-is (the rig may pre-build + bind one; also the unit-test
seam).
2. Otherwise, if a :attr:`_secondary_renderer_factory` was injected (rig),
lazily build the secondary :class:`SecondaryRenderContext` facade, run the
factory to construct + ``setup()`` a ``Renderer(facade)`` on the secondary
device, bind it to the slot, and return it.
3. With neither (this single-GPU box: no factory is ever injected because no
secondary device exists), raise the clear, capability-gated rig-completion
error. NEVER reached on the single-GPU / unopted path: that path has no
coordinator at all, so the seam's ``coordinator is None`` branch keeps it
byte-identical.
"""
slot = self._manager.slot(route.device_index)
if slot.renderer is not None:
return slot.renderer
ctx = self._ensure_secondary(route.device_index)
if ctx is None or ctx["facade"].renderer is None:
raise NotImplementedError(
f"Multi-GPU SubViewport offload to secondary device {route.device_index} "
f"({slot.name!r}) needs that device's own renderer (pipelines / descriptor "
"pools / transform+material SSBOs / mesh+texture residency). Inject a "
"secondary_renderer_factory (the engine does on the rig) or bind one via "
"MultiDeviceManager.attach_renderer(index, renderer); see multi_device.py "
"module docstring + design decision D8. Verified on the 4x Arc Pro B70 rig."
)
return ctx["facade"].renderer
def _ensure_secondary(self, device_index: int) -> dict | None:
"""Lazily build + cache the secondary render context for ``device_index``.
Builds (once) the :class:`SecondaryRenderContext` facade and, when a
:attr:`_secondary_renderer_factory` is present, a ``Renderer(facade)`` set up
on the secondary device + a :class:`SecondaryResidency` helper. Returns the
cached dict, or ``None`` when no factory is available (no secondary renderer
can exist; the caller raises the rig-completion error). GPU work happens only
inside the injected factory, so this stays import-safe and is never reached on
the single-GPU path.
"""
cached = self._secondary.get(device_index)
if cached is not None:
return cached
if self._secondary_renderer_factory is None:
return None
from .secondary_engine import SecondaryRenderContext, SecondaryResidency
slot = self._manager.slot(device_index)
facade = SecondaryRenderContext(
slot, capabilities=self._capabilities, content_scale=self._content_scale,
)
facade.ensure_command_pool()
# The render pass MUST exist before the factory builds Renderer(facade).setup():
# pipelines compile against facade.render_pass (it is format-compatible with the
# per-SRU RenderTarget; pipelines use dynamic viewport/scissor so extent is free).
facade.ensure_offscreen_render_pass()
renderer = self._secondary_renderer_factory(facade)
facade.attach_renderer(renderer)
cached = {"facade": facade, "residency": SecondaryResidency(facade), "target": None, "transfer": None}
self._secondary[device_index] = cached
return cached
[docs]
def render_sru_offloaded(self, sru: Any, primary_dst_image: Any) -> bool:
"""Render one SRU on its secondary device and transfer it to the primary image.
End-to-end multi-GPU SubViewport offload for one SRU (the
:class:`~simvx.graphics.renderer.render_packet.SubViewportSRU` plan):
1. Resolve the SRU's route (must be offloaded; else returns ``False`` so the
caller takes the primary path).
2. Ensure the secondary render context (facade + ``Renderer`` + residency).
3. Mirror the SRU's mesh geometry (and, when textured-residency is wired, its
sampled textures) onto the secondary device via :class:`SecondaryResidency`.
4. Size / create the secondary offscreen :class:`RenderTarget` for this SRU.
5. Record + submit the SRU's draws into that target on the secondary device
(one submit on the secondary graphics queue; the offscreen RenderTarget
leaves the colour image in ``SHADER_READ_ONLY_OPTIMAL``).
6. Run the :class:`CrossDeviceTransfer` (staging-copy floor) to move the
secondary colour image into ``primary_dst_image`` (the SubViewport's
primary-device bindless image the main scene samples).
Returns ``True`` when the SRU was handled on a secondary, ``False`` when it
was not offloaded (caller renders it on the primary as today). The actual
per-device GPU record + submit (step 5) is delegated to the injected secondary
renderer; this method owns the orchestration + the cross-device transfer.
Exercised on the 4x Arc Pro B70 rig; never reached on the single-GPU path.
"""
sru_id = getattr(sru, "sru_id", None)
route = self.route_for(sru_id) if sru_id is not None else None
if route is None or not route.offloaded:
return False
ctx = self._ensure_secondary(route.device_index)
if ctx is None or ctx["facade"].renderer is None:
# No secondary renderer -> defer to render_offloaded's clear raise so the
# caller surfaces the precise rig-completion message rather than silently
# dropping the SRU.
self.render_offloaded(route)
return False
facade = ctx["facade"]
residency = ctx["residency"]
width, height = int(sru.width), int(sru.height)
# 3. Mirror geometry the SRU draws onto the secondary device. Vertex-colour
# SRUs need only meshes; textured-SRU residency mirrors sampled textures too
# (FLAGGED: needs source pixels, see SecondaryResidency.ensure_textures). The
# source CPU arrays come from the primary registry's retained geometry (the
# engine enables retention when multi-GPU is active); a VkBuffer cannot cross
# devices, only the arrays can.
residency.ensure_meshes(self._sru_mesh_payloads(sru, self._primary_mesh_registry()))
# 4. Secondary offscreen target sized to the SRU (recreate on resize).
target = ctx["target"]
if target is None or (target.width, target.height) != (width, height):
from ..renderer.render_target import RenderTarget
if target is not None:
target.destroy()
target = RenderTarget(
facade.ctx.device, facade.ctx.physical_device, width, height,
colour_format=vk.VK_FORMAT_R16G16B16A16_SFLOAT,
use_depth=True, samplable_depth=True,
queue=facade.ctx.graphics_queue, command_pool=facade.ctx.command_pool,
)
ctx["target"] = target
ctx["transfer"] = None # size changed -> staging must be re-sized
# 5. Record + submit the SRU on the secondary renderer into ``target``. The
# injected secondary-renderer factory returns an object exposing
# ``render_sru_offscreen(sru, target)`` (duck-typed): on the rig it wraps a
# ``Renderer(facade)`` and mirrors the primary ``render_sru_from_plan`` slice
# model on its OWN device (reserve a transform-SSBO slice, upload the SRU
# transforms, record ``render_scene_content`` into ``target``, submit on the
# secondary graphics queue + wait). The core shared ``Renderer`` is NOT made
# to grow this method, so the single-GPU path stays byte-identical.
recorder = facade.renderer
# Bind residency (for primary->secondary MeshHandle remap) + mirror the
# primary material/light SSBO contents so the offloaded draws shade
# identically (set once per build is insufficient: materials change per
# frame, so mirror every frame from the primary renderer).
if hasattr(recorder, "set_residency"):
recorder.set_residency(residency)
primary_renderer = self._manager.primary.renderer
if primary_renderer is not None and hasattr(recorder, "set_materials"):
recorder.set_materials(getattr(primary_renderer, "_materials", None))
recorder.set_lights(getattr(primary_renderer, "_lights", None))
recorder.render_sru_offscreen(sru, target)
# 6. Transfer the secondary colour image into the primary bindless image.
transfer = ctx["transfer"]
if transfer is None:
transfer = self.transfer_for(route, width=width, height=height)
transfer.bytes_per_pixel = 8 # R16G16B16A16_SFLOAT
ctx["transfer"] = transfer
transfer.ensure_staging(facade.ctx.command_pool, self._primary_pool())
transfer.run(src_image=target.colour_image, dst_image=primary_dst_image)
return True
def _primary_pool(self) -> Any:
"""The primary device's command pool (for the destination half of the copy)."""
primary = self._manager.primary
renderer = primary.renderer
# The primary renderer's engine exposes ctx.command_pool; resolve via the
# attached renderer's engine when present, else the slot has no pool handle
# and the rig wiring supplies it. Kept as a single resolution seam.
eng = getattr(renderer, "_engine", None)
ctx = getattr(eng, "ctx", None)
if ctx is not None:
return ctx.command_pool
raise RuntimeError(
"primary command pool unavailable for cross-device transfer; attach the "
"primary renderer to slot 0 (MultiDeviceManager.attach_renderer(0, renderer))."
)
def _primary_mesh_registry(self) -> Any:
"""The primary renderer's :class:`MeshRegistry` (source of retained geometry).
Resolved off the primary slot's attached renderer engine
(``renderer._engine.mesh_registry``). The engine enables geometry retention
on that registry when multi-GPU is active, so :meth:`get_geometry` returns the
CPU arrays an offloaded SRU's meshes need to be mirrored onto a secondary
device. ``None`` when no primary renderer is attached (the residency mirror
then skips with a warning rather than indexing the wrong device's buffers).
"""
renderer = self._manager.primary.renderer
eng = getattr(renderer, "_engine", None)
return getattr(eng, "mesh_registry", None)
@staticmethod
def _sru_mesh_payloads(sru: Any, primary_registry: Any) -> list[tuple[int, Any, Any]]:
"""Extract ``(residency_key, vertices, indices)`` payloads from an SRU's instances.
``residency_key`` is ``id(mh)`` (the per-instance handle identity the secondary
recorder remaps against in :meth:`SecondarySRURenderer._remap_instances`), and
the CPU ``vertices``/``indices`` are fetched from the primary
:class:`~simvx.graphics.renderer.mesh_registry.MeshRegistry` by the handle's
registry ``id`` (the device-independent source arrays the registry retained
when multi-GPU is active). A handle whose geometry is not retained (no
``primary_registry`` or a registry without retention) is skipped, and the
recorder drops that instance with a warning rather than indexing the wrong
device's buffers. Dedup by ``id(mh)`` so a shared mesh mirrors once per device.
"""
if primary_registry is None or not hasattr(primary_registry, "get_geometry"):
return []
payloads: list[tuple[int, Any, Any]] = []
seen: set[int] = set()
for entry in list(getattr(sru, "instances", [])) + list(getattr(sru, "skinned_instances", [])):
mh = entry[0]
key = id(mh)
if key in seen:
continue
seen.add(key)
geom = primary_registry.get_geometry(getattr(mh, "id", None))
if geom is not None:
payloads.append((key, geom[0], geom[1]))
return payloads
[docs]
def destroy(self) -> None:
"""Tear down all lazily-built secondary render contexts (facade + target + transfer)."""
for ctx in self._secondary.values():
transfer = ctx.get("transfer")
if transfer is not None:
transfer.destroy()
target = ctx.get("target")
if target is not None:
target.destroy()
facade = ctx.get("facade")
if facade is not None:
facade.destroy()
self._secondary.clear()