# Source code for simvx.graphics.renderer.forward

"""Default renderer: Vulkan forward path; implements the RendererBackend ABC."""

from __future__ import annotations

import logging
import math
import threading
from typing import TYPE_CHECKING, Any

import numpy as np
import vulkan as vk

if TYPE_CHECKING:
    from ..engine import CubemapHandle

from ..gpu.descriptors import DescriptorWriteBatch
from ..gpu.memory import upload_numpy
from ..scene.frustum import Frustum
from ..types import (
    ALPHA_BLEND,
    LIGHT_DTYPE,
    MATERIAL_DTYPE,
    MeshHandle,
    VertexStreams,
)
from ._base import RendererBackend
from .buffer_manager import SHADOW_DATA_SIZE, BufferManager
from .custom_post_process import CustomPostProcessPass
from .draw2d_pass import Draw2DPass
from .environment_sync import EnvironmentSync
from .gizmo_pass import GizmoRenderData
from .gpu_batch import GPUBatch
from .light2d_pass import Light2DPass
from .overlay_renderer import OverlayRenderer
from .pass_orchestrator import PassOrchestrator
from .pipeline_manager import PipelineManager
from .post_process import PostProcessPass
from .scene_renderer import SceneContentRenderer
from .shadow_renderer import ShadowRenderer
from .ssao_pass import SSAOPass
from .ssgi_pass import SSGIPass
from .ssr_pass import SSRPass
from .text_pass import TextPass
from .viewport_manager import ViewportManager

__all__ = ["Renderer"]

log = logging.getLogger(__name__)

# Fallback ambient colour (four float32 components). Per the original note it
# serves the IBL-only fallback path, which bypasses the regular shadow-pass
# upload.
_DEFAULT_AMBIENT = np.asarray((0.15, 0.15, 0.2, 1.0), dtype=np.float32)



# [docs] -- stray Sphinx viewcode link marker (extraction residue, not code)
class Renderer(RendererBackend):
    """Default Vulkan forward renderer: multi-draw indirect, per-viewport frustum culling."""

    def __init__(self, engine: Any, max_objects: int = 10_000):
        """Initialise the renderer's CPU-side state.

        Only bookkeeping is set up here: per-frame submission lists, feature
        gates, caches and the manager objects. Per the section comments below,
        the GPU resources themselves are created later, in ``setup``.

        Args:
            engine: The owning engine. Stored as ``_engine`` and handed to
                ``PipelineManager``, ``BufferManager`` and ``PassOrchestrator``.
            max_objects: Forwarded to ``BufferManager``; presumably the initial
                transform-SSBO capacity in slots (see ``_max_objects``).
        """
        self._engine = engine

        # Subsystems
        self.viewport_manager = ViewportManager()
        self._frustum = Frustum()

        # Pipelined render thread: serialises the renderer's per-frame-state
        # critical region. The main thread holds this while it runs
        # ``begin_frame`` + ``submit_scene`` + ``extract_render_packet`` (building
        # the next frame's submission lists); the render thread holds it while it
        # installs a packet onto the per-frame attributes and records + submits
        # the GPU frame. This makes the renderer-state-coupled CPU work mutually
        # exclusive (no torn reads of ``_instances`` et al.) while the GPU
        # *execution* (post-submit) still overlaps the main thread's next sim.
        # Never taken on the synchronous default path, so that path is unchanged.
        self._frame_state_lock = threading.RLock()

        # Delegated renderers. Each receives this renderer as a back-reference.
        # NOTE(review): they are constructed here, BEFORE most of the state below
        # is assigned -- presumably their constructors only store ``self`` and
        # read nothing yet; confirm before reordering or adding eager reads.
        self._scene_renderer = SceneContentRenderer(self)
        self._shadow_renderer = ShadowRenderer(self)
        self._overlay_renderer = OverlayRenderer(self)
        self._env_sync = EnvironmentSync(self)

        # Per-frame submission lists
        self._instances: list[tuple[MeshHandle, np.ndarray, int, int, int]] = []

        # Absolute base slot of the scene-render unit currently being recorded into
        # the shared transform SSBO. 0 for the main scene; an offscreen SRU
        # (SubViewport / reflection-probe face) sets this to its reserved slice base
        # so every draw it records uses ``first_instance = base + local_index`` and
        # samples its own transforms. Restored to 0 after each offscreen SRU.
        self._first_instance_base: int = 0

        # Stable identity of the scene-render unit currently being recorded. 0 is
        # the main scene; each SubViewport node / reflection-probe face passes its
        # own stable id. Used (with viewport id) to key the frustum visibility
        # cache so two SRUs sharing viewport_id==0 never collide.
        self._sru_id: int = 0

        # Base slot of the main scene's reserved transform-SSBO slice this frame.
        # -1 = not yet reserved; set by reserve_main_slice (called before offscreen
        # SRUs so the main scene is base 0), consumed + reset by _upload_transforms.
        self._main_base: int = -1
        # Per-frame guard for ``sync_render_state``: reset in ``begin_frame`` so the
        # WorldEnvironment sync (which may rebuild HDR pipelines) runs at most once
        # per frame, at the earliest caller (the frame loop hoists it ahead of the
        # offscreen RenderView/SubViewport recording that binds those pipelines).
        self._env_synced_this_frame: bool = False

        # One-time-warned flag for the independentBlend G-buffer guard:
        # set when ``apply_gbuffer_state`` first refuses activation on a device
        # without independentBlend, so the warning is logged once, not per frame.
        self._gbuffer_guard_warned: bool = False

        # GPU resources (created in setup)
        self._pipelines = PipelineManager(engine)
        self._buffers = BufferManager(engine, max_objects)
        self._passes = PassOrchestrator(engine)
        self._batch: GPUBatch | None = None

        # Per-object TAA velocity pass: lazily created the first frame TAA is
        # enabled, so the no-TAA path allocates nothing (byte-identical). Owns its
        # own RG16F target, prev/cur model SSBOs, and a dedicated indirect batch
        # (it records after the forward draws but reads at GPU-execution time).
        self._velocity_pass: Any = None
        self._velocity_batch: GPUBatch | None = None
        # Column-major model matrices captured during the per-frame transform
        # upload (only when TAA is on) -- handed to the velocity pass as "current".
        self._velocity_models: np.ndarray | None = None
        # Per-block (base_slot, col_major_models) for multimesh draws; populated in
        # _upload_multimesh_blocks when TAA is on, consumed by _render_velocity.
        self._velocity_mm_blocks: list[tuple[int, np.ndarray]] = []
        # Scene-tree structure version at the last velocity frame: a change means
        # instance row indices may have shifted, so prev<->cur pairing is unsafe
        # for one frame (force prev == cur -> zero velocity, no spike).
        self._velocity_structure_version: int = -1

        # Material/light data (set externally via set_materials/set_lights)
        self._materials: np.ndarray = np.zeros(1, dtype=MATERIAL_DTYPE)
        self._lights: np.ndarray = np.zeros(1, dtype=LIGHT_DTYPE)

        # Scene colour/depth copy targets for the opaque/transparent pass split.
        # Created in setup(); the split only fires when a material
        # this frame declares needs_scene_colour/depth, so it is zero-cost when
        # unused. ``_scene_read_this_frame`` is derived from ``_materials`` (which
        # rides the RenderPacket in pipelined mode) each pre_render.
        self._scene_copy: Any = None
        self._scene_read_this_frame: bool = False
        # A visible WaterSurface3D was submitted this frame (drives the
        # scene-copy split like a screen-reading material). False keeps the frame
        # byte-identical (zero-cost when no water is present).
        self._water_this_frame: bool = False
        # (structure_version, WaterSurface3D nodes) memo so the water tree walk
        # only reruns when the tree structure changes (see _collect_water_surfaces).
        self._water_cache: tuple[int, tuple[Any, ...]] = (-1, ())
        # Same pattern for the FFT ocean. A visible OceanSurface3D also
        # refracts the scene, so it forces the scene-copy split; the tree walk is
        # structure-cached so a scene with no ocean pays a single integer compare.
        self._ocean_this_frame: bool = False
        self._ocean_cache: tuple[int, tuple[Any, ...]] = (-1, ())

        # Per-frame particle submissions (stored here; passes consume them)
        self._particle_submissions: list[tuple[np.ndarray, int]] = []  # (data, count)
        # (emitter_id, emitter_config) tuples: ``emitter_id`` is the stable
        # ``id(node) & 0xFFFFFFFF`` minted by ``scene_adapter`` so that
        # ``ParticleCompute`` can key per-emitter SSBOs (matches web).
        self._gpu_particle_submissions: list[tuple[int, dict]] = []

        # Per-frame Billboard2D submissions: BILLBOARD_DTYPE rows for Sprite3D
        # quads + Text3D glyphs. Consumed by the depth-tested
        # Billboard2DPass inside render_scene_content; cleared in pre_render.
        self._billboard_submissions: list[np.ndarray] = []
        # Stable bindless slot of the MSDF atlas for Text3D glyph billboards
        # (registered lazily via msdf_atlas_slot(); -1 until first use).
        self._msdf_atlas_slot: int = -1
        self._msdf_atlas_view: Any = None

        # Active Camera2D mapping for this frame as ``(affine, screen_size)`` where
        # ``affine`` is the world->screen ``(a,b,c,d,tx,ty)`` and ``screen_size`` is the
        # logical viewport the affine was built against; ``None`` when no 2D camera is
        # active. Set by ``SceneAdapter._submit_2d_overlays`` and read by the particle
        # overlay so 2D particles share the sprites' canvas_transform.
        self._camera2d_affine: tuple[tuple[float, ...], tuple[float, float]] | None = None

        # Per-frame ShaderMaterial-backed submissions. Each entry is
        # (mesh_handle, transform, material_id, shader_material). SceneAdapter
        # populates this when a MeshInstance3D carries a custom shader;
        # ContentRenderer.render picks up the bucket and issues draws with
        # the per-material pipeline (lazy-compiled + cached via the manager).
        self._shader_material_submissions: list = []
        self._shader_material_manager: Any = None

        # Render-thread overrides for the subsystem submission buffers. ``None``
        # (default / synchronous path) means the pass reads its OWN live
        # submission list, byte-identical to the non-pipelined behaviour. In
        # pipelined mode ``install_packet`` binds the packet's OWNED copies here
        # so the render thread never reads the live subsystem lists (tilemap /
        # 2D-light / 3D-overlay text) the main thread is concurrently clearing +
        # rebuilding.
        self._packet_tilemap_layers: list | None = None
        self._packet_light2d_lights: list | None = None
        self._packet_light2d_occluders: list | None = None
        # Ordered SubViewport SRU plans the render thread replays (producers
        # first). ``None`` in the synchronous path (which walks the live tree in
        # ``pre_render``). Bound by ``install_packet`` in pipelined mode.
        self._packet_subviewport_srus: list | None = None

        # 2D item-pipeline submit: the only 2D path. The frame
        # loop publishes an immutable item view + the active Camera2D affine and
        # binds them here (sync path: ``set_item_view``; pipelined:
        # ``install_packet`` from the packet); ``render`` submits them through the
        # bindless co-batched submitter. ``None`` when the scene draws no 2D this
        # frame. The submitter is created lazily.
        self._packet_item_view: Any = None
        self._packet_item_camera: tuple = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
        self._item_submitter: Any = None
        # Bindless co-batched item submit + its owned pass (lazily created).
        self._bindless_submitter: Any = None
        self._bindless_pass: Any = None
        # 2D-in-HDR: a second bindless pass bound to the HDR target's render
        # pass, used to draw the world-space 2D lane into the HDR colour buffer
        # before tonemap so it gets exposure/tonemap/bloom (lazily created when
        # post-processing is enabled and the item path is active).
        self._hdr_bindless_submitter: Any = None
        self._hdr_bindless_pass: Any = None
        # Pure-2D bloom: ``_has_world_env`` is set by
        # env_sync when a WorldEnvironment node exists; ``_hdr_2d_only`` marks a
        # frame whose HDR pass carried ONLY 2D (no 3D) so the tonemap blit uses
        # linear (no ACES crush of flat 2D art).
        self._has_world_env: bool = False
        self._hdr_2d_only: bool = False

        # Per-CanvasLayer post. ``_layer_post_specs`` maps band ->
        # LayerPostSpec for every CanvasLayer whose ``environment`` is set AND
        # carries an applicable effect; populated by env_sync ONLY when such a layer
        # exists (empty dict otherwise, the zero-cost-when-unused contract). The
        # per-band HDR target + bloom + composite (LayerPostChain) is created
        # lazily, the first frame a band is opted-in AND present this frame, and
        # cached here by band; an empty specs dict means the cache stays empty and
        # the renderer takes the EXACT existing global path.
        self._layer_post_specs: dict[int, Any] = {}
        self._layer_post_chains: dict[int, Any] = {}
        # Post bands that drew this frame (ascending), computed in pre_render and
        # consumed by render() to interleave composites with the global 2D submit.
        self._frame_post_bands: list[int] = []

        # Reflection probes: captured local IBL bound at forward bindings 10/11/12.
        # Created lazily in setup() so the shared cubemap arrays exist before the
        # first probe captures.
        self._reflection_probe_pass: Any = None
        self._probe_descriptors_bound: bool = False

        # Irradiance volumes: SH-L1 diffuse GI bound at forward
        # binding 18. Created lazily in setup(); ``_irradiance_volume_enabled``
        # gates the uber SH sample (0 until a volume bakes, so feature-off is
        # byte-identical).
        self._irradiance_volume_pass: Any = None
        self._irradiance_volume_enabled: bool = False
        # Decals: whether any Decal3D was packed last frame, so a
        # scene that clears its decals uploads one final zeroed buffer then stops.
        self._decals_this_frame: bool = False

        # GPU Hi-Z occlusion culling gate, driven by WorldEnvironment via
        # EnvironmentSync. Off by default: when off nothing here is allocated or
        # dispatched and the default render path is byte-identical.
        self._occlusion_culling_enabled: bool = False
        # Lazily-created two-phase occlusion machinery. None until the first
        # occlusion-enabled frame; ``_occlusion_pass`` is the GPU phase-1/phase-2
        # cull, ``_depth_prepass`` the scratch-depth predicted-occluder pass, and
        # the two indirect batches (opaque, double-sided) are drawn TWICE per frame
        # (depth prepass: set A only; colour pass: A + set-B survivors).
        self._occlusion_pass: Any = None
        self._depth_prepass: Any = None
        self._occ_batch_opaque: GPUBatch | None = None
        self._occ_batch_double: GPUBatch | None = None
        # Two uint32[max_objects] visibility STORAGE buffers, ping-ponged by frame
        # parity: vis_prev[parity] is last frame's drawn set (set A seed), vis_next
        # the set this frame produces. (buf, mem) tuples; the list starts EMPTY
        # here (not None) and is presumably populated on the first enabled
        # frame. Filled with 1 (all visible) on first frame / resize / scene-
        # structure change so phase 1 draws everything and phase 2 culls nothing.
        self._vis_buffers: list[tuple[Any, Any]] = []
        self._vis_parity: int = 0
        # Scene-structure version at the last occlusion frame: a change means
        # instance row indices may have shifted, so vis_prev is stale -> refill all
        # visible for one frame (conservative, mirrors the velocity pass guard).
        self._occ_structure_version: int = -2
        # True once the Hi-Z pyramid has been built at least once. The phase-2 cull
        # keeps every set-B candidate until then (frame 0 / the frame after a
        # resize) so nothing is falsely culled against an empty pyramid.
        self._hiz_built_once: bool = False
        # Telemetry: sum of instance_count over the culled indirect commands of
        # the last occlusion dispatch (read back from the host-visible buffer).
        self._last_drawn_instance_count: int = 0

        # IBL enabled once a skybox cubemap is set
        self._ibl_enabled: bool = False
        # The IBLPass that precomputed the current sky's irradiance / prefiltered
        # specular / BRDF LUT. Retained (not cleaned up after process) so the
        # forward shader can sample the maps; replaced on each new skybox.
        self._ibl_pass: Any = None
        self._ibl_handle_id: int | None = None
        # Ambient lighting, driven by WorldEnvironment via EnvironmentSync.
        # Defaults match the legacy static _AMBIENT so a scene with no
        # WorldEnvironment renders identically to before. ``_ambient_energy``
        # scales the sky IBL ambient (ambient_colour.a in the shadow SSBO).
        self._ambient_colour: tuple[float, float, float] = (0.15, 0.15, 0.2)
        self._ambient_energy: float = 1.0
        # Pluggable ambient. ``_ambient_mode`` is the tier
        # ceiling packed into the shadow SSBO (0 = probe = today's full layered
        # ambient, 1 = ibl, 2 = flat), driven by WorldEnvironment.ambient_mode.
        # The hook gates are flipped by the SSR / SSGI
        # producers; both False means set0 b16/b17 are never sampled.
        self._ambient_mode: int = 0
        self._indirect_specular_enabled: bool = False
        self._indirect_diffuse_enabled: bool = False

        # Screen-space reflections. Lazily created the first
        # frame SSR is active (like SSAO/velocity); writes the reflected radiance
        # into the b16 indirect-specular hook. None + ``_indirect_specular_enabled``
        # False keeps every SSR-off frame byte-identical (b16 stays the placeholder,
        # the uber never samples it). Driven by ``apply_ssr_state`` from env-sync.
        self._ssr_pass: SSRPass | None = None

        # Screen-space global illumination: the diffuse sibling
        # of SSR. Lazily created the first frame SSGI is active; writes the bounced
        # indirect diffuse into the b17 indirect-diffuse hook. None +
        # ``_indirect_diffuse_enabled`` False keeps every SSGI-off frame
        # byte-identical (b17 stays the placeholder). Driven by ``apply_ssgi_state``.
        self._ssgi_pass: SSGIPass | None = None

        # Debug view mode: a global diagnostic that makes the
        # uber shader output one surface channel instead of the lit result. 0 is
        # off (byte-identical to the lit path); set via the ``debug_view``
        # property. Packed into the shadow SSBO at offset 368.
        self._debug_view: int = 0

        # Reflection-probe quality dials, written by
        # the env-sync spec each frame. None = built-in defaults (top-2 blend,
        # 128 px faces): the exact pre-knob code paths. Transitions are applied
        # by ``apply_probe_quality`` (idempotent, called from env sync).
        self._probe_blend_count: int | None = None
        self._probe_face_size: int | None = None

        # Main-view resolution scale. Sizes the whole HDR
        # chain at ``ceil(output_extent * render_scale)``; the tonemap fullscreen
        # draw bilinearly upscales to the swapchain. 1.0 keeps every extent and
        # code path byte-identical to the unscaled renderer. Driven by
        # ``WorldEnvironment.render_scale`` through the env-sync spec; the
        # ``render_scale`` property applies the transition (device drain + HDR
        # chain rebuild, the same idiom as a window resize / G-buffer toggle).
        self._render_scale: float = 1.0

        # WorldEnvironment-sourced wind/wetness for the FrameGlobals UBO.
        # Written each frame by EnvironmentSync; calm/dry defaults keep a no-env
        # scene feature-off and byte-identical.
        from .. import frame_globals as _fg
        self._frame_globals_env = _fg.FrameGlobalsEnv()

        # Skinned mesh instances (mesh_handle, transform, material_id, joint_matrices)
        self._skinned_instances: list[tuple[MeshHandle, np.ndarray, int, np.ndarray]] = []

        # MultiMesh blocks submitted this frame as vectorized (N,4,4) entries
        # (mesh_handle, transforms, material_id, material_ids|None, viewport_id) -- NOT
        # exploded into per-instance _instances. _prepare_multimesh expands only
        # transparent blocks back into _instances; opaque blocks always stay fast-path.
        # _multimesh_draws holds the resolved draw records:
        #   (mesh_handle, base_slot, count, viewport_id, double_sided, centres, radii,
        #    union_aabb_min, union_aabb_max)
        # union_aabb_min/max are (3,) float32 arrays for coarse occlusion culling;
        # None when occlusion culling is disabled (saves compute).
        # NOTE(review): the ``_multimesh_blocks`` annotation below declares 7 tuple
        # fields while the layout described above lists 5 -- confirm which is
        # current against the code that appends to this list.
        self._multimesh_blocks: list[tuple[MeshHandle, np.ndarray, int, np.ndarray | None, int, int, int]] = []
        self._multimesh_draws: list[
            tuple[MeshHandle, int, int, int, bool, np.ndarray, np.ndarray, np.ndarray | None, np.ndarray | None]
        ] = []
        self._multimesh_batch: GPUBatch | None = None
        # Static upload-skip cache. A static block whose version + reserved base
        # are unchanged keeps its transforms already resident in the SSBO, so the
        # per-frame pack + upload is skipped. Invalidated wholesale on a
        # transform-arena grow (slots move).
        # Entry layout: cache_key -> (version, base, count, centres, radii,
        # union_min, union_max, resident_frames). resident_frames is the set of
        # frame-in-flight slots whose ringed transform SSBO already holds this
        # block, so the upload is skipped only for a slot that already has it.
        # NOTE(review): this comment previously ALSO gave a shorter 5-field layout
        # (version, base, count, centres, radii); the 8-field form is assumed to
        # be the current one -- confirm against the code that writes the cache.
        self._mm_upload_cache: dict[int, tuple[Any, ...]] = {}
        # Cache keys touched this / last frame. Pruning is frame-based (not per
        # upload call) so multiple passes in a frame -- main scene + each
        # SubViewport SRU + reflection-probe faces -- each refresh their own keys
        # without evicting another pass's static blocks.
        self._mm_seen_this_frame: set[int] = set()
        self._mm_seen_last_frame: set[int] = set()
        # Latch: was a MultiMesh-only scene casting shadows last frame? Lets the
        # shadow passes run one extra (clearing) frame after the only caster is
        # removed, so stale shadow-SSBO state can't linger.
        self._mm_shadow_active: bool = False

        # Debug line rendering (OverlayRenderer lazily populates these)
        self._debug_pipeline: Any = None
        self._debug_pipeline_layout: Any = None
        self._debug_vb: Any = None
        self._debug_vb_mem: Any = None
        self._debug_vb_capacity: int = 0
        self._debug_vert_module: Any = None
        self._debug_frag_module: Any = None

        # Track whether HDR was rendered this frame (guards tonemap)
        self._hdr_rendered: bool = False

        # Starts False; presumably flipped once setup has completed (the code
        # that sets it is outside this method).
        self._ready: bool = False

    # -- Pipeline accessors (delegate to PipelineManager; keep attribute names
    # stable so SceneContentRenderer and other consumers keep working).

    @property
    def _pipeline(self) -> Any:
        return self._pipelines.opaque

    @property
    def _pipeline_layout(self) -> Any:
        return self._pipelines.opaque_layout

    @property
    def _nocull_pipeline(self) -> Any:
        return self._pipelines.nocull

    @property
    def _nocull_pipeline_layout(self) -> Any:
        return self._pipelines.nocull_layout

    @property
    def _transparent_pipeline(self) -> Any:
        return self._pipelines.transparent

    @property
    def _transparent_pipeline_layout(self) -> Any:
        return self._pipelines.transparent_layout

    @property
    def _skinned_pipeline(self) -> Any:
        return self._pipelines.skinned

    @property
    def _skinned_pipeline_layout(self) -> Any:
        return self._pipelines.skinned_layout

    @property
    def _vert_module(self) -> Any:
        return self._pipelines.vert_module

    @property
    def _frag_module(self) -> Any:
        return self._pipelines.frag_module

    @property
    def _skinned_vert_module(self) -> Any:
        return self._pipelines.skinned_vert_module

    # -- Buffer accessors (delegate to BufferManager; keep attribute names
    # stable so ShadowRenderer, SceneContentRenderer and OverlayRenderer
    # continue to access `r._shadow_mem`, `r._transform_mem` etc. unchanged).

    @property
    def _ssbo_layout(self) -> Any:
        return self._buffers.ssbo_layout

    @property
    def _ssbo_set(self) -> Any:
        return self._buffers.ssbo_set

    @property
    def _max_objects(self) -> int:
        """Live transform-SSBO capacity in slots (grows via the frame arena)."""
        return self._buffers.max_objects

    @property
    def _transform_mem(self) -> Any:
        return self._buffers.transform_mem

    @property
    def _material_mem(self) -> Any:
        return self._buffers.material_mem

    @property
    def _light_mem(self) -> Any:
        return self._buffers.light_mem

    @property
    def _shadow_buf(self) -> Any:
        return self._buffers.shadow_buf

    @property
    def _shadow_mem(self) -> Any:
        return self._buffers.shadow_mem

    @property
    def _joint_layout(self) -> Any:
        return self._buffers.joint_layout

    @property
    def _joint_set(self) -> Any:
        return self._buffers.joint_set

    @property
    def _joint_mem(self) -> Any:
        return self._buffers.joint_mem

    @property
    def _max_materials(self) -> int:
        return self._buffers.max_materials

    # -- Pass accessors (delegate to PassOrchestrator; keep old names stable).

    @property
    def _post_process(self) -> PostProcessPass | None:
        return self._passes.post_process

    @property
    def _custom_pp(self) -> CustomPostProcessPass | None:
        return self._passes.custom_pp

    @property
    def _ssao_pass(self) -> SSAOPass | None:
        return self._passes.ssao_pass

    @property
    def _hiz_pass(self) -> Any:
        return self._passes.hiz_pass

    @property
    def _volumetric_fog_pass(self) -> Any:
        return self._passes.volumetric_fog_pass

    @property
    def _taa_pass(self) -> Any:
        return self._passes.taa_pass

    @property
    def _skybox_pass(self) -> Any:
        return self._passes.skybox_pass

    @property
    def _shadow_pass(self) -> Any:
        return self._passes.shadow_pass

    @property
    def _point_shadow_pass(self) -> Any:
        return self._passes.point_shadow_pass

    @property
    def _particle_pass(self) -> Any:
        return self._passes.particle_pass

    @property
    def _particle_compute(self) -> Any:
        return self._passes.particle_compute

    @_particle_compute.setter
    def _particle_compute(self, value: Any) -> None:
        """OverlayRenderer lazily creates the compute pass on first GPU submission."""
        self._passes.particle_compute = value

    @property
    def _billboard2d_pass(self) -> Any:
        return self._passes.billboard2d_pass

    @property
    def _tilemap_pass(self) -> Any:
        return self._passes.tilemap_pass

    @property
    def _water_pass(self) -> Any:
        return self._passes.water_pass

    @property
    def _ocean_pass(self) -> Any:
        return self._passes.ocean_pass

    @property
    def _light2d_pass(self) -> Light2DPass | None:
        return self._passes.light2d_pass

    @property
    def _text_pass(self) -> TextPass | None:
        return self._passes.text_pass

    @property
    def _draw2d_pass(self) -> Draw2DPass | None:
        return self._passes.draw2d_pass


    # [docs] -- stray Sphinx viewcode link marker (extraction residue, not code)
    @property
    def draw2d_draw_count(self) -> int:
        """The 2D draw count of the last main-framebuffer frame (telemetry).

        The item pipeline submits through the bindless co-batched pass, so its
        ``last_frame_draw_count`` is the live count; ``0`` before the pass is
        first created (a scene that drew no 2D yet)."""
        bp = self._bindless_pass
        return int(bp.last_frame_draw_count) if bp is not None else 0



    # [docs] -- stray Sphinx viewcode link marker (extraction residue, not code)
    def set_item_view(self, view: Any, camera: tuple) -> None:
        """Bind a published 2D item view + camera affine for this frame's submit.

        The synchronous-path entry point: the frame loop calls this after
        publishing the item view, and ``render`` submits it through the bindless
        co-batched submitter. ``view=None`` means no 2D this frame. The pipelined
        path binds the same fields via ``install_packet`` from the packet's
        ``item_view``.
        """
        self._packet_item_view = view
        self._packet_item_camera = camera


    def _ensure_item_submitter(self) -> Any:
        """Lazily create the bindless co-batched item submitter.

        Builds a :class:`BindlessDraw2DPass` (the unified ``ui2d`` pipeline with
        per-vertex texture_id + is_msdf) against the swapchain render pass and
        wraps it in a :class:`BindlessItemSubmitter`. A run of consecutive items
        sharing only (clip, blend) -- across textures and sprite/glyph --
        collapses into ONE draw.
        """
        if self._bindless_submitter is None and self._draw2d_pass is not None:
            from ..render2d.submit import BindlessItemSubmitter
            from .bindless_draw2d_pass import BindlessDraw2DPass

            bpass = BindlessDraw2DPass(
                self._engine, text_pass=self._text_pass, light2d_pass=self._light2d_pass,
            )
            bpass.setup(render_pass=self._engine.render_pass, extent=self._engine.extent)
            self._bindless_pass = bpass
            self._bindless_submitter = BindlessItemSubmitter(bpass)
        return self._bindless_submitter

    def _ensure_hdr_item_submitter(self) -> Any:
        """Lazily create the HDR-target bindless item submitter (2D-in-HDR).

        Mirrors :meth:`_ensure_item_submitter` but binds the pass to the post-process
        HDR target's render pass (``R16G16B16A16_SFLOAT`` colour + depth), so the
        world-space 2D lane draws into the linear HDR buffer *before* tonemap. The
        unified ``ui2d`` pipeline is depth-test/-write OFF, so it is render-pass
        compatible with the HDR pass (which carries a depth attachment for the 3D
        geometry just drawn) without touching depth. Returns ``None`` until
        post-processing has built its HDR target.
        """
        pp = self._post_process
        if pp is None or not pp.enabled or pp.hdr_target is None:
            return None
        if self._hdr_bindless_submitter is None and self._draw2d_pass is not None:
            from ..render2d.submit import BindlessItemSubmitter
            from .bindless_draw2d_pass import BindlessDraw2DPass

            rt = pp.hdr_target
            bpass = BindlessDraw2DPass(
                self._engine, text_pass=self._text_pass, light2d_pass=self._light2d_pass,
            )
            bpass.setup(
                render_pass=rt.render_pass, extent=(rt.width, rt.height),
                gbuffer=bool(getattr(self._engine, "gbuffer_active", False)),
            )
            self._hdr_bindless_pass = bpass
            self._hdr_bindless_submitter = BindlessItemSubmitter(bpass)
        return self._hdr_bindless_submitter

    def _present_post_bands(self) -> list[int]:
        """Opted-in post bands that actually drew this frame, ascending.

        Returns ``[]`` unless ``_layer_post_specs`` is non-empty (the armed gate),
        so the per-frame cost on every scene that uses no per-layer environment is a
        single dict-empty check. When armed it does ONE ``np.unique`` over the
        published ``layer`` column and intersects with the opted-in bands.
        """
        if not self._layer_post_specs:
            return []
        view = self._packet_item_view
        if view is None or view.count == 0:
            return []
        present = {int(b) for b in np.unique(view.columns["layer"])}
        return sorted(present & self._layer_post_specs.keys())

    def _ensure_layer_post_chain(self, band: int) -> Any:
        """Get/create the per-band post chain (lazy, mirrors _ensure_hdr_item_submitter)."""
        chain = self._layer_post_chains.get(band)
        if chain is None:
            from .layer_post import LayerPostChain

            chain = LayerPostChain(self._engine, band, self._text_pass, light2d_pass=self._light2d_pass)
            self._layer_post_chains[band] = chain
        chain.configure(self._layer_post_specs[band])
        return chain

    def _reap_stale_layer_post_chains(self, live_bands: set[int]) -> None:
        """Destroy per-band chains whose band is no longer opted-in."""
        for band in [b for b in self._layer_post_chains if b not in live_bands]:
            self._layer_post_chains.pop(band).cleanup()

    def _has_world_2d(self) -> bool:
        """Whether this frame has any HDR-lane (world-space) 2D item (pure-2D bloom).

        Vectorised scan of the published item view's ``flags`` column for the HDR
        lane (same by-role + override rule as ``submit.item_in_hdr_lane``). Only
        consulted by ``_wants_2d_post`` (which the HDR-gate short-circuits for any
        3D scene), so it costs nothing on the 3D path or when post is off.
        """
        view = self._packet_item_view
        if view is None or view.count == 0:
            return False
        from ..render2d.item_list import ItemFlags

        f = view.columns["flags"]
        opt_in = (f & int(ItemFlags.HDR_OPT_IN)) != 0
        opt_out = (f & int(ItemFlags.HDR_OPT_OUT)) != 0
        screen = (f & int(ItemFlags.SCREEN_SPACE)) != 0
        return bool((opt_in | (~screen & ~opt_out)).any())

    def _wants_2d_post(self) -> bool:
        """Whether a 2D-only scene should enter the HDR pass for post FX (pure-2D).

        True only when a WorldEnvironment node enabled at least one 2D-applicable
        post effect AND there is world-space 2D this frame. The applicable effects
        are the screen-space, depth-independent ones that run in the tonemap pass
        (bloom, vignette, film grain, chromatic aberration, colour-grade LUT, FXAA);
        the depth/3D-only effects (SSAO, DoF, motion blur, fog) are NOT counted.
        Requires ``_has_world_env`` (the opt-in signal) since the renderer keeps its
        post setup defaults with no env, so the machinery stays off for every 2D
        scene that did not explicitly ask for it (zero cost when unused).
        """
        pp = self._post_process
        if not (pp and pp.enabled and self._has_world_env):
            return False
        any_2d_effect = (
            pp.bloom_enabled
            or pp.vignette_enabled
            or pp.grain_enabled
            or pp.chromatic_aberration_enabled
            or pp.fxaa_enabled
            or pp.lut_enabled
            or pp.crt_enabled
            or pp.pixelate_enabled
            or pp.blur_enabled
        )
        return bool(any_2d_effect and self._has_world_2d())

    @property
    def _grid_pass(self) -> Any:
        return self._passes.grid_pass

    @property
    def _gizmo_pass(self) -> Any:
        return self._passes.gizmo_pass

    @property
    def _gizmo_render_data(self) -> GizmoRenderData | None:
        return self._passes.gizmo_render_data

    @_gizmo_render_data.setter
    def _gizmo_render_data(self, value: GizmoRenderData | None) -> None:
        self._passes.gizmo_render_data = value


[docs]
    def setup(self) -> None:
        """Initialize GPU resources: called once after engine Vulkan init.

        Runs these steps in order: buffer manager setup plus the initial
        material/light uploads, the scene-copy fallback binding, the three
        indirect-draw batches, the 3D pipelines, the render passes, the
        reflection-probe pass (and its descriptor write), and the
        irradiance-volume pass. ``_ready`` is set once all of that has
        completed; if a non-default render scale was configured beforehand,
        ``resize`` is then invoked with the engine's current extent.
        """
        e = self._engine
        device = e.ctx.device
        phys = e.ctx.physical_device

        # SSBOs + descriptor sets + IBL cubemap placeholder
        self._buffers.setup()
        upload_numpy(device, self._buffers.material_mem, self._materials)
        upload_numpy(device, self._buffers.light_mem, self._lights)

        # Scene colour/depth copy: bind the 1x1 fallback to set0
        # b14/b15 so they are never unbound. The full-size textures are allocated
        # lazily the first frame a screen-reading material appears.
        from .scene_copy import SceneCopyTargets

        self._scene_copy = SceneCopyTargets(e)
        self._scene_copy.setup_fallback(self._buffers.ssbo_sets)

        # Batch renderers: separate indirect buffers so opaque/transparent
        # don't overwrite each other before the GPU executes draw commands.
        # An engine that does not expose ``_has_mdi`` is treated as MDI-capable.
        use_mdi = getattr(e, "_has_mdi", True)
        self._batch = GPUBatch(e, device, phys, max_draws=self._max_objects, use_mdi=use_mdi)
        self._transparent_batch = GPUBatch(e, device, phys, max_draws=1000, use_mdi=use_mdi)
        # MultiMesh fast-path: coalesced instanced draws live in their own batch
        # (one command per contiguous block-run), drawn after the per-instance passes.
        self._multimesh_batch = GPUBatch(e, device, phys, max_draws=1000, use_mdi=use_mdi)

        # 3D graphics pipelines (opaque, double-sided, transparent, skinned)
        self._pipelines.setup(self._buffers.ssbo_layout, self._buffers.joint_layout)

        # All render passes (shadow/particle/tilemap/2D/post-process/SSAO/fog/...)
        self._passes.setup(self._buffers.ssbo_layout, self._pipelines)

        # Reflection-probe pass: owns the shared cubemap arrays + box SSBO.
        from .reflection_probe_pass import ReflectionProbePass

        self._reflection_probe_pass = ReflectionProbePass(e)
        self._reflection_probe_pass.setup()
        # Bind the probe arrays once (capture writes into them in-place).
        self._buffers.write_probe_descriptors(
            self._reflection_probe_pass.get_irradiance_array_view(),
            self._reflection_probe_pass.get_prefilter_array_view(),
            self._reflection_probe_pass.get_sampler(),
        )

        # Irradiance-volume pass: SH-reduce compute + capture
        # scratch. Bakes IrradianceVolume3D grids into the b18 SH SSBO (bound by
        # BufferManager). Feature-off (no volume) is byte-identical.
        from .irradiance_volume_pass import IrradianceVolumePass

        self._irradiance_volume_pass = IrradianceVolumePass(e)
        self._irradiance_volume_pass.setup()

        # From here on the per-frame entry points that check ``_ready``
        # (begin_frame growth, probe/volume capture, decal packing) are live.
        self._ready = True

        # Render scale set before setup (programmatic use; the env-sync route
        # always lands after setup): the passes above were built at the output
        # extent, so rebuild the HDR chain once at the internal extent now.
        if self._render_scale != 1.0:
            self.resize(*e.extent)



[docs]
    def set_skybox(self, cubemap: CubemapHandle) -> None:
        """Install *cubemap* as the skybox and precompute its IBL maps when needed.

        The handle's view, sampler, image and memory are forwarded to the pass
        orchestrator and ``_ibl_enabled`` is switched on. The IBL precompute is
        keyed on ``id(cubemap)``: calling again with the handle whose id is
        already recorded skips it. For a new handle the previous IBL pass (if
        any) is cleaned up, a fresh one is set up and run on the cubemap, and
        its irradiance / prefiltered / BRDF-LUT views and sampler are written to
        the buffer manager's IBL descriptors.

        If the ``ibl_pass`` module cannot be imported, the skybox is still
        installed and the handle id recorded, but no IBL maps are produced.

        Raises:
            TypeError: *cubemap* is not a ``CubemapHandle``.
        """
        from ..engine import CubemapHandle

        if not isinstance(cubemap, CubemapHandle):
            got = type(cubemap).__name__
            raise TypeError(f"set_skybox expects a CubemapHandle (from Engine.load_cubemap), got {got}")

        self._passes.set_skybox(cubemap.view, cubemap.sampler, self._buffers, cubemap.image, cubemap.memory)
        self._ibl_enabled = True

        # Same handle as last time: its maps are already in place.
        handle_id = id(cubemap)
        if self._ibl_handle_id == handle_id:
            return

        try:
            from .ibl_pass import IBLPass
        except ImportError:
            # No IBL module available: keep the skybox, skip environment lighting.
            self._ibl_handle_id = handle_id
            return

        # Release the previous sky's maps before building the new ones.
        previous = self._ibl_pass
        if previous is not None:
            previous.cleanup()
            self._ibl_pass = None

        fresh = IBLPass(self._engine)
        fresh.setup()
        fresh.process_cubemap(cubemap.view, cubemap.sampler)

        # Keep the pass alive: its views are handed to the descriptor write below.
        self._ibl_pass = fresh
        self._ibl_handle_id = handle_id
        self._buffers.write_ibl_descriptors(
            fresh.get_irradiance_view(),
            fresh.get_prefiltered_view(),
            fresh.get_brdf_lut_view(),
            fresh.get_sampler(),
        )



[docs]
    @property
    def post_processing(self) -> PostProcessPass | None:
        """Access post-processing pass for configuration."""
        return self._post_process



[docs]
    @property
    def custom_post_processing(self) -> CustomPostProcessPass | None:
        """Access custom user post-process pass for configuration."""
        return self._custom_pp



[docs]
    def set_gizmo_data(self, data: GizmoRenderData | None) -> None:
        """Set gizmo render data for the current frame (or None to hide)."""
        self._gizmo_render_data = data



[docs]
    @property
    def ssao(self) -> SSAOPass | None:
        """Access SSAO pass for configuration."""
        return self._ssao_pass



[docs]
    def set_materials(self, materials: np.ndarray) -> None:
        """Set material array and upload to GPU (skips if unchanged)."""
        self._materials = self._buffers.set_materials(materials)



[docs]
    def set_lights(self, lights: np.ndarray) -> None:
        """Set light array and upload to GPU (skips if unchanged).

        Prepends uint32 light_count to match GLSL LightBuffer layout:
          [uint32 count][Light[0]][Light[1]]...
        """
        self._lights = lights
        self._buffers.set_lights(lights)


    def _directional_sun(self) -> tuple[Any, Any, float]:
        """Return (direction, colour, intensity) of the first directional light.

        Direction is the light's travel direction (w<0.5 marks directional in
        ``LIGHT_DTYPE``). Falls back to a default sun when none is present.
        """
        lights = self._lights
        if lights is not None and len(lights) > 0:
            for i in range(len(lights)):
                if lights[i]["position"][3] < 0.5:  # directional marker
                    direction = lights[i]["position"][:3].copy()
                    colour = lights[i]["colour"][:3].copy()
                    intensity = float(lights[i]["colour"][3])
                    if float(np.linalg.norm(direction)) > 1e-6:
                        return direction, colour, intensity
        return (np.array([-1.0, -1.0, -1.0], dtype=np.float32), np.array([1.0, 1.0, 1.0], dtype=np.float32), 1.0)

    def _collect_fog_volumes(self) -> list[Any]:
        """Gather the ``FogVolume3D`` nodes found by the outside-subviewport walk.

        Looks for the tree on the engine (``_scene_tree`` first, then
        ``scene_tree``) and returns ``[]`` when there is no tree or no root.
        Nodes whose ``visible`` attribute is falsy are left out; a node with no
        such attribute counts as visible.
        """
        from simvx.core.fog_volume import FogVolume3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        if not tree:
            return []
        root = getattr(tree, "root", None)
        if root is None:
            return []
        visible_volumes = []
        for volume in find_all_outside_subviewports(root, FogVolume3D):
            if getattr(volume, "visible", True):
                visible_volumes.append(volume)
        return visible_volumes

    def _collect_water_surfaces(self) -> list[Any]:
        """Return the visible ``WaterSurface3D`` nodes, re-walking the tree only on change.

        The walk result is memoised in ``_water_cache`` as
        ``(structure_version, nodes)`` and refreshed only when the tree's
        ``_structure_version`` (0 if absent) differs from the cached one.
        Visibility is filtered on every call from the cached nodes. With no
        tree or no root the cache is reset to ``(-1, ())`` and ``[]`` is
        returned.
        """
        from simvx.core.water import WaterSurface3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        root = getattr(tree, "root", None) if tree else None
        if root is None:
            self._water_cache = (-1, ())
            return []
        structure_version = int(getattr(tree, "_structure_version", 0))
        cache = self._water_cache
        if cache[0] != structure_version:
            # Structure changed since the last walk: rebuild the node tuple.
            cache = (structure_version, tuple(find_all_outside_subviewports(root, WaterSurface3D)))
            self._water_cache = cache
        return [surface for surface in cache[1] if getattr(surface, "visible", True)]

    def _submit_water_surfaces(self) -> None:
        """Submit every visible water surface to the water pass for this frame.

        ``_water_this_frame`` is cleared first and ends up true only when a
        water pass exists and at least one visible surface was found. For each
        surface the model matrix is built from its world position / rotation /
        scale, and the texture index of its ``reflection`` object (``-1`` when
        the surface has no reflection or the reflection has no ``texture``) is
        passed along with the material, subdivision count and size.
        """
        from simvx.core.math.matrices import mat4_from_trs

        self._water_this_frame = False
        water_pass = self._water_pass
        if water_pass is None:
            return
        surfaces = self._collect_water_surfaces()
        for surface in surfaces:
            model = mat4_from_trs(surface.world_position, surface.world_rotation, surface.world_scale)
            # Planar reflection is optional; -1 tells the pass there is none.
            reflection = getattr(surface, "reflection", None)
            if reflection is None:
                reflection_slot = -1
            else:
                reflection_slot = int(getattr(reflection, "texture", -1))
            water_pass.submit(
                model,
                surface.material,
                int(surface.subdivisions),
                tuple(surface.size),
                reflection_slot,
            )
        self._water_this_frame = bool(surfaces)

    def _collect_ocean_surfaces(self) -> list[Any]:
        """Return the visible ``OceanSurface3D`` nodes, re-walking the tree only on change.

        Same caching scheme as the water collector, using ``_ocean_cache``: the
        walk is redone only when the tree's ``_structure_version`` (0 if absent)
        differs from the cached version, while visibility is filtered on every
        call. With no tree or no root the cache is reset to ``(-1, ())`` and
        ``[]`` is returned.
        """
        from simvx.core.ocean import OceanSurface3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        root = getattr(tree, "root", None) if tree else None
        if root is None:
            self._ocean_cache = (-1, ())
            return []
        structure_version = int(getattr(tree, "_structure_version", 0))
        cache = self._ocean_cache
        if cache[0] != structure_version:
            # Structure changed since the last walk: rebuild the node tuple.
            cache = (structure_version, tuple(find_all_outside_subviewports(root, OceanSurface3D)))
            self._ocean_cache = cache
        visible_surfaces = []
        for surface in cache[1]:
            if getattr(surface, "visible", True):
                visible_surfaces.append(surface)
        return visible_surfaces

    def _submit_ocean_surfaces(self) -> None:
        """Submit every visible ocean surface to the ocean pass for this frame.

        ``_ocean_this_frame`` is cleared first and ends up true only when an
        ocean pass exists and at least one visible surface was found. Each
        surface contributes its model matrix (from world position / rotation /
        scale), material, subdivision count and size.
        """
        from simvx.core.math.matrices import mat4_from_trs

        self._ocean_this_frame = False
        ocean_pass = self._ocean_pass
        if ocean_pass is None:
            return
        surfaces = self._collect_ocean_surfaces()
        for surface in surfaces:
            model = mat4_from_trs(surface.world_position, surface.world_rotation, surface.world_scale)
            subdivisions = int(surface.subdivisions)
            extent = tuple(surface.size)
            ocean_pass.submit(model, surface.material, subdivisions, extent)
        self._ocean_this_frame = bool(surfaces)

    def _collect_reflection_probes(self) -> list[Any]:
        """Gather the ``ReflectionProbe3D`` nodes found by the outside-subviewport walk.

        Returns ``[]`` when the engine exposes no tree (``_scene_tree`` /
        ``scene_tree``) or the tree has no root. Nodes with a falsy ``visible``
        attribute are dropped; a missing attribute counts as visible.
        """
        from simvx.core.reflection_probe import ReflectionProbe3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        if not tree:
            return []
        root = getattr(tree, "root", None)
        if root is None:
            return []
        found = find_all_outside_subviewports(root, ReflectionProbe3D)
        return [probe for probe in found if getattr(probe, "visible", True)]


[docs]
    def capture_reflection_probes(self, cmd: Any, adapter: Any) -> bool:
        """Capture any new / requested reflection probes. Returns True if any captured.

        Driven from the app's pre_render hook (before the main render pass) so a
        capture this frame is visible the same frame. Probe faces, the per-probe
        IBL convolution, and the cube-array copy all record into the SAME primary
        frame command buffer *cmd*: there is no separate submit/wait, so an
        always-mode probe no longer stalls the queue every frame. Probe face
        rendering reuses the offscreen scene path and clobbers per-frame
        submission lists (which render_to_target saves/restores).
        """
        if not self._ready or self._reflection_probe_pass is None:
            return False
        tree = getattr(self._engine, "_scene_tree", None) or getattr(self._engine, "scene_tree", None)
        if tree is None:
            return False
        probes = self._collect_reflection_probes()
        if not probes:
            return False
        # Sync the WorldEnvironment (and install the skybox IBL on first use) BEFORE
        # the probe faces bind the forward descriptor set into *cmd*. set_skybox
        # updates that persistent set; doing it after a bind in the same recording
        # (as renderer.pre_render would, running after this) invalidates the command
        # buffer. sync is idempotent (guarded by handle id) so the regular pre_render
        # call right after is a no-op. It also means the probe faces capture against
        # the correct, already-installed environment from the very first frame.
        self._env_sync.sync_world_environment()
        # probe_blend_count == 0 disables probes entirely: skip capture
        # and keep the box SSBO at count 0 so the forward blend loop is a no-op
        # (covers probes baked before the dial was flipped; hash-gated upload).
        if self._probe_blend_count == 0:
            self._reflection_probe_pass.disable_probes()
            return False
        return self._reflection_probe_pass.update_probes(cmd, adapter, tree, probes)


    def _collect_irradiance_volumes(self) -> list[Any]:
        """Gather the ``IrradianceVolume3D`` nodes found by the outside-subviewport walk.

        Returns ``[]`` when the engine exposes no tree (``_scene_tree`` /
        ``scene_tree``) or the tree has no root. Nodes with a falsy ``visible``
        attribute are dropped; a missing attribute counts as visible.
        """
        from simvx.core.irradiance_volume import IrradianceVolume3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        if not tree:
            return []
        root = getattr(tree, "root", None)
        if root is None:
            return []
        candidates = find_all_outside_subviewports(root, IrradianceVolume3D)
        return list(filter(lambda volume: getattr(volume, "visible", True), candidates))


[docs]
    def capture_irradiance_volumes(self, cmd: Any, adapter: Any) -> bool:
        """Bake budgeted irradiance-volume probes into the b18 SH SSBO.

        Driven from pre_render right after :meth:`capture_reflection_probes`, into
        the same primary *cmd*: the SH-reduce compute writes this frame's probes
        and a buffer barrier makes them visible to the main pass's uber reads, so
        a baked probe lights the scene the same frame. Flips
        ``_irradiance_volume_enabled`` (the uber SH gate) on when a volume is
        present; when none exist the flag stays off and the frame is
        byte-identical to a scene with no volume.
        """
        if not self._ready or self._irradiance_volume_pass is None:
            return False
        tree = getattr(self._engine, "_scene_tree", None) or getattr(self._engine, "scene_tree", None)
        if tree is None:
            return False
        volumes = self._collect_irradiance_volumes()
        active = [v for v in volumes if getattr(v, "bake_mode", "once") != "disabled"]
        # The gate follows presence of an active volume (not the per-frame bake):
        # a "once" volume finishes baking after a few frames but must keep
        # contributing, so the flag stays on while the volume is in the tree.
        self._irradiance_volume_enabled = bool(active)
        if not active:
            return False
        return self._irradiance_volume_pass.update_volumes(cmd, adapter, tree, active)


    def _collect_decals(self) -> list[Any]:
        """Gather the ``Decal3D`` nodes found by the outside-subviewport walk.

        Returns ``[]`` when the engine exposes no tree (``_scene_tree`` /
        ``scene_tree``) or the tree has no root. Nodes with a falsy ``visible``
        attribute are dropped; a missing attribute counts as visible.
        """
        from simvx.core.decal import Decal3D

        from ..scene_adapter import find_all_outside_subviewports

        engine = self._engine
        tree = getattr(engine, "_scene_tree", None) or getattr(engine, "scene_tree", None)
        if not tree:
            return []
        root = getattr(tree, "root", None)
        if root is None:
            return []
        visible_decals = []
        for decal in find_all_outside_subviewports(root, Decal3D):
            if getattr(decal, "visible", True):
                visible_decals.append(decal)
        return visible_decals

    def _resolve_decal_texture(self, node: Any, source: Any, cache_attr: str, colour_space: str) -> int:
        """Resolve a decal texture source to a bindless index, cached per node.

        ``source`` may be a path string or a numpy pixel array (both understood
        by ``TextureManager.resolve``). The index is cached on the node keyed by
        the source identity so a static decal uploads its texture once, not every
        frame. Returns -1 when there is no texture (empty source).
        """
        if source is None or (isinstance(source, str) and not source):
            return -1
        key = source if isinstance(source, str) else id(source)
        cached = getattr(node, cache_attr, None)
        if cached is not None and cached[0] == key:
            return cached[1]
        idx = self._engine.texture_manager.resolve(
            source, colour_space=colour_space, mipmaps=(colour_space == "srgb")
        )
        setattr(node, cache_attr, (key, idx))
        return idx

    def _pack_decals(self) -> None:
        """Pack visible ``Decal3D`` projectors into the b19 SSBO.

        Runs in pre_render (a host-visible SSBO upload, no command recording).
        A scene with no decal writes an all-zero buffer (``decal_count`` 0), so the
        uber projection loop stays a no-op and the frame is byte-identical: zero
        cost when unused. The box world TRS uses the node's world scale times its
        half-extents, so the inverse maps a world position inside the box into the
        cube [-1,1]^3 the shader tests.
        """
        if not self._ready:
            return
        from simvx.core.math.matrices import mat4_from_trs

        from .decal_pack import build_decal_buffer, compute_decal_tile_masks, pack_decal

        decals = self._collect_decals()
        if not decals and not self._decals_this_frame:
            return  # already clear; nothing to upload (stays zeroed / byte-identical)
        records: list[Any] = []
        models: list[np.ndarray] = []
        for node in decals:
            albedo_tex = self._resolve_decal_texture(node, node.texture, "_decal_albedo_slot", "srgb")
            normal_tex = self._resolve_decal_texture(node, node.normal_texture, "_decal_normal_slot", "linear")
            if albedo_tex < 0 and normal_tex < 0:
                continue  # no maps to project
            half = np.asarray(node.world_scale, dtype=np.float32) * np.asarray(node.size, dtype=np.float32)
            model = mat4_from_trs(node.world_position, node.world_rotation, half)
            models.append(model)
            records.append(
                pack_decal(
                    model,
                    node.modulate,
                    float(node.albedo_mix),
                    albedo_tex,
                    normal_tex,
                    float(node.normal_mix),
                    float(node.upper_fade),
                    float(node.lower_fade),
                    float(node.distance_fade_begin),
                    float(node.distance_fade_length),
                    int(node.cull_mask),
                )
            )

        # Standalone screen-tile decal cull: project each box to
        # a conservative screen AABB and set its bit in every tile it touches, so
        # the uber tests only the decals for a fragment's tile. Falls back to the
        # brute-force wire (grid_x=0) when there is no camera to project with.
        tile_masks = grid_x = grid_y = tile_px = None
        if records:
            vp = self._main_viewport_snapshot()
            if vp is not None:
                from .decal_pack import DECAL_TILE_PX

                width, height = self.internal_extent()
                grid_x, grid_y, tile_masks = compute_decal_tile_masks(
                    models, vp.camera_view, vp.camera_proj, width, height, DECAL_TILE_PX
                )
                tile_px = DECAL_TILE_PX

        if tile_masks is not None:
            buf = build_decal_buffer(
                records, tile_masks=tile_masks, grid_x=grid_x, grid_y=grid_y, tile_px=tile_px
            )
        else:
            buf = build_decal_buffer(records)
        self._buffers.write_decals(buf)
        self._decals_this_frame = bool(records)

    def _main_viewport_snapshot(self) -> Any:
        """The primary viewport's camera snapshot (view/proj), or None if unset.

        Used by the decal tile cull to project projector boxes into the
        same ``gl_FragCoord`` space the uber shades in. None selects the untiled
        brute-force fallback, so a camera-less frame stays byte-identical.
        """
        vpm = getattr(self, "viewport_manager", None)
        viewports = getattr(vpm, "viewports", None) if vpm is not None else None
        if not viewports:
            return None
        return viewports[0][1]

    # --- Renderer ABC ---


[docs]
    def init(self, device: Any, swapchain: Any) -> None:
        """Initialize (called by ABC contract: use setup() instead)."""
        self.setup()



[docs]
    def begin_frame(self) -> Any:
        """Begin frame: grow the transform arena if scheduled, then clear submission lists."""
        # Frame boundary (no recording in flight): safe to reallocate the transform
        # SSBO + slot-aligned resources before this frame records into them.
        if self._ready and self._buffers.maybe_grow():
            self._on_capacity_grown()
        self._buffers.begin_frame_arena()
        self._main_base = -1
        self._env_synced_this_frame = False
        # Reset the per-instance indirect batches ONCE per frame. They are shared
        # by every scene-render unit (SubViewports + main scene) recorded into the
        # single primary command buffer, so each SRU APPENDS its draws at the
        # batch's running offset (draw_range reads its own absolute offset) instead
        # of resetting mid-frame: a mid-frame reset+upload would overwrite the
        # indirect commands an earlier SRU's already-recorded draw reads at GPU
        # execution time. Occlusion batches are main-scene-only and reset in-pass.
        if self._batch is not None:
            self._batch.reset()
        if self._transparent_batch is not None:
            self._transparent_batch.reset()
        if self._velocity_batch is not None:
            self._velocity_batch.reset()
        self._instances.clear()
        self._particle_submissions.clear()
        self._gpu_particle_submissions.clear()
        self._billboard_submissions.clear()
        self._skinned_instances.clear()
        self._multimesh_blocks.clear()
        self._multimesh_draws.clear()
        self._velocity_mm_blocks.clear()
        # Frame-based static-upload-cache pruning: drop entries not touched by ANY
        # pass last frame (freed/hidden MultiMesh nodes), then roll the windows.
        self._mm_seen_last_frame = self._mm_seen_this_frame
        for dead in [k for k in self._mm_upload_cache if k not in self._mm_seen_last_frame]:
            del self._mm_upload_cache[dead]
        self._mm_seen_this_frame = set()
        if self._multimesh_batch is not None:
            self._multimesh_batch.reset()
        self._shader_material_submissions.clear()
        self._passes.begin_frame()
        return None  # Command buffer managed by engine


    def _on_capacity_grown(self) -> None:
        """Resize per-instance batches + visibility buffers after a transform-arena grow.

        The transform/AABB SSBOs were already reallocated by ``BufferManager``; the
        per-instance indirect batches and the occlusion ping-pong visibility
        buffers are slot-aligned to capacity and must keep pace. Lazily-created
        passes (velocity, occlusion, vis) are grown only if they already exist.
        Runs at the frame boundary, outside recording.
        """
        e = self._engine
        cap = self._buffers.max_objects
        if self._batch is not None:
            self._batch.grow(e.ctx.physical_device, cap)
        for batch in (self._velocity_batch, self._occ_batch_opaque, self._occ_batch_double, self._multimesh_batch):
            if batch is not None:
                batch.grow(e.ctx.physical_device, cap)
        # The transform SSBO was reallocated: every cached static upload is stale.
        self._mm_upload_cache.clear()
        # The velocity pass sizes its own cur/prev model SSBOs to the OLD capacity.
        # Drop it so _render_velocity lazily recreates it at the grown capacity
        # (TAA-only; nothing exists to rebuild when TAA never ran).
        if self._velocity_pass is not None:
            self._velocity_pass.cleanup()
            self._velocity_pass = None
            self._velocity_structure_version = -1
        # Visibility SSBOs are uint32[capacity]; drop them so _ensure_vis_buffers
        # reallocates at the new size and refills all-visible for one frame.
        if self._vis_buffers:
            for buf, mem in self._vis_buffers:
                if buf:
                    vk.vkDestroyBuffer(e.ctx.device, buf, None)
                if mem:
                    vk.vkFreeMemory(e.ctx.device, mem, None)
            self._vis_buffers = []
            self._occ_structure_version = -2


[docs]
    def submit_instance(
        self,
        mesh_handle: MeshHandle,
        transform: np.ndarray,
        material_id: int = 0,
        viewport_id: int = 0,
        render_layers: int = 0,
    ) -> None:
        """Submit a mesh instance for rendering this frame.

        ``render_layers`` is the instance's render-layer bitmask, uploaded into the
        Transform SSBO so the decal uber can enforce ``Decal3D.cull_mask``.
        0 means "all layers" (every decal received).
        """
        self._instances.append((mesh_handle, transform, material_id, viewport_id, render_layers))



[docs]
    def submit_multimesh(
        self,
        mesh_handle: MeshHandle,
        transforms: np.ndarray,
        material_id: int = 0,
        material_ids: np.ndarray | None = None,
        viewport_id: int = 0,
        count: int = 0,
        cache_key: int = 0,
        version: int = 0,
    ) -> None:
        """Bulk-submit many instances of the same mesh: avoids per-instance Python loops.

        Args:
            mesh_handle: Shared mesh for all instances.
            transforms: (N, 4, 4) float32 array of model matrices.
            material_id: Material index for all instances (ignored if *material_ids* given).
            material_ids: Optional (N,) uint32 array of per-instance material indices.
            viewport_id: Viewport index.
            count: Ignored here (transforms already sliced); accepted for ABI parity with web.
            cache_key / version: stable per-node id + content generation; let
                ``_upload_multimesh_blocks`` skip re-uploading an unchanged static block.
        """
        if transforms.shape[0] == 0:
            return
        # Store the whole block; the fast path uploads + draws it as one instanced
        # command. _prepare_multimesh decides per block whether the fast path
        # applies, expanding to per-instance _instances only when it can't.
        self._multimesh_blocks.append(
            (mesh_handle, transforms, material_id, material_ids, viewport_id, cache_key, version)
        )



[docs]
    def submit_shader_instance(
        self,
        mesh_handle: MeshHandle,
        transform: np.ndarray,
        material_id: int,
        shader_material: Any,
    ) -> None:
        """Submit a MeshInstance3D that carries a ShaderMaterial.

        The per-material pipeline is lazy-compiled + cached on first use via
        ShaderMaterialManager. During the forward draw, the renderer splits
        these submissions into their own bucket, binds the custom pipeline,
        updates the uniform buffer, and draws one mesh at a time, per-
        material pipeline switching is the cost of the custom-shader path.
        """
        self._shader_material_submissions.append(
            (mesh_handle, transform, material_id, shader_material),
        )



[docs]
    def submit_particles(self, particle_data: np.ndarray) -> None:
        """Submit particle data for rendering this frame."""
        self._particle_submissions.append((particle_data, len(particle_data)))



[docs]
    def msdf_atlas_slot(self) -> int:
        """Ensure the MSDF atlas is uploaded + bound in the bindless array; return its slot.

        Used by the Billboard2DPass collector (scene_adapter) to bake the atlas
        slot into Text3D glyph rows. Must be called OUTSIDE any render pass (it
        does staging transfers + ``vkUpdateDescriptorSets`` via
        ``register_texture`` / ``update_texture``); ``submit_scene`` is such a
        point. Returns -1 when there is no text pass or no atlas yet.

        The slot is reused across frames; if the atlas view is rebuilt (font /
        glyph-set change), the same slot is rewritten so baked references stay
        valid (assign-once, like the bindless-pass atlas slot).
        """
        tp = self._text_pass
        if tp is None:
            return self._msdf_atlas_slot
        # Ensure the shared MSDF font/atlas exists: in a pure-3D scene with only
        # Text3D, nothing else loads it, so upload_atlas_if_dirty would see no
        # atlas. _ensure_font is idempotent.
        from ..draw2d import Draw2D

        Draw2D._ensure_font()
        tp.upload_atlas_if_dirty()
        view = getattr(tp, "_atlas_view", None)
        if view is None:
            return self._msdf_atlas_slot
        if self._msdf_atlas_slot < 0:
            self._msdf_atlas_slot = self._engine.register_texture(view, filter="linear")
            self._msdf_atlas_view = view
        elif view != self._msdf_atlas_view:
            self._engine.update_texture(self._msdf_atlas_slot, view)
            self._msdf_atlas_view = view
        return self._msdf_atlas_slot



[docs]
    def submit_billboards(self, billboards: np.ndarray) -> None:
        """Submit depth-tested billboard instances (Sprite3D / Text3D) for this frame.

        ``billboards`` is a ``BILLBOARD_DTYPE`` array (one row per Sprite3D quad or
        Text3D glyph). All submissions are concatenated and drawn in one call by
        :class:`Billboard2DPass` inside ``render_scene_content``.
        """
        if billboards is not None and len(billboards) > 0:
            self._billboard_submissions.append(billboards)



[docs]
    def draw_sprite_3d(
        self, *, texture_id: int, position: Any, width: float, height: float,
        uv: tuple[float, float, float, float], colour: Any = (1.0, 1.0, 1.0, 1.0),
        billboard: bool = True, centered: bool = True, offset: tuple[float, float] = (0.0, 0.0),
        alpha_cut: str = "disabled", render_priority: int = 0,
    ) -> None:
        """Emit one Sprite3D billboard quad (called by ``Sprite3D.on_draw``).

        A single ``BILLBOARD_DTYPE`` row is built by ``build_sprite3d_row``: the
        quad is anchored at ``position`` and sized ``width`` x ``height`` in
        world units on the camera-facing plane. With ``centered`` the quad stays
        on the anchor; otherwise its top-left corner sits on the anchor. The
        camera basis is applied GPU-side by Billboard2DPass.

        ``billboard``, ``alpha_cut`` and ``render_priority`` are accepted but
        are not forwarded to the row builder by this method. Nothing is queued
        when the builder returns ``None``.
        """
        from .billboard2d_pass import build_sprite3d_row

        quad_row = build_sprite3d_row(
            texture_id=texture_id,
            position=position,
            width=width,
            height=height,
            uv=uv,
            colour=colour,
            centered=centered,
            offset=offset,
        )
        if quad_row is None:
            return
        self._billboard_submissions.append(quad_row)



[docs]
    def draw_text_3d(
        self, *, text: str, position: Any, font_scale: float = 1.0, pixel_size: float = 0.01,
        colour: Any = (1.0, 1.0, 1.0, 1.0), alignment: str = "centre",
        offset: tuple[float, float] = (0.0, 0.0),
    ) -> None:
        """Emit a run of Text3D MSDF glyph billboards (called by ``Text3D.on_draw``).

        The unified glyph layout (:func:`layout_glyph_run`) is reused so kerning
        and size match every other text node. The pixel-space glyph quads are
        then mapped into world units (``pixel_size``) on the camera-facing
        plane, centred about the anchor, and one MSDF billboard row is emitted
        per glyph (``is_msdf`` set so Billboard2DPass decodes glyph AA). The
        MSDF atlas must already be bound bindless (the collector ensured the
        slot outside the render pass).

        Empty ``text`` queues nothing, and neither does a ``None`` result from
        the row builder.
        """
        from .billboard2d_pass import build_text3d_rows

        if not text:
            return
        # Ordering inside build_text3d_rows matters: the run is laid out FIRST
        # (which adds its glyphs to the shared MSDF atlas) and only THEN is
        # msdf_atlas_slot() invoked, so the GPU image contains the glyphs before
        # they are sampled. Both steps happen outside the render pass
        # (submit_scene), which is what makes the staging upload +
        # vkUpdateDescriptorSets in msdf_atlas_slot() legal.
        glyph_rows = build_text3d_rows(
            text=text,
            position=position,
            resolve_atlas_slot=self.msdf_atlas_slot,
            font_scale=font_scale,
            pixel_size=pixel_size,
            colour=colour,
            alignment=alignment,
            offset=offset,
        )
        if glyph_rows is None:
            return
        self._billboard_submissions.append(glyph_rows)



[docs]
    def submit_gpu_particles(self, emitter_config: dict, *, emitter_id: int = 0) -> None:
        """Submit a GPU particle emitter config for compute-shader simulation this frame.

        ``emitter_id`` is a stable per-node identifier (``id(node) & 0xFFFFFFFF``
        from :class:`SceneAdapter`). :class:`ParticleCompute` keys its
        persistent SSBOs by this id so multi-emitter scenes render
        correctly: matches the web ``GPUParticlePass`` ownership model.
        """
        self._gpu_particle_submissions.append((int(emitter_id) & 0xFFFFFFFF, emitter_config))



[docs]
    def submit_light2d(self, **kwargs) -> None:
        """Submit a 2D light for this frame (forwarded to Light2DPass)."""
        if self._light2d_pass:
            self._light2d_pass.submit_light(**kwargs)



[docs]
    def submit_occluder2d(self, polygon_vertices: list[tuple[float, float]]) -> None:
        """Submit a 2D occluder polygon for shadow casting this frame."""
        if self._light2d_pass:
            self._light2d_pass.submit_occluder(polygon_vertices)



[docs]
    def submit_skinned_instance(
        self,
        mesh_handle: MeshHandle,
        transform: np.ndarray,
        material_id: int,
        joint_matrices: np.ndarray,
    ) -> None:
        """Submit a skinned mesh instance with joint matrices for this frame."""
        self._skinned_instances.append((mesh_handle, transform, material_id, joint_matrices))



[docs]
    def install_packet(self, packet: Any) -> None:
        """Bind a :class:`RenderPacket`'s owned snapshot onto the per-frame state.

        Called on the RENDER THREAD (under ``_frame_state_lock``) immediately
        before recording a pipelined GPU frame. It rebinds the renderer's
        per-frame submission attributes to the packet's OWNED copies and rebuilds
        ``viewport_manager`` from the packet's viewport snapshots, so the existing
        ``_upload_transforms`` / ``pre_render`` / ``render`` body records from the
        snapshot instead of from live state the main thread is concurrently
        rebuilding. This is the same temporary-rebind pattern ``render_to_target``
        uses for offscreen SRUs (scene_adapter ~1046), here for the whole frame.

        The renderer's own ``set_materials`` / ``set_lights`` still own the GPU
        material/light SSBO uploads; this only restores the CPU-side arrays the
        recording reads, so passes that re-read ``_materials`` / ``_lights`` see
        the snapshot. (Those SSBOs are written by ``pre_render``/``_upload`` on
        this same render thread, so a single GPU buffer remains correct.)
        """
        self._instances = packet.instances
        self._skinned_instances = packet.skinned_instances
        self._shader_material_submissions = packet.shader_material_submissions
        self._particle_submissions = packet.particle_submissions
        self._gpu_particle_submissions = packet.gpu_particle_submissions
        self._billboard_submissions = packet.billboard_submissions
        # MultiMesh blocks are resolved (expanded/uploaded) by pre_render on this
        # render thread from the snapshot; clear the derived draw list so it is
        # rebuilt per packet rather than carried over.
        self._multimesh_blocks = packet.multimesh_blocks
        self._multimesh_draws = []
        self._materials = packet.materials
        self._lights = packet.lights
        # Bind the packet's owned subsystem submission snapshots so the tilemap /
        # 2D-light passes read these instead of their live singleton lists (which
        # the main thread is concurrently clearing + rebuilding). The pass
        # ``render`` sites consult these overrides when set.
        self._packet_tilemap_layers = packet.tilemap_layers
        self._packet_light2d_lights = packet.light2d_lights
        self._packet_light2d_occluders = packet.light2d_occluders
        # Bind the active Camera2D mapping so the render-thread particle overlay
        # billboards through the sprites' canvas_transform (None on 3D/no-camera).
        self._camera2d_affine = getattr(packet, "camera2d_affine", None)
        # Bind the ordered SubViewport SRU plans so ``pre_render`` replays them
        # from the snapshot (no live-tree walk on the render thread).
        self._packet_subviewport_srus = packet.subviewport_srus
        # Bind the published 2D item view (``None`` when the scene drew no 2D this
        # frame). The camera affine rides on the packet alongside it.
        self._packet_item_view = packet.item_view
        self._packet_item_camera = getattr(packet, "item_camera", (1.0, 0.0, 0.0, 1.0, 0.0, 0.0))
        vpm = self.viewport_manager
        vpm.clear()
        for snap in packet.viewports:
            vpm.create_viewport(
                snap.x,
                snap.y,
                snap.width,
                snap.height,
                snap.camera_view,
                snap.camera_proj,
                snap.render_target,
            )



[docs]
    def tilemap_layers(self) -> list:
        """Tilemap submissions the render pass should draw this frame.

        Returns the packet's owned snapshot when a packet is installed (pipelined
        render thread), else the live ``_tilemap_pass._submissions`` list
        (synchronous path, byte-identical). ``[]`` when there is no tilemap pass.
        """
        if self._packet_tilemap_layers is not None:
            return self._packet_tilemap_layers
        tm = self._tilemap_pass
        return tm._submissions if tm is not None else []



[docs]
    def light2d_submissions(self) -> tuple[list, list]:
        """``(lights, occluders)`` the 2D-light pass should render this frame.

        Packet-owned snapshots in pipelined mode, else the live
        ``_light2d_pass`` lists (synchronous path, byte-identical). ``([], [])``
        when there is no 2D-light pass.
        """
        if self._packet_light2d_lights is not None:
            return self._packet_light2d_lights, (self._packet_light2d_occluders or [])
        l2d = self._light2d_pass
        if l2d is None:
            return [], []
        return l2d._lights, l2d._occluders



[docs]
    def reserve_main_slice(self) -> int:
        """Reserve the main scene's transform-SSBO slice at the front of the frame.

        Called before any offscreen scene-render unit (SubViewport / probe face)
        reserves, so the main scene is the FIRST SRU and keeps base 0: its absolute
        ``first_instance`` indices equal the local indices (unchanged single-scene
        behaviour). Offscreen SRUs then reserve slices after it. Idempotent within a
        frame (the arena cursor only advances on the first call). Returns the base.
        """
        n_slots = len(self._instances) + len(self._skinned_instances)
        self._main_base = self._buffers.reserve_slots(n_slots)
        return self._main_base


    def _upload_transforms(self) -> None:
        """Upload ALL main-scene instance transforms into the main slice once per frame.

        The main scene's slice is reserved by :meth:`reserve_main_slice` (base 0,
        before offscreen SRUs). Skinned instances occupy the slots immediately after
        the opaque ones within that slice. This drives the high-water + overflow/grow
        logic via the reservation.

        When TAA is enabled, also capture the column-major model matrices so the
        velocity pass can plumb them through as current/previous-frame transforms.
        """
        # Reserve here too if the orchestrator did not pre-reserve (headless /
        # tests that call _upload_transforms directly): keeps the main scene base 0.
        if self._main_base < 0:
            self.reserve_main_slice()
        models = self._buffers.upload_transforms(
            self._instances,
            upload_aabbs=self._occlusion_culling_enabled,
            base=self._main_base,
        )
        self._main_base = -1
        pp = self._post_process
        self._velocity_models = models if (pp and pp.taa_enabled) else None

    def _has_shadow_casters(self) -> bool:
        """True when any uploaded light casts shadows (``params[3] >= 0.5``).

        Used to decide whether a MultiMesh-only scene needs to run the shadow
        passes at all: with no caster, skipping them avoids a per-frame
        shadow-SSBO map/upload that would otherwise cost nothing useful.
        """
        lights = self._lights
        if lights is None or len(lights) == 0:
            return False
        return bool(np.any(lights["params"][:, 3] >= 0.5))

    def _prepare_multimesh(self) -> None:
        """Decide per MultiMesh block whether the vectorized fast path applies.

        Eligible blocks (opaque material) stay as blocks and are uploaded/drawn
        vectorized; the shadow, occlusion, and velocity passes are extended to
        consume ``_multimesh_draws`` directly. Transparent blocks are the only
        ones expanded (depth sorting is inherently per-instance).  Must run
        BEFORE ``_upload_transforms`` so any expanded instances are counted.
        """
        if not self._multimesh_blocks:
            return
        mats = self._materials
        nmat = len(mats) if mats is not None else 0

        kept: list[tuple[MeshHandle, np.ndarray, int, np.ndarray | None, int, int, int]] = []
        for block in self._multimesh_blocks:
            mesh_handle, transforms, material_id, material_ids, viewport_id, _key, _ver = block
            # Per-instance colour blocks use auto-generated opaque materials.
            opaque_block = material_ids is not None or not (0 <= material_id < nmat)
            # Only blended materials need per-instance depth sorting; alpha
            # cutoff (alpha_mode 2) stays in the opaque pass (shader discard).
            transparent = not opaque_block and mats[material_id]["alpha_mode"] == ALPHA_BLEND
            if not transparent:
                kept.append(block)
                continue
            # Transparent: must expand for back-to-front depth sorting.
            n = transforms.shape[0]
            # MultiMesh render layers are not plumbed per-block yet; 0 = all layers
            # keeps the "everything receives every decal" behaviour.
            if material_ids is not None:
                for i in range(n):
                    self._instances.append((mesh_handle, transforms[i], int(material_ids[i]), viewport_id, 0))
            else:
                for i in range(n):
                    self._instances.append((mesh_handle, transforms[i], material_id, viewport_id, 0))
        self._multimesh_blocks = kept

    def _upload_multimesh_blocks(self) -> None:
        """Reserve slots + vectorized-upload each eligible MultiMesh block.

        Runs AFTER ``_upload_transforms`` so blocks take slots after the main
        per-instance slice. Records draw entries (slot range, world centres +
        radii for per-viewport frustum culling, union AABB for coarse occlusion
        culling) consumed by the scene renderer and shadow/velocity passes.

        Each entry appended to ``_multimesh_draws`` is
        ``(mesh_handle, base, count, vp_id, double_sided, centres, radii,
        union_min, union_max)``.

        When occlusion culling is enabled each block's UNION world-AABB is
        written into the AABB SSBO at slot ``base`` (the first slot of the block).
        The occlusion compute reads ``aabb[first_instance]`` per command, so one
        command per block with ``first_instance=base`` tests the union -- coarse,
        all-or-nothing, matching the Godot/Unreal MultiMesh semantics.

        Caching: a block with a truthy ``cache_key`` stores
        ``(version, base, count, centres, radii, union_min, union_max, resident)``
        in ``_mm_upload_cache``. A later call whose version, base and instance
        count all match reuses the metadata and only re-uploads the transforms
        for frame slots not yet listed in ``resident``. A ``cache_key`` of 0
        disables caching for that block.
        """
        # Only skips when at least one draw was recorded; a frame where every
        # block was dropped (upload returned None) would re-run on a second call.
        if self._multimesh_draws:  # already resolved this frame (idempotent guard)
            return
        from ..gpu.memory import upload_numpy
        from ..types import AABB_DTYPE

        occ = self._occlusion_culling_enabled
        pp = self._post_process
        taa = bool(pp and getattr(pp, "taa_enabled", False))
        mats = self._materials
        nmat = len(mats) if mats is not None else 0
        for mesh_handle, transforms, material_id, material_ids, vp_id, cache_key, version in self._multimesh_blocks:
            # double_sided is taken from the block-wide material only; blocks with
            # per-instance material_ids or an out-of-range id are single-sided.
            double_sided = bool(material_ids is None and 0 <= material_id < nmat and mats[material_id]["double_sided"])
            n = int(transforms.shape[0])
            # Slots are reserved before the cache check, so ``base`` is known when
            # deciding whether the cached upload still sits at the same place.
            base = self._buffers.reserve_slots(n)

            # Static upload-skip: an unchanged block (same version) that lands on the
            # same slots it already occupies keeps its transforms resident in the SSBO,
            # so the per-frame pack + upload is skipped. Cull centres/radii are reused.
            cached = self._mm_upload_cache.get(cache_key) if cache_key else None
            if cached is not None and cached[0] == version and cached[1] == base and cached[2] == n:
                _v, _b, count, centres, radii, union_min, union_max, resident = cached
                # The transform SSBO is ringed across frames in flight, so a block
                # cached from a DIFFERENT frame slot is not yet resident in THIS
                # frame's buffer: re-upload it there (metadata is frame-independent
                # and reused). Once every slot has it, all frames skip the upload.
                if self._engine.current_frame not in resident:
                    self._buffers.upload_transform_block(
                        transforms, base, material_index=material_id, material_ids=material_ids
                    )
                    resident.add(self._engine.current_frame)
            else:
                centres = self._buffers.upload_transform_block(
                    transforms, base, material_index=material_id, material_ids=material_ids
                )
                # No centres returned: the block records no draw this frame (the
                # slots reserved above are not released here).
                if centres is None:
                    continue
                count = int(centres.shape[0])  # may be clamped to remaining capacity
                centres = centres.astype(np.float32)
                # Per-instance world radius = mesh bound radius * max column scale.
                m3 = np.ascontiguousarray(transforms[:count, :3, :3], dtype=np.float32)
                col_norms_sq = np.sum(m3**2, axis=1)  # (count, 3): sum over rows = column norms²
                max_scale = np.sqrt(np.max(col_norms_sq, axis=1))  # (count,)
                radii = (float(mesh_handle.bounding_radius) * max_scale).astype(np.float32)
                # Compute union world-space AABB for coarse occlusion culling.
                # Transform the mesh LOCAL AABB corners by each instance matrix,
                # then take the min/max across all transformed corners.
                lo = np.asarray(mesh_handle.aabb_min, dtype=np.float32)
                hi = np.asarray(mesh_handle.aabb_max, dtype=np.float32)
                # The 8 corners of the local AABB as homogeneous points (w = 1).
                corners_local = np.array(
                    [
                        [lo[0], lo[1], lo[2], 1],
                        [hi[0], lo[1], lo[2], 1],
                        [lo[0], hi[1], lo[2], 1],
                        [hi[0], hi[1], lo[2], 1],
                        [lo[0], lo[1], hi[2], 1],
                        [hi[0], lo[1], hi[2], 1],
                        [lo[0], hi[1], hi[2], 1],
                        [hi[0], hi[1], hi[2], 1],
                    ],
                    dtype=np.float32,
                )  # (8, 4)
                mats_w = np.ascontiguousarray(transforms[:count], dtype=np.float32)  # (count, 4, 4)
                # world corners: (count, 8, 3) = mats_w[:, :3, :] @ corners_local.T
                world_corners = (mats_w[:, :3, :] @ corners_local.T).transpose(0, 2, 1)  # (count, 8, 3)
                union_min = world_corners.reshape(-1, 3).min(axis=0).astype(np.float32)
                union_max = world_corners.reshape(-1, 3).max(axis=0).astype(np.float32)
                if cache_key:
                    # Fresh entry: only the current frame slot holds the transforms.
                    resident = {self._engine.current_frame}
                    self._mm_upload_cache[cache_key] = (
                        version, base, count, centres, radii, union_min, union_max, resident
                    )

            # TAA velocity: capture col-major models at their absolute slot range so the
            # velocity shader reads the correct prev/cur pair by gl_InstanceIndex.
            if taa:
                mm_t = np.ascontiguousarray(transforms[:count], dtype=np.float32)
                self._velocity_mm_blocks.append((base, np.ascontiguousarray(mm_t.transpose(0, 2, 1))))

            # When occlusion culling is active, write the union AABB into the AABB SSBO
            # at slot ``base`` so the compute shader reads it for the block's one command.
            if occ and self._buffers.aabb_mem is not None and union_min is not None:
                aabb_entry = np.zeros(1, dtype=AABB_DTYPE)
                aabb_entry[0]["aabb_min"][:3] = union_min
                aabb_entry[0]["aabb_max"][:3] = union_max
                upload_numpy(
                    self._engine.ctx.device,
                    self._buffers.aabb_mem,
                    aabb_entry,
                    byte_offset=base * AABB_DTYPE.itemsize,
                )

            # Record that this cached block was submitted this frame.
            # NOTE(review): presumably used elsewhere to evict stale cache
            # entries -- confirm against the consumer of _mm_seen_this_frame.
            if cache_key:
                self._mm_seen_this_frame.add(cache_key)
            self._multimesh_draws.append(
                (mesh_handle, base, count, vp_id, double_sided, centres, radii, union_min, union_max)
            )

    def _structure_version(self) -> int:
        """Scene-tree structure version (bumps on add/remove/reparent); -1 if unknown."""
        tree = getattr(self._engine, "_scene_tree", None) or getattr(self._engine, "scene_tree", None)
        return int(getattr(tree, "_structure_version", -1)) if tree else -1

    def _render_velocity(self, cmd: Any, pp: PostProcessPass) -> None:
        """Record the per-object velocity pass (TAA only).

        Lazily creates the :class:`VelocityPass` and its dedicated indirect batch
        on the first TAA frame, uploads current/previous model matrices (prev ==
        cur on the first frame or after a tree-structure change), sets the
        UNJITTERED cur/prev VP, and re-draws opaque instances + MultiMesh blocks
        into the RG16F velocity target. Nothing here runs while TAA is off.
        """
        has_per_instance = self._velocity_models is not None
        has_mm = bool(self._velocity_mm_blocks)
        if not has_per_instance and not has_mm:
            return
        if self._velocity_pass is None:
            from .velocity_pass import VelocityPass

            e = self._engine
            # Velocity is an HDR-space pass: sized to the internal extent.
            w, h = self.internal_extent()
            self._velocity_pass = VelocityPass(e, self._max_objects)
            self._velocity_pass.setup(w, h)
            use_mdi = getattr(e, "_has_mdi", True)
            self._velocity_batch = GPUBatch(
                e,
                e.ctx.device,
                e.ctx.physical_device,
                max_draws=self._max_objects,
                use_mdi=use_mdi,
            )

        version = self._structure_version()
        structure_changed = version != self._velocity_structure_version
        self._velocity_structure_version = version

        vpass = self._velocity_pass
        vpass.set_frame_matrices(pp.taa_cur_vp, pp.taa_prev_vp)
        if self._velocity_models is not None:
            vpass.upload_models(self._velocity_models, structure_changed)
        # Upload MultiMesh block transforms into the velocity cur/prev SSBOs at
        # their absolute slot offsets so the velocity shader indexes them correctly.
        for mm_base, mm_models_T in self._velocity_mm_blocks:
            vpass.upload_model_block(mm_models_T, mm_base, structure_changed)
        vpass.render(cmd, self._scene_renderer)

    def _ensure_depth_prepass(self) -> Any:
        """Lazily build the scratch depth prepass on the first occlusion-enabled frame.

        Owns a full-res D32 samplable depth target the predicted occluders (set A)
        are drawn into; the mid-frame Hi-Z build reduces it. Only ever called from
        inside the ``_occlusion_culling_enabled`` guard.
        """
        if self._depth_prepass is None:
            from .depth_prepass import DepthPrepass

            e = self._engine
            # Scratch depth matches the HDR chain's internal extent.
            w, h = self.internal_extent()
            dp = DepthPrepass(e)
            dp.setup(w, h, self._buffers.ssbo_layout)
            self._depth_prepass = dp
        return self._depth_prepass

    def _ensure_hiz_pass(self) -> Any:
        """Lazily build the Hi-Z pyramid pass targeting the SCRATCH depth.

        Two-phase: the pyramid is built MID-FRAME from the scratch depth prepass
        (predicted occluders), not the HDR depth at end of frame. Only ever called
        from inside the ``_occlusion_culling_enabled`` guard, so the default
        (occlusion-off) path allocates nothing.
        """
        hiz = self._passes.hiz_pass
        if hiz is None:
            from .hiz_pass import HiZPass

            e = self._engine
            # Pyramid reduces the internal-extent scratch depth.
            w, h = self.internal_extent()
            dp = self._depth_prepass
            hiz = HiZPass(e)
            hiz.setup(w, h, dp.depth_view, dp.depth_image)
            self._passes.hiz_pass = hiz
        return hiz

    def _ensure_occlusion(self) -> Any:
        """Lazily create the two-phase occlusion-cull pass + its two indirect batches.

        Only ever called from inside the ``_occlusion_culling_enabled`` guard, so
        the default (occlusion-off) path allocates nothing.
        """
        if self._occlusion_pass is None:
            from .occlusion_cull import OcclusionTwoPhasePass

            e = self._engine
            self._occlusion_pass = OcclusionTwoPhasePass(e)
            self._occlusion_pass.setup()
            use_mdi = getattr(e, "_has_mdi", True)
            # These two indirect buffers carry the ``instance_count`` the
            # occlusion compute patches. The two-phase occlusion + Hi-Z island
            # stays on the GRAPHICS queue in both sync and async modes (it
            # interleaves a graphics depth-prepass between compute phases), so it
            # never crosses queues: EXCLUSIVE is correct and avoids the
            # compression loss CONCURRENT would impose. (If occlusion later moves
            # to the compute queue, mark these CONCURRENT then.)
            self._occ_batch_opaque = GPUBatch(
                e,
                e.ctx.device,
                e.ctx.physical_device,
                max_draws=self._max_objects,
                use_mdi=use_mdi,
            )
            self._occ_batch_double = GPUBatch(
                e,
                e.ctx.device,
                e.ctx.physical_device,
                max_draws=self._max_objects,
                use_mdi=use_mdi,
            )
        return self._occlusion_pass

    def _ensure_vis_buffers(self) -> tuple[Any, Any, Any, Any, bool]:
        """Allocate the two ping-pong visibility SSBOs on first use; refill on (re)init.

        Returns ``(vis_prev_buf, vis_next_buf, vis_prev_mem, vis_next_mem,
        refill_all)`` for the CURRENT parity. ``refill_all`` is True when the
        scene-structure version differs from the one recorded on the previous
        call or when no Hi-Z pyramid has been built yet (first enabled frame,
        after a resize, after a scene-structure change). In that case vis_prev
        has just been filled with 1 (all visible), so phase 1 draws everything
        and phase 2 culls nothing (conservative, never drops).
        """
        from ..gpu.memory import create_buffer

        engine = self._engine
        device = engine.ctx.device
        if not self._vis_buffers:
            # The visibility ping-pong SSBOs are written by the occlusion
            # compute, which stays on the graphics queue (see
            # _ensure_occlusion), so they never cross queues: EXCLUSIVE in both
            # modes.
            byte_size = self._max_objects * 4  # one uint32 per slot
            memory_props = vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
            for _ in range(2):
                buffer, memory = create_buffer(
                    device,
                    engine.ctx.physical_device,
                    byte_size,
                    vk.VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                    memory_props,
                )
                self._vis_buffers.append((buffer, memory))

        current_version = self._structure_version()
        refill_all = not self._hiz_built_once or current_version != self._occ_structure_version
        self._occ_structure_version = current_version

        parity = self._vis_parity
        prev_buf, prev_mem = self._vis_buffers[parity]
        next_buf, next_mem = self._vis_buffers[1 - parity]
        if refill_all:
            # Every slot marked visible, so set A == all instances this frame.
            all_visible = np.ones(self._max_objects, dtype=np.uint32)
            upload_numpy(device, prev_mem, all_visible)
        return prev_buf, next_buf, prev_mem, next_mem, refill_all

    def _two_phase_occlusion(self, cmd: Any, viewports: Any, pool: Any) -> None:
        """Record the complete two-phase occlusion cull into ``cmd``.

        The caller decides whether occlusion culling is enabled; this method only
        bails out when there is no viewport to cull for. It makes sure the depth
        prepass, the occlusion compute pass, the Hi-Z pass and the two ping-pong
        visibility buffers exist, then records, in order: phase-1 selection, the
        depth-only prepass, the Hi-Z build and the phase-2 cull.

        Args:
            cmd: Command buffer handle every step records into.
            viewports: Sequence of ``(id, viewport)`` pairs; only the first
                viewport's camera matrices are used for the depth prepass.
            pool: GPU timestamp pool, or a falsy value when timing is unavailable;
                each step is bracketed with ``begin``/``end`` when it is truthy.

        Side effects:
            Sets ``_hiz_built_once`` and flips ``_vis_parity`` so this frame's
            "next" visibility buffer is read as "previous" on the next frame.
        """
        if not viewports:
            return

        def open_scope(label: str) -> None:
            # Timestamp scopes are optional: skip when no pool is available.
            if pool:
                pool.begin(cmd, label)

        def close_scope(label: str) -> None:
            if pool:
                pool.end(cmd, label)

        # Lazily create every resource the four steps below depend on.
        self._ensure_depth_prepass()
        self._ensure_occlusion()
        self._ensure_hiz_pass()
        prev_vis, next_vis, prev_vis_mem, _next_mem, _refilled = self._ensure_vis_buffers()
        # Sampled BEFORE this frame's Hi-Z build flips the flag: the very first
        # frame has no pyramid to test against, so phase 2 is told to skip culling.
        first_frame_skip = not self._hiz_built_once

        scene = self._scene_renderer

        # Step 1 - phase-1 selection over the previous/next visibility buffers.
        open_scope("occlusion_phase1")
        scene.two_phase_select(cmd, viewports, prev_vis, next_vis, prev_vis_mem)
        close_scope("occlusion_phase1")

        # Step 2 - depth-only prepass of the selected set into the scratch depth,
        # using the first viewport's transposed, float32 view-projection matrix.
        open_scope("depth_prepass")
        _vp_id, main_vp = viewports[0]
        clip_from_world = main_vp.camera_proj @ main_vp.camera_view
        view_proj_T = np.ascontiguousarray(clip_from_world.T, dtype=np.float32)
        self._depth_prepass.render(cmd, scene, view_proj_T)
        close_scope("depth_prepass")

        # Step 3 - mid-frame Hi-Z pyramid build from the scratch depth.
        open_scope("hiz")
        self._ensure_hiz_pass().render(cmd)
        self._hiz_built_once = True
        close_scope("hiz")

        # Step 4 - phase-2 cull against the pyramid that was just built.
        open_scope("occlusion_phase2")
        scene.two_phase_cull(cmd, viewports, prev_vis, next_vis, first_frame_skip)
        close_scope("occlusion_phase2")

        # Swap the ping-pong roles for the next frame.
        self._vis_parity = 1 - self._vis_parity


[docs]
    def sync_render_state(self) -> None:
        """Push ``WorldEnvironment`` state into the renderer, once per frame.

        This is the part of :meth:`pre_render` that may be called early: the
        frame loop can invoke it BEFORE recording offscreen RenderView /
        SubViewport draws. The reason is that the G-buffer / SSR / SSGI
        activation inside ``sync_world_environment`` may ``vkDeviceWaitIdle`` and
        rebuild the HDR-pass pipelines; if an offscreen target had already
        recorded pipeline binds into the open command buffer, that rebuild would
        invalidate the buffer (a hard crash when SSR is combined with a live
        ``PlanarReflection3D``). Nothing here records into a command buffer, so
        running it early is safe.

        The ``_env_synced_this_frame`` latch (reset in :meth:`begin_frame`) makes
        the call idempotent within a frame: whoever calls first performs the
        sync, and the later call from :meth:`pre_render` does nothing. The result
        is exactly one activation transition per frame, always ahead of any
        offscreen pipeline binding. The method is also a no-op while the renderer
        is not ready.
        """
        needs_sync = self._ready and not self._env_synced_this_frame
        if needs_sync:
            # Latch first, then sync, so the flag is already set during the call.
            self._env_synced_this_frame = True
            self._env_sync.sync_world_environment()



[docs]
    def pre_render(self, cmd: Any) -> None:
        """Record offscreen passes (shadow maps, HDR) before main render pass begins.

        Does nothing until the renderer is ready. Otherwise the method runs, in
        this order:

        1. Environment sync (:meth:`sync_render_state`) and the FrameGlobals UBO
           repack (scene clock, internal/output extents, render scale).
        2. Water / ocean submission, decal packing, and the
           ``_scene_read_this_frame`` decision.
        3. Work that must happen outside a render pass: GPU particle compute,
           MultiMesh preparation, the transform upload, the MSDF atlas upload and
           the bindless atlas-slot sync.
        4. Per-CanvasLayer post bands (each rendered into its own HDR chain).
        5. The shadow-free shadow-data write, 2D lights, and the shadow passes.
        6. When post-processing is enabled and there is scene content (or a
           2D-only scene wants post): optional two-phase occlusion, optional TAA
           jitter, the HDR forward pass, then velocity, bloom, SSAO, SSR, SSGI,
           volumetric fog, TAA resolve and custom post-process effects.

        Args:
            cmd: Command buffer handle that every pass here records into.

        Side effects (per-frame state written here): ``_scene_read_this_frame``,
        ``_frame_post_bands``, ``_hdr_rendered``, ``_hdr_2d_only``,
        ``_mm_shadow_active``, ``_volumetric_fog_active``, and on the post-process
        object ``ssao_enabled`` and ``tonemap_set_index``.
        """
        if not self._ready:
            return

        # Sync WorldEnvironment properties to renderer (idempotent per frame; the
        # frame loop usually hoists this ahead of offscreen recording, see
        # sync_render_state, so here it is normally a guarded no-op).
        self.sync_render_state()

        # Repack the per-frame FrameGlobals UBO. ``sync_world_environment``
        # has already written ``_frame_globals_env`` from the WorldEnvironment (calm
        # defaults when there is none), so this just needs the scene clock + extent.
        tree = getattr(self._engine, "_scene_tree", None) or getattr(self._engine, "scene_tree", None)
        # No scene tree at all -> the clock is pinned at 0.0.
        now = float(tree.now) if tree is not None else 0.0
        # Internal (HDR-chain) extent + output extent + scale.
        # internal_extent() is identity when post is off, so the packed scale
        # must match (shaders are never told about an upscale that isn't there).
        internal = self.internal_extent()
        output = self._engine.extent
        scale = self._render_scale if tuple(internal) != tuple(output) else 1.0
        self._buffers.update_frame_globals(now, self._frame_globals_env, internal, output, scale)

        # Feature flags: used further down for the screen-read test on the
        # material table.
        from ..types import Feature

        # Collect + submit water surfaces. A visible WaterSurface3D
        # refracts the scene behind it, so its presence (like a screen-reading
        # material) forces the opaque/transparent pass split + scene copy.
        self._submit_water_surfaces()

        # Collect + submit FFT ocean surfaces. Like water, an ocean
        # refracts the scene behind it, so its presence forces the split. The GPU
        # FFT compute chain (spectrum -> Stockham IFFT -> assemble) runs
        # HERE (before the main render pass begins: compute writes to the sampled
        # displacement/slope arrays are illegal inside a render pass), driven by
        # the same scene clock + wind.
        self._submit_ocean_surfaces()
        op = self._ocean_pass
        if op is not None and self._ocean_this_frame:
            op.prepare(cmd, self._env_sync.cached_env, now)

        # Pack visible Decal3D projectors into the b19 SSBO. A
        # host-visible upload only (no command recording); with no decal it writes
        # a zeroed buffer so the uber loop is a no-op and the frame is byte-identical.
        self._pack_decals()

        # Scene-read scheduling: a screen-reading material this
        # frame (features bit set) triggers the opaque/transparent pass split +
        # scene colour/depth copy. Derived from ``_materials`` (which rides the
        # RenderPacket in pipelined mode), so it works identically in immediate
        # and pipelined recording. Zero-cost when absent: ``.any()`` over the
        # features column is a single vectorized scan and the split never fires.
        # Water and ocean surfaces submitted above force the same split.
        mats = self._materials
        self._scene_read_this_frame = bool(
            self._water_this_frame
            or self._ocean_this_frame
            or (mats.size and (mats["features"] & int(Feature.HAS_SCREEN_READ)).any())
        )

        # GPU timing pool (None if device lacks timestamp queries).
        pool = self._engine.current_timestamp_pool

        # Dispatch GPU particle compute shaders (must happen outside render pass)
        if self._gpu_particle_submissions:
            if pool:
                pool.begin(cmd, "particle_compute")
            self._overlay_renderer.dispatch_gpu_particles(cmd)
            if pool:
                pool.end(cmd, "particle_compute")

        # Resolve MultiMesh blocks (expand fallbacks into _instances) BEFORE the
        # main upload so expanded instances are counted + uploaded with the rest.
        self._prepare_multimesh()

        # Upload all transforms ONCE: shared by shadow and main passes
        self._upload_transforms()

        # Upload the eligible MultiMesh blocks into slots after the main slice.
        self._upload_multimesh_blocks()

        # Upload MSDF atlas outside render pass (staging transfers not allowed inside)
        # Single upload serves both TextPass (3D overlay) and Draw2DPass (2D UI text)
        if self._text_pass:
            self._text_pass.upload_atlas_if_dirty()

        # Register / refresh the MSDF atlas in the bindless ui_textures[] array
        # HERE (outside the render pass), so glyph items can reference it as a
        # per-vertex tex_id. register_texture/update_texture call vkUpdateDescriptorSets,
        # which must NOT happen while a command buffer using the set is recording -- so
        # it cannot live in the submitter's render(). Only when the item path is active.
        # The band list is cleared first and recomputed after this block.
        self._frame_post_bands = []
        if self._packet_item_view is not None:
            self._ensure_item_submitter()  # creates the bindless pass on first use
            if self._bindless_pass is not None:
                self._bindless_pass.sync_atlas_slot()
            # The HDR-lane pass shares the engine's bindless texture array, so it
            # mirrors the swapchain pass's atlas slot rather than re-registering it.
            if self._ensure_hdr_item_submitter() is not None and self._bindless_pass is not None:
                self._hdr_bindless_pass.set_atlas_slot(self._bindless_pass.atlas_slot)

        # Per-CanvasLayer post: render each opted-in band that drew
        # this frame into its OWN HDR target + bloom, HERE (outside any render pass:
        # each band chain opens its own offscreen pass). The composite onto the
        # swapchain happens in ``render`` at the band's z-slot. ``_present_post_bands``
        # is ``[]`` (one dict-empty check) for every scene with no per-layer
        # environment, so this whole block is skipped and the frame is byte-identical.
        self._frame_post_bands = self._present_post_bands()
        if self._frame_post_bands:
            self._reap_stale_layer_post_chains(set(self._layer_post_specs.keys()))
            win = self._engine._window
            # Window size falls back to (0, 0) when there is no window or it
            # cannot report one.
            ws = win.get_window_size() if win and hasattr(win, "get_window_size") else None
            ui_size = (ws[0], ws[1]) if ws else (0, 0)
            # -1 is passed as the atlas slot when no bindless pass exists.
            atlas_slot = self._bindless_pass.atlas_slot if self._bindless_pass is not None else -1
            for band in self._frame_post_bands:
                chain = self._ensure_layer_post_chain(band)
                chain.sync_atlas_slot(atlas_slot)
                chain.render_band_hdr(cmd, self._packet_item_view, self._packet_item_camera, ui_size)

        # Track whether HDR content was rendered this frame
        self._hdr_rendered = False

        # Update IBL flag in shadow buffer (even without shadow pass). Also runs
        # when an irradiance volume is active so its SH gate (offset 364) reaches
        # the GPU in a shadow-free, sky-free scene.
        if (self._ibl_enabled or self._irradiance_volume_enabled) and not self._shadow_pass:
            shadow_data = np.zeros(SHADOW_DATA_SIZE, dtype=np.uint8)
            # Three 4-byte fields are filled with 0xFFFFFFFF.
            # NOTE(review): presumably "no shadow map" sentinels -- confirm
            # against the shader-side layout.
            sentinel = np.array([0xFF, 0xFF, 0xFF, 0xFF], dtype=np.uint8)
            shadow_data[208:212] = sentinel
            shadow_data[220:224] = sentinel
            shadow_data[224:228] = sentinel
            # IBL on/off flag as a uint32 at offset 212.
            shadow_data[212:216] = np.array([1 if self._ibl_enabled else 0], dtype=np.uint32).view(np.uint8)
            # bytes 216-219 (_reserved_hdr) intentionally left zero: hdr_output is a
            # per-SRU push constant now, not a shadow-SSBO field.
            shadow_data[336:352] = _DEFAULT_AMBIENT.view(np.uint8)
            # Pluggable ambient tier at offset 352; the indirect
            # hook gates (356/360) stay zero until an SSR / SSGI producer exists.
            shadow_data[352:356] = np.array([self._ambient_mode], dtype=np.uint32).view(np.uint8)
            # Irradiance-volume SH gate at offset 364.
            shadow_data[364:368] = np.array(
                [1 if self._irradiance_volume_enabled else 0], dtype=np.uint32
            ).view(np.uint8)
            # Debug view mode at offset 368; 0 (off) is default.
            shadow_data[368:372] = np.array([self._debug_view], dtype=np.uint32).view(np.uint8)
            self._buffers.write_shadow_data(shadow_data)
        # Render 2D lights to accumulation texture. The submissions come from the
        # packet snapshot in pipelined mode, else the live pass lists.
        l2d_lights, l2d_occluders = self.light2d_submissions()
        if self._light2d_pass and l2d_lights:
            if pool:
                pool.begin(cmd, "light2d")
            self._light2d_pass.render(cmd, self._engine.extent, lights=l2d_lights, occluders=l2d_occluders)
            if pool:
                pool.end(cmd, "light2d")

        # Per-instance geometry runs the shadow passes as before (they internally
        # no-op when no light casts). MultiMesh blocks only TRIGGER the shadow passes
        # when a caster actually exists -- otherwise a MultiMesh-only scene would pay
        # the per-frame shadow-SSBO map/upload for nothing (it has no _instances). The
        # latch keeps them running for one extra frame after the last caster is removed
        # so the shadow SSBO is cleared rather than left stale.
        mm_shadow_now = bool(self._multimesh_draws) and self._has_shadow_casters()
        mm_shadow = mm_shadow_now or self._mm_shadow_active
        self._mm_shadow_active = mm_shadow_now
        if self._shadow_pass and (self._instances or mm_shadow):
            if pool:
                pool.begin(cmd, "shadow")
            self._shadow_renderer.render_shadows(cmd, self._engine.mesh_registry)
            if pool:
                pool.end(cmd, "shadow")
        if self._point_shadow_pass and (self._instances or mm_shadow):
            if pool:
                pool.begin(cmd, "point_shadow")
            self._shadow_renderer.render_point_spot_shadows(cmd, self._engine.mesh_registry)
            if pool:
                pool.end(cmd, "point_shadow")

        # When post-processing is enabled, render 3D scene to HDR target here.
        # Also enter the HDR pass for scenes with no 3D mesh instances but that
        # still have content that renders through render_scene_content, e.g.
        # tilemap-only scenes or skybox + particles, otherwise nothing is
        # ever drawn and the tonemap samples an undefined HDR target.
        pp = self._post_process
        has_scene_content = bool(
            self._instances
            or self._skinned_instances
            or self._multimesh_draws
            or self._shader_material_submissions
            or (self._tilemap_pass and self.tilemap_layers())
            or (self._particle_pass and self._particle_submissions)
            or self._gpu_particle_submissions
            or (self._billboard2d_pass and self._billboard_submissions)
            or (self._skybox_pass and getattr(self._skybox_pass, "enabled", False))
        )
        # Pure-2D bloom: also enter the HDR pass for a 2D-only scene that opted into
        # bloom (WorldEnvironment with bloom_enabled). ``_wants_2d_post`` is only
        # evaluated when ``has_scene_content`` is False (``or`` short-circuits for
        # any 3D scene), so the 3D path and every post-off / no-env scene are
        # untouched. ``_hdr_2d_only`` flags a frame that carried only 2D so the
        # tonemap blit can use linear (no ACES crush of flat art).
        self._hdr_2d_only = not has_scene_content
        if pp and pp.enabled and (has_scene_content or self._wants_2d_post()):
            # Update camera matrices in UBO (needed by motion blur AND fog depth reconstruction)
            viewports = self.viewport_manager.viewports
            if viewports:
                _, vp = viewports[0]
                pp.update_motion_blur_matrices(vp.camera_view, vp.camera_proj)

            # Two-phase GPU Hi-Z occlusion cull. Runs HERE: outside any render pass
            # (compute is illegal inside one) and BEFORE the camera proj is TAA-
            # jittered, so it uses the UNJITTERED proj. All steps are gated on the
            # toggle; when off nothing here is allocated or recorded and the frame
            # is the existing single HDR pass (byte-identical).
            #   1. phase-1 SELECTION (compute): seed the final batch to set A only
            #      (instances visible last frame) and seed vis_next = A.
            #   2. scratch DEPTH PREPASS: draw set A depth-only -> scratch depth.
            #   3. mid-frame HI-Z BUILD: reduce the scratch depth into the pyramid.
            #   4. phase-2 CULL (compute): test set B against the FRESH pyramid,
            #      patch the final batch to A + (B survivors), write vis_next.
            # The colour HDR pass below then draws the full this-frame-visible set,
            # so a newly disoccluded object appears the frame it becomes visible.
            # NOTE on async-compute: the two-phase occlusion + Hi-Z
            # build are deliberately NOT routed onto the dedicated compute queue.
            # They interleave a GRAPHICS draw (the scratch depth-prepass) between
            # the two compute phases: phase-1 (compute) -> depth-prepass
            # (graphics) -> Hi-Z build (compute, reads the scratch depth) ->
            # phase-2 (compute, reads the fresh Hi-Z) -> main draw (graphics,
            # reads the patched indirect buffer). Hoisting only the compute parts
            # to a separate queue would need multiple compute/graphics submits
            # with semaphore round-trips mid-frame, which is a correctness-risky
            # reorder (mirrors leaving IBL on graphics).
            # So this records inline on the graphics ``cmd`` in BOTH modes, with
            # its existing intra-queue producer->consumer barriers intact. A
            # rig-side follow-up may split it across queues if profiling warrants.
            if self._occlusion_culling_enabled and (self._instances or self._multimesh_draws):
                self._two_phase_occlusion(cmd, viewports, pool)

            # TAA: capture the *unjittered* current/previous view-projection (for
            # the future depth-reproject resolve), then jitter ONLY the projection
            # uploaded for the forward draw. ``camera_proj`` is restored after the
            # HDR pass so culling, motion blur, SSAO, and fog keep clean matrices.
            # Gated on taa_enabled so the no-TAA path is byte-for-byte unchanged.
            # NOTE(review): the restore below is not in a ``finally``; if the
            # forward pass raises, ``camera_proj`` stays jittered on the viewport.
            _taa_unjittered_proj = None
            if pp.taa_enabled and viewports:
                from simvx.core.math import apply_jitter

                _, vp = viewports[0]
                pp.update_taa_matrices(vp.camera_view, vp.camera_proj)
                # Jitter in INTERNAL-extent pixels: the forward draw
                # rasterises at the HDR target's size, so the sub-pixel offset is
                # relative to it. Equals the output extent at render_scale 1.0.
                rt = pp.hdr_target
                w, h = (rt.width, rt.height) if rt is not None else self._engine.extent
                jx, jy = pp.next_taa_jitter(w, h)
                _taa_unjittered_proj = vp.camera_proj
                vp.camera_proj = apply_jitter(vp.camera_proj, jx, jy, w, h)

            # hdr_output=1: leave linear HDR for the post-process chain (the
            # fragment shader skips its built-in tone mapping). Supplied as a
            # per-SRU push constant, no shadow-SSBO write.
            if pool:
                pool.begin(cmd, "forward")
            pp.begin_hdr_pass(cmd)
            self._scene_renderer.render_scene_content(cmd, hdr_output=1)
            # Draw the world-space 2D lane INTO the HDR target (same open render
            # pass, after the 3D content) so bloom-extract sees it and tonemap maps
            # it consistently with the 3D scene. The screen-space lane stays on the
            # swapchain after the tonemap blit (see ``render``).
            if self._packet_item_view is not None and self._hdr_bindless_submitter is not None:
                # NOTE(review): unlike the TAA block above, ``rt`` is used here
                # without a None check -- presumably the HDR target always exists
                # once ``begin_hdr_pass`` has run; confirm.
                rt = pp.hdr_target
                win = self._engine._window
                ws = win.get_window_size() if win and hasattr(win, "get_window_size") else None
                uw, uh = (ws[0], ws[1]) if ws else (0, 0)
                # Per-layer post bands are drawn into their OWN HDR chain + composited
                # on the swapchain, so exclude them from the global HDR lane to avoid
                # a double draw. ``None`` (no post band this frame) keeps this submit
                # byte-identical to the plain path.
                self._hdr_bindless_submitter.render(
                    cmd, self._packet_item_view, rt.width, rt.height,
                    ui_width=uw, ui_height=uh, camera=self._packet_item_camera, lane="hdr",
                    exclude_bands=frozenset(self._frame_post_bands) or None,
                )
            pp.end_hdr_pass(cmd)
            if pool:
                pool.end(cmd, "forward")
            # Undo the TAA jitter so everything after the forward draw sees the
            # clean projection again.
            if _taa_unjittered_proj is not None:
                _, vp = viewports[0]
                vp.camera_proj = _taa_unjittered_proj
            self._hdr_rendered = True

            # Per-object velocity pass (TAA only): re-draw opaque instances into
            # an RG16F target using the UNJITTERED cur/prev VP + prev-frame model
            # SSBO. Produces the motion-vector source the resolve will sample (the
            # resolve hookup is a later stage). Lazily created on first TAA frame.
            if pp.taa_enabled:
                if pool:
                    pool.begin(cmd, "velocity")
                self._render_velocity(cmd, pp)
                if pool:
                    pool.end(cmd, "velocity")
            # Run bloom pass (extract + blur) between HDR and tonemap
            if pp.bloom_enabled:
                if pool:
                    pool.begin(cmd, "bloom")
                pp.render_bloom(cmd)
                if pool:
                    pool.end(cmd, "bloom")
            # Run SSAO after HDR pass (needs depth from HDR target). Only flag it
            # ACTIVE for the tonemap when it actually runs this frame: SSAO needs a
            # 3D viewport (depth), so a 2D scene never renders the AO texture. Marking
            # ssao_enabled True regardless made tonemap sample an uninitialised (zero)
            # AO target and multiply every HDR pixel to black -- which silently killed
            # all HDR-routed 2D content (e.g. GPU particles) in cameraless 2D scenes.
            if self._ssao_pass:
                viewports = self.viewport_manager.viewports
                pp.ssao_enabled = self._ssao_pass.enabled and bool(viewports)
                if self._ssao_pass.enabled:
                    if viewports:
                        _, vp = viewports[0]
                        if pool:
                            pool.begin(cmd, "ssao")
                        self._ssao_pass.render(cmd, vp.camera_proj, vp.camera_view)
                        if pool:
                            pool.end(cmd, "ssao")

            # Screen-space reflections: traces the reflected
            # view ray against this frame's depth/gbuffer/colour and writes the
            # b16 indirect-specular hook the NEXT frame's uber samples. Runs after
            # the HDR pass (needs its final colour + depth + gbuffer) and restores
            # the HDR colour so the fog/TAA/tonemap samplers below are unaffected.
            if self._ssr_pass is not None and self._ssr_pass.enabled:
                viewports = self.viewport_manager.viewports
                if viewports:
                    _, vp = viewports[0]
                    if pool:
                        pool.begin(cmd, "ssr")
                    self._ssr_pass.render(cmd, vp.camera_proj, vp.camera_view)
                    if pool:
                        pool.end(cmd, "ssr")

            # Screen-space global illumination: the diffuse
            # sibling of SSR. Casts a hemisphere of screen-space rays against this
            # frame's depth/gbuffer/colour, temporally denoises, and writes the b17
            # indirect-diffuse hook the NEXT frame's uber samples. Same placement
            # rationale as SSR (needs the settled HDR colour + depth + gbuffer;
            # restores the HDR colour for the fog/TAA/tonemap samplers below).
            if self._ssgi_pass is not None and self._ssgi_pass.enabled:
                viewports = self.viewport_manager.viewports
                if viewports:
                    _, vp = viewports[0]
                    if pool:
                        pool.begin(cmd, "ssgi")
                    self._ssgi_pass.render(cmd, vp.camera_proj, vp.camera_view)
                    if pool:
                        pool.end(cmd, "ssgi")

            # Distance/height fog is applied in tonemap.frag. Volumetric fog is
            # a separate ray-march pass (below) that composites in HDR space and
            # mutually excludes the analytic tonemap fog branch.
            vfog = self._volumetric_fog_pass
            self._volumetric_fog_active = False
            if vfog and vfog.enabled:
                viewports = self.viewport_manager.viewports
                if viewports:
                    _, vp = viewports[0]
                    sun_dir, sun_col, sun_int = self._directional_sun()
                    volumes = self._collect_fog_volumes()
                    vfog.set_frame_data(vp.camera_view, vp.camera_proj, sun_dir, sun_col, sun_int, volumes)
                    if pool:
                        pool.begin(cmd, "volumetric_fog")
                    vfog.render(cmd)
                    if pool:
                        pool.end(cmd, "volumetric_fog")
                    # Tonemap samples the fog-composited HDR copy. The descriptor
                    # swap was done in sync_world_environment (before recording)
                    # to avoid updating a bound descriptor set mid-frame.
                    self._volumetric_fog_active = True

            # TAA resolve: runs AFTER forward + volumetric fog, BEFORE tonemap.
            # Reconstructs camera motion from depth + the previous VP, reprojects
            # history, YCoCg-clamps + blends, writes the resolved HDR. The tonemap
            # HDR-input swap (to the TAA output) is decided in sync_world_environment
            # before recording, so we never update a bound descriptor mid-frame.
            # Its current-frame input (binding 0) is the fog output when fog is
            # active, else the raw HDR colour; that swap is also done pre-recording.
            taa = self._taa_pass
            if taa and taa.enabled:
                # The jitter (internal px) lets the TAAU permutation un-jitter
                # its internal-res sample; the same-size resolve ignores it.
                taa.set_frame_matrices(pp.taa_inv_vp, pp.taa_prev_vp, pp.taa_jitter)
                # Hand the resolve the per-object velocity buffer (moving meshes
                # reproject through true velocity; sentinel pixels fall back to
                # the depth-based camera reprojection). None until the velocity
                # pass has run, in which case the resolve uses depth everywhere.
                vpass = self._velocity_pass
                taa.set_velocity_view(vpass.velocity_view if (vpass and vpass.ready) else None)
                if pool:
                    pool.begin(cmd, "taa")
                taa.render(cmd)
                if pool:
                    pool.end(cmd, "taa")
                # The tonemap samples the parity just written (its per-parity
                # sets were wired to the two ping-pong targets in env-sync).
                pp.tonemap_set_index = taa.written_index
            else:
                pp.tonemap_set_index = 0

            # Run custom user post-process effects (after bloom/SSAO, before tonemap).
            # Both branches make the same ``run_custom_post_process`` call; the
            # first only adds the "custom_pp" timestamp scope, and only when a
            # pool exists and the custom pass reports effects.
            if pool and self._custom_pp and getattr(self._custom_pp, "has_effects", False):
                pool.begin(cmd, "custom_pp")
                self._env_sync.run_custom_post_process(cmd, pp)
                pool.end(cmd, "custom_pp")
            else:
                self._env_sync.run_custom_post_process(cmd, pp)



[docs]
    def render(self, cmd: Any) -> None:
        """Record draw commands for all viewports.

        Runs inside the main (swapchain) render pass, after :meth:`pre_render`.
        Does nothing until the renderer is ready. Three things can be recorded:

        * When post-processing is enabled and ``pre_render`` rendered HDR content,
          the tonemap blit of that content.
        * When post-processing is off (or missing), the scene content directly,
          with ``hdr_output=0`` so the fragment shader tone-maps.
        * The 2D item overlay, whenever a draw-2D pass and a published item view
          exist.

        When post-processing is enabled but no HDR content was rendered, neither
        of the first two is recorded, so a stale HDR target is never sampled.

        Args:
            cmd: Command buffer handle the draw commands are recorded into.

        Raises:
            Whatever the underlying passes raise. The temporary linear-tonemap
            override is rolled back even if ``render_tonemap`` raises.
        """
        if not self._ready:
            return

        e = self._engine
        pp = self._post_process
        pool = e.current_timestamp_pool
        post_enabled = bool(pp and pp.enabled)

        # If post-processing rendered HDR content in pre_render, tonemap it now.
        # Skip tonemap when no 3D content was rendered (e.g. editor with only UI nodes)
        # to avoid sampling an undefined/stale HDR render target.
        if post_enabled and self._hdr_rendered:
            if pool:
                pool.begin(cmd, "tonemap")
            # Pure-2D bloom: a frame that carried ONLY 2D into the HDR pass uses
            # linear tonemap (mode 4) so flat 2D art keeps its authored colour
            # (ACES would crush it) while bloom still glows. Mixed 2D+3D keeps the
            # configured tonemap.
            saved_tonemap_mode = pp.tonemap_mode
            if self._hdr_2d_only:
                pp.tonemap_mode = 4
            try:
                pp.render_tonemap(cmd, e.extent[0], e.extent[1])
            finally:
                # Always undo the per-frame override: without this, a failure in
                # the blit would leave the configured tonemap permanently
                # replaced by the linear mode.
                pp.tonemap_mode = saved_tonemap_mode
            if pool:
                pool.end(cmd, "tonemap")
            # No HDR-input restore here: the tonemap reroute (fog / TAA / custom
            # post-process) is resolved once per frame in sync_world_environment,
            # before recording, so the bound descriptor is never rewritten while a
            # frame is in flight.
        elif not post_enabled:
            # Direct rendering to swapchain (no post-processing): tone-map in the
            # fragment shader (hdr_output=0).
            if pool:
                pool.begin(cmd, "forward")
            self._scene_renderer.render_scene_content(cmd, hdr_output=0)
            if pool:
                pool.end(cmd, "forward")

        # 2D overlays always go to swapchain
        # Render 2D drawing overlay: pass window size for UI coordinate conversion
        if self._draw2d_pass and self._packet_item_view is not None:
            # 2D item pipeline: submit the published item view through the
            # BINDLESS co-batched submitter (one unified ui2d pipeline, per-vertex
            # texture_id + is_msdf, so consecutive same-(clip, blend) items
            # co-batch across textures and across sprite+glyph). The camera affine
            # is applied at submit, not baked.
            win = e._window
            # Window size falls back to (0, 0) when there is no window or it
            # cannot report one.
            ws = win.get_window_size() if win and hasattr(win, "get_window_size") else None
            if pool:
                pool.begin(cmd, "draw2d")
            uw, uh = (ws[0], ws[1]) if ws else (0, 0)
            # When the HDR pass ran, the world-space lane was already drawn into
            # the HDR target (and tonemapped); the swapchain only takes the
            # screen-space lane. Otherwise (post off, or HDR pass skipped) the
            # swapchain takes EVERYTHING ("all") so no world 2D is dropped.
            lane = "ldr" if (post_enabled and self._hdr_rendered) else "all"
            submitter = self._ensure_item_submitter()
            if not self._frame_post_bands:
                # Common path (no per-layer post this frame): ONE global submit,
                # byte-identical to the plain path.
                submitter.render(
                    cmd, self._packet_item_view, e.extent[0], e.extent[1],
                    ui_width=uw, ui_height=uh, camera=self._packet_item_camera, lane=lane,
                )
            else:
                # At least one post band drew this frame: interleave plain
                # segments with the per-band composites.
                self._render_2d_with_layer_post(
                    cmd, submitter, e.extent[0], e.extent[1], uw, uh, lane,
                )
            if pool:
                pool.end(cmd, "draw2d")


    def _render_2d_with_layer_post(
        self, cmd: Any, submitter: Any, width: int, height: int, uw: int, uh: int, lane: str,
    ) -> None:
        """Draw the 2D items with per-band post composites slotted in at their z.

        Post bands and plain bands may alternate in z (for example band 0 plain,
        band 1 bloomed, band 2 plain). To keep painter order, the post bands
        recorded for this frame are walked in their stored order, which is
        expected to be ascending. For each one, the plain bands strictly below it
        (and above the previous post band) are submitted first, then that band's
        post chain is composited on top. After the last post band, a final submit
        covers every plain band above it.

        Every submit passes the full set of post bands as ``exclude_bands``, so a
        post band is only ever drawn through its own chain and nothing is drawn
        twice.

        Args:
            cmd: Command buffer handle being recorded.
            submitter: 2D item submitter whose ``render`` accepts band-range
                keyword arguments.
            width: Target width passed to every submit and composite.
            height: Target height passed to every submit and composite.
            uw: UI width forwarded as ``ui_width``.
            uh: UI height forwarded as ``ui_height``.
            lane: Lane name forwarded unchanged to every submit.
        """
        item_view = self._packet_item_view
        item_camera = self._packet_item_camera
        ordered_post_bands = self._frame_post_bands
        excluded = frozenset(ordered_post_bands)

        def draw_plain_segment(**band_range: Any) -> None:
            # One global submit restricted to ``band_range``; post bands skipped.
            submitter.render(
                cmd, item_view, width, height,
                ui_width=uw, ui_height=uh, camera=item_camera, lane=lane,
                exclude_bands=excluded, **band_range,
            )

        # Inclusive lower bound of the plain segment currently being built;
        # ``None`` means "no lower bound" for the first segment.
        segment_start: int | None = None
        for post_band in ordered_post_bands:
            # Plain bands in [segment_start, post_band - 1], then the post result.
            draw_plain_segment(min_band=segment_start, max_band=post_band - 1)
            self._layer_post_chains[post_band].composite(cmd, width, height)
            segment_start = post_band + 1

        # Remaining plain bands: [segment_start, +inf), no upper bound passed.
        draw_plain_segment(min_band=segment_start)


[docs]
    def apply_gbuffer_state(self, active: bool) -> None:
        """Activate/deactivate the thin G-buffer.

        Flipping it changes the HDR pass's colour-attachment count, so the HDR
        target and every pipeline drawing into it must be rebuilt. This is rare
        (a consumer toggling on/off) and idempotent: a no-op when the state
        already matches. Requires the post-process HDR pipeline (the G-buffer
        rides its second attachment); ignored until it exists.

        Mirrors the ``env_sync`` device-wait-idle + rebuild idiom: it only runs
        on a transition, never per frame.
        """
        e = self._engine
        want = bool(active)
        if bool(getattr(e, "gbuffer_active", False)) == want:
            return
        # independentBlend guard: the thin G-buffer's MRT pass masks
        # attachment 1 independently of attachment 0, which requires the
        # independentBlend device feature (requested at device creation only when
        # supported). Without it the per-attachment blend state is invalid, so
        # refuse activation and let the screen-space consumer (SSAO-normals / SSR
        # / SSGI / decals) fall back to its no-G-buffer path. Deactivation is
        # always allowed. Warn once so a capable device stays silent.
        if want:
            caps = getattr(e, "capabilities", None)
            if caps is not None and not caps.independent_blend:
                if not self._gbuffer_guard_warned:
                    log.warning(
                        "Thin G-buffer requested but the device lacks independentBlend; "
                        "screen-space effects (SSAO normals / SSR / SSGI / decals) will use their "
                        "no-G-buffer fallback."
                    )
                    self._gbuffer_guard_warned = True
                return
        pp = self._post_process
        if pp is None or not getattr(pp, "enabled", False) or getattr(pp, "hdr_target", None) is None:
            return
        vk.vkDeviceWaitIdle(e.ctx.device)
        e.gbuffer_active = want
        w, h = e.extent
        # Reuse the resize path: it recreates the HDR target (now reading
        # gbuffer_active, so the second attachment appears/disappears), rebuilds
        # the forward mesh pipelines with the matching attachment count, and
        # re-points SSAO at the new (gbuffer) normal view.
        self.resize(w, h)
        # The forward pipelines + SSAO were rebuilt by resize; now rebuild the
        # other HDR-pass pipelines (skybox/grid/particle/billboard/tilemap/gizmo),
        # which resize leaves alone because they normally rely on render-pass
        # compatibility (broken by the attachment-count change).
        hdr_rp = self._passes.pipeline_render_pass()
        self._passes.rebuild_hdr_pass_pipelines(hdr_rp)
        # The HDR-lane 2D submitter caches a pipeline against the old attachment
        # count; drop it so it lazily rebuilds against the new HDR pass.
        if self._hdr_bindless_pass is not None:
            self._hdr_bindless_pass.set_atlas_slot(-1)
            self._hdr_bindless_pass.cleanup()
            self._hdr_bindless_pass = None
            self._hdr_bindless_submitter = None

        # ShaderMaterial pipelines rebuild themselves on the next draw (the HDR
        # render-pass handle changed, which their cache watches).

    def _bind_indirect_specular(self, view: Any, sampler: Any, *, general: bool) -> None:
        """Write *view* / *sampler* into binding 16 of every SSBO descriptor set.

        Binding 16 is the indirect-specular hook. ``general`` picks the image
        layout recorded in the descriptor: ``True`` uses ``GENERAL`` (for the SSR
        output, a storage image that is sampled directly), ``False`` uses
        ``SHADER_READ_ONLY_OPTIMAL`` (for the shared placeholder image).
        """
        if general:
            image_layout = vk.VK_IMAGE_LAYOUT_GENERAL
        else:
            image_layout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
        device = self._engine.ctx.device
        # One batch covers all descriptor sets held by the buffer manager.
        with DescriptorWriteBatch(device) as batch:
            for descriptor_set in self._buffers.ssbo_sets:
                batch.image(descriptor_set, 16, view, sampler, image_layout=image_layout)


[docs]
    def apply_ssr_state(
        self, active: bool, *, intensity: float = 1.0, max_distance: float = 40.0, roughness_cutoff: float = 0.6,
    ) -> None:
        """Activate/deactivate screen-space reflections.

        SSR requires the post-process HDR path AND the thin G-buffer (env-sync
        turns the G-buffer on for SSR before calling this). When it cannot run,
        or is disabled, b16 falls back to the placeholder and the uber's
        ``indirect_specular_enabled`` gate goes 0: byte-identical to feature-off.
        Idempotent: the descriptor rebind + flag flip only happen on a transition.
        """
        pp = self._post_process
        can_run = bool(
            active
            and pp is not None
            and getattr(pp, "enabled", False)
            and getattr(pp, "hdr_target", None) is not None
            and bool(getattr(self._engine, "gbuffer_active", False))
        )
        if can_run:
            rt = pp.hdr_target
            if self._ssr_pass is None:
                vk.vkDeviceWaitIdle(self._engine.ctx.device)
                self._ssr_pass = SSRPass(self._engine)
                self._ssr_pass.setup(
                    rt.width, rt.height, rt.colour_image, rt.depth_view, rt.depth_image,
                    rt.gbuffer_view, rt.gbuffer_image,
                )
            p = self._ssr_pass
            p.enabled = True
            p.intensity = float(intensity)
            p.max_distance = float(max_distance)
            p.roughness_cutoff = float(roughness_cutoff)
            if not self._indirect_specular_enabled:
                self._bind_indirect_specular(p.output_view, p.output_sampler, general=True)
                self._indirect_specular_enabled = True
        else:
            if self._ssr_pass is not None:
                self._ssr_pass.enabled = False
            if self._indirect_specular_enabled:
                self._bind_indirect_specular(
                    self._buffers.placeholder_lut_view, self._buffers.placeholder_cubemap_sampler, general=False,
                )
                self._indirect_specular_enabled = False


    def _bind_indirect_diffuse(self, view: Any, sampler: Any, *, general: bool) -> None:
        """Write *view* / *sampler* into binding 17 of every SSBO descriptor set.

        Binding 17 is the indirect-diffuse hook. ``general`` picks the image
        layout recorded in the descriptor: ``True`` uses ``GENERAL`` (for the SSGI
        output, a storage image that is sampled directly), ``False`` uses
        ``SHADER_READ_ONLY_OPTIMAL`` (for the shared placeholder image).
        """
        if general:
            image_layout = vk.VK_IMAGE_LAYOUT_GENERAL
        else:
            image_layout = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
        device = self._engine.ctx.device
        # One batch covers all descriptor sets held by the buffer manager.
        with DescriptorWriteBatch(device) as batch:
            for descriptor_set in self._buffers.ssbo_sets:
                batch.image(descriptor_set, 17, view, sampler, image_layout=image_layout)


[docs]
    def apply_ssgi_state(
        self, active: bool, *, intensity: float = 1.0, max_distance: float = 8.0,
    ) -> None:
        """Activate/deactivate screen-space global illumination.

        The diffuse mirror of :meth:`apply_ssr_state`. SSGI requires the
        post-process HDR path AND the thin G-buffer (env-sync turns the G-buffer
        on for SSGI before calling this). When it cannot run, or is disabled, b17
        falls back to the placeholder and the uber's ``indirect_diffuse_enabled``
        gate goes 0: byte-identical to feature-off. Idempotent: the descriptor
        rebind + flag flip only happen on a transition.
        """
        pp = self._post_process
        can_run = bool(
            active
            and pp is not None
            and getattr(pp, "enabled", False)
            and getattr(pp, "hdr_target", None) is not None
            and bool(getattr(self._engine, "gbuffer_active", False))
        )
        if can_run:
            rt = pp.hdr_target
            if self._ssgi_pass is None:
                vk.vkDeviceWaitIdle(self._engine.ctx.device)
                self._ssgi_pass = SSGIPass(self._engine)
                self._ssgi_pass.setup(
                    rt.width, rt.height, rt.colour_image, rt.depth_view, rt.depth_image,
                    rt.gbuffer_view, rt.gbuffer_image,
                )
            p = self._ssgi_pass
            p.enabled = True
            p.intensity = float(intensity)
            p.max_distance = float(max_distance)
            if not self._indirect_diffuse_enabled:
                self._bind_indirect_diffuse(p.output_view, p.output_sampler, general=True)
                self._indirect_diffuse_enabled = True
        else:
            if self._ssgi_pass is not None:
                self._ssgi_pass.enabled = False
            if self._indirect_diffuse_enabled:
                self._bind_indirect_diffuse(
                    self._buffers.placeholder_lut_view, self._buffers.placeholder_cubemap_sampler, general=False,
                )
                self._indirect_diffuse_enabled = False



[docs]
    def apply_probe_quality(self) -> None:
        """Apply the WorldEnvironment reflection-probe quality dials.

        ``_probe_blend_count`` / ``_probe_face_size`` are written by the
        env-sync spec each frame; ``None`` keeps the built-in defaults (top-2
        blend, 128 px faces), i.e. the exact pre-knob paths. Both downstream
        setters are transition-gated no-ops when the effective value is
        unchanged, so this per-frame call is free in steady state. A blend
        count of 0 leaves the shader permutation alone: the capture gate in
        :meth:`capture_reflection_probes` keeps the probe set empty, which
        makes the blend loop a branch-free no-op.
        """
        blend = self._probe_blend_count
        if blend is not None and int(blend) >= 1:
            self._pipelines.set_probe_blend_count(int(blend))
        elif blend is None:
            self._pipelines.set_probe_blend_count(2)
        if self._reflection_probe_pass is not None:
            self._reflection_probe_pass.set_face_size(self._probe_face_size)


    @property
    def render_scale(self) -> float:
        """Resolution scale of the main-view HDR chain.

        ``internal_extent = ceil(output_extent * render_scale)`` sizes the whole
        HDR chain (colour + depth, SSAO, bloom inputs, volumetric fog,
        velocity); the tonemap fullscreen draw bilinearly upscales to the
        swapchain. With TAA enabled the resolve upgrades that upscale to TAAU:
        its output + history live at the OUTPUT extent and accumulate
        sub-pixel detail from the jittered internal-res frames. 1.0 (default)
        is byte-identical to the unscaled path. Driven by
        ``WorldEnvironment.render_scale`` via the env-sync spec; an actual
        change drains the device and rebuilds the chain (the same rare
        transition as a window resize), so per-frame re-assignment of the same
        value is free.
        """
        return self._render_scale


[docs]
    @render_scale.setter
    def render_scale(self, value: float) -> None:
        value = min(4.0, max(0.05, float(value)))
        if value == self._render_scale:
            return
        self._render_scale = value
        if not self._ready:
            return
        e = self._engine
        vk.vkDeviceWaitIdle(e.ctx.device)
        self.resize(*e.extent)



[docs]
    def internal_extent(self, width: int | None = None, height: int | None = None) -> tuple[int, int]:
        """HDR-chain render extent: ``ceil(output * render_scale)``, min 1 px.

        Defaults to the engine's current output extent. Identity when
        ``render_scale`` is 1.0 or post-processing is off (no HDR chain means no
        upscale point, so the scale is inert and shaders must not be told
        otherwise).
        """
        if width is None or height is None:
            width, height = self._engine.extent
        s = self._render_scale
        pp = self._post_process
        if s == 1.0 or pp is None or not pp.enabled:
            return int(width), int(height)
        return max(1, math.ceil(width * s)), max(1, math.ceil(height * s))


    def _main_view_scale(self) -> tuple[float, float]:
        """Exact internal/output extent ratio of the main HDR chain.

        (1.0, 1.0) whenever the chain is unscaled: the early return keeps the
        default path free of the HDR-target division and guarantees the literal
        identity ratio (viewport rects scale to exactly their original values).
        """
        if self._render_scale == 1.0:
            return (1.0, 1.0)
        pp = self._post_process
        rt = pp.hdr_target if (pp is not None and pp.enabled) else None
        if rt is None:
            return (1.0, 1.0)
        w, h = self._engine.extent
        if not w or not h:
            return (1.0, 1.0)
        return (rt.width / w, rt.height / h)


[docs]
    def resize(self, width: int, height: int) -> None:
        """Handle framebuffer resize: recreate post-process targets + 3D pipelines.

        ``width`` / ``height`` are the OUTPUT (swapchain) extent; the HDR chain
        and every pass that renders in HDR space is (re)created at the internal
        extent. Identical to the output extent at render_scale 1.0.
        """
        if not self._ready:
            return
        iw, ih = self.internal_extent(width, height)
        # Resize post-process first so HDR render-pass exists when pipelines rebuild.
        # The output extent rides along for TAA: when it differs from the internal
        # extent, the TAA resolve doubles as the upscaler (TAAU) and
        # sizes its output + history targets at the swapchain extent.
        self._passes.resize(iw, ih, output_extent=(int(width), int(height)))
        self._pipelines.rebuild_for_resize(iw, ih, self._passes.pipeline_render_pass())
        if self._velocity_pass is not None:
            self._velocity_pass.resize(iw, ih)
        # Screen-space reflections: re-point at the rebuilt HDR chain /
        # G-buffer (extent change, render-scale change, or G-buffer toggle).
        if self._ssr_pass is not None:
            pp = self._post_process
            rt = pp.hdr_target if (pp is not None and pp.enabled) else None
            if rt is not None and rt.gbuffer_view is not None:
                self._ssr_pass.resize(
                    rt.width, rt.height, rt.colour_image, rt.depth_view, rt.depth_image,
                    rt.gbuffer_view, rt.gbuffer_image,
                )
        # SSGI: same re-point at the rebuilt HDR chain / G-buffer.
        if self._ssgi_pass is not None:
            pp = self._post_process
            rt = pp.hdr_target if (pp is not None and pp.enabled) else None
            if rt is not None and rt.gbuffer_view is not None:
                self._ssgi_pass.resize(
                    rt.width, rt.height, rt.colour_image, rt.depth_view, rt.depth_image,
                    rt.gbuffer_view, rt.gbuffer_image,
                )
        # Two-phase occlusion: recreate the scratch depth target and re-point the
        # Hi-Z pyramid at it. The pass orchestrator's resize already recreated the
        # Hi-Z image at the new extent (but against the OLD depth view), so re-run
        # setup's retarget here. Force the cull to skip until the pyramid is rebuilt
        # and refill visibility so nothing is dropped on the next frame.
        if self._depth_prepass is not None:
            self._depth_prepass.resize(iw, ih)
            if self._passes.hiz_pass is not None:
                self._passes.hiz_pass.resize(
                    iw,
                    ih,
                    self._depth_prepass.depth_view,
                    self._depth_prepass.depth_image,
                )
        self._hiz_built_once = False
        self._occ_structure_version = -2
        # Per-layer post chains are sized to the old extent; drop them so they
        # rebuild lazily at the new extent on the next armed frame.
        for chain in self._layer_post_chains.values():
            chain.cleanup()
        self._layer_post_chains.clear()



[docs]
    def cleanup(self) -> None:
        """Release all GPU resources."""
        if not self._ready:
            return
        device = self._engine.ctx.device

        # IBL precompute maps (retained from the last set_skybox).
        if self._ibl_pass is not None:
            self._ibl_pass.cleanup()
            self._ibl_pass = None

        # Reflection-probe pass (shared cubemap arrays + capture scratch).
        if self._reflection_probe_pass is not None:
            self._reflection_probe_pass.cleanup()
            self._reflection_probe_pass = None

        # Irradiance-volume pass (SH-reduce compute + capture scratch).
        if self._irradiance_volume_pass is not None:
            self._irradiance_volume_pass.cleanup()
            self._irradiance_volume_pass = None

        # Custom ShaderMaterial pipelines + shared camera/transforms buffers.
        if self._shader_material_manager is not None:
            self._shader_material_manager.cleanup(device)
            self._shader_material_manager = None

        # Bindless co-batched 2D pass (lazily created by the item path).
        if self._bindless_pass is not None:
            self._bindless_pass.cleanup()
            self._bindless_pass = None
            self._bindless_submitter = None
        # HDR-lane 2D pass (shares the atlas slot, so cleanup must NOT unregister
        # it -- the swapchain pass above owns the registration; mirror the slot to
        # -1 first so its cleanup() skips the unregister).
        if self._hdr_bindless_pass is not None:
            self._hdr_bindless_pass.set_atlas_slot(-1)
            self._hdr_bindless_pass.cleanup()
            self._hdr_bindless_pass = None
            self._hdr_bindless_submitter = None

        # Per-CanvasLayer post chains (only exist if a band opted in + drew).
        for chain in self._layer_post_chains.values():
            chain.cleanup()
        self._layer_post_chains.clear()

        # Scene colour/depth copy targets (fallback + lazy full-size textures).
        if self._scene_copy is not None:
            self._scene_copy.destroy()
            self._scene_copy = None

        # All render passes + skybox cubemap (orchestrator owns lifecycle).
        self._passes.cleanup()

        # Velocity pass + its dedicated indirect batch (only exist if TAA ran).
        if self._velocity_pass is not None:
            self._velocity_pass.cleanup()
            self._velocity_pass = None

        # SSR pass (only exists if SSR was ever active).
        if self._ssr_pass is not None:
            self._ssr_pass.cleanup()
            self._ssr_pass = None

        # SSGI pass (only exists if SSGI was ever active).
        if self._ssgi_pass is not None:
            self._ssgi_pass.cleanup()
            self._ssgi_pass = None

        # Two-phase occlusion machinery (only exists if the toggle was ever on).
        if self._occlusion_pass is not None:
            self._occlusion_pass.cleanup()
            self._occlusion_pass = None
        if self._depth_prepass is not None:
            self._depth_prepass.cleanup()
            self._depth_prepass = None
        for buf, mem in self._vis_buffers:
            if buf:
                vk.vkDestroyBuffer(device, buf, None)
            if mem:
                vk.vkFreeMemory(device, mem, None)
        self._vis_buffers = []

        # Batch objects. ``_multimesh_batch`` MUST be included: it owns an indirect
        # buffer + memory (created in setup) and was previously omitted here,
        # leaking a (VkDeviceMemory, VkBuffer) pair on every vkDestroyDevice for
        # any scene using MultiMesh.
        for batch in (
            self._batch,
            self._transparent_batch,
            self._velocity_batch,
            self._occ_batch_opaque,
            self._occ_batch_double,
            self._multimesh_batch,
        ):
            if batch:
                batch.destroy()

        # Debug line pipeline (lazily created by OverlayRenderer, not owned by PipelineManager)
        if self._debug_pipeline:
            vk.vkDestroyPipeline(device, self._debug_pipeline, None)
        if self._debug_pipeline_layout:
            vk.vkDestroyPipelineLayout(device, self._debug_pipeline_layout, None)

        # 3D pipelines + shader modules
        self._pipelines.cleanup()

        # Debug shader modules (lazily created)
        for mod in (self._debug_vert_module, self._debug_frag_module):
            if mod:
                vk.vkDestroyShaderModule(device, mod, None)

        # Debug vertex buffer (lazily created by OverlayRenderer)
        if self._debug_vb:
            vk.vkDestroyBuffer(device, self._debug_vb, None)
        if self._debug_vb_mem:
            vk.vkFreeMemory(device, self._debug_vb_mem, None)

        # SSBOs + descriptor sets + placeholder cubemap
        self._buffers.cleanup()

        self._ready = False



[docs]
    def destroy(self) -> None:
        """ABC destroy: delegates to cleanup."""
        self.cleanup()


    # -- Resource management (delegate to engine) --


[docs]
    def register_mesh(self, streams: VertexStreams, indices: np.ndarray) -> MeshHandle:
        """Register a mesh's vertex streams on GPU via engine's mesh registry."""
        return self._engine.mesh_registry.register(streams, indices)



[docs]
    def upload_texture_pixels(self, pixels: np.ndarray, width: int, height: int) -> int:
        """Upload RGBA pixel data to GPU, return bindless texture index."""
        return self._engine.upload_texture_pixels(pixels, width, height)


    # -- Frame capture --


[docs]
    def capture_frame(self) -> np.ndarray:
        """Capture the last rendered frame as (H, W, 4) uint8 RGBA numpy array."""
        return self._engine.capture_frame()