Source code for simvx.graphics.renderer.light2d_pass

"""2D light accumulation pass: renders Light2D nodes as additive radial gradients.

Renders each light as a screen-space quad with radial falloff. The accumulated
light texture is provided to Draw2DPass for final compositing (multiply blend).

Shadow casting (desktop, POINT + DIRECTIONAL lights) follows the FROZEN Phase-0
spec (``shadow_ref.py``). Three GPU stages, run only when a frame has at least
one shadow-casting light AND at least one occluder:

  S1  rasterise occluder triangles into a screen-sized R8 coverage mask
      (``light2d_occluder.*``).
  S2  build a 1D shadow atlas (R16F, W columns x MAX_SHADOW_LIGHTS rows): each
      shadow light ray-marches the occ mask into its own row
      (``light2d_shadowmap.*``, one draw per light into a 1-px viewport strip).
  S3  the disk fragment shader (``light2d.frag``) PCF-samples its atlas row and
      folds visibility into the lit colour.

When no light casts a shadow the atlas/mask are never allocated and every light
pushes ``shadow_row = -1`` -> byte-identical to the pre-shadow path.
"""

import logging
import struct
from typing import Any

import numpy as np
import vulkan as vk

from simvx.core.light2d import DEFAULT_SHADOW_COLOUR

from ..gpu.memory import create_buffer, create_image, upload_image_data, upload_numpy
from ..gpu.pipeline import PipelineSpec, build_pipeline, create_shader_module
from ..gpu.pipeline_cache import pipeline_cache_for
from ..materials.shader_compiler import compile_shader
from ..types import SHADER_DIR
from .passes import create_offscreen_pass

__all__ = ["Light2DPass"]

log = logging.getLogger(__name__)

# Push-constant block for the disk pass. The layout must match the LightPush
# struct in light2d.vert / light2d.frag (std430). Fields, in order:
#   vec2  screen_size       8
#   vec2  light_pos         8
#   float range             4
#   float falloff           4
#   float energy            4
#   float inner_radius      4
#   vec4  light_colour     16
#   float shadow_row        4
#   float shadow_strength   4
#   float shadow_softness   4
#   float light_type        4
#   vec4  shadow_tint      16
#   vec2  light_dir         8
#                         = 88 bytes
# light_colour.a doubles as the has-gradient flag (0 = analytic, 1 = LUT).
# shadow_row < 0 disables shadowing for that light (byte-identical fast path).
# light_type (0 = point, 1 = directional) + light_dir drive the directional branch.
PUSH_SIZE = 88

# S1 occluder push: vec2 screen_size = 8 bytes.
OCC_PUSH_SIZE = 8
# S2 shadow-map build push: vec2 screen_size (8) + vec2 light_pos (8)
# + vec2 direction (8) + float range (4) + float light_type (4) = 32 bytes.
SM_PUSH_SIZE = 32

# Shadow atlas geometry (FROZEN spec: shadow_ref.py). The atlas is
# SHADOW_MAP_RESOLUTION columns wide with one row per shadow-casting light, up
# to MAX_SHADOW_LIGHTS rows. RAYMARCH_STEPS / N_PCF live in the GLSL as
# constants; the values below are the Python-side copies.
# NOTE(review): presumably these two must be kept in sync with the shaders by
# hand - confirm nothing in this module feeds them to the GPU.
SHADOW_MAP_RESOLUTION = 256
MAX_SHADOW_LIGHTS = 16
RAYMARCH_STEPS = 256
N_PCF = 5

# Max distinct gradient LUTs bound per frame (one descriptor set each). Lights
# beyond this fall back to the analytic curve rather than crash.
MAX_GRADIENT_SETS = 32


[docs]
class Light2DPass:
    """GPU pass that renders 2D lights to an offscreen accumulation texture.

    Usage from the forward renderer:
        1. ``begin_frame()``: clears per-frame submission lists
        2. ``submit_light(...)`` / ``submit_occluder(...)``: queue data
        3. ``render(cmd, extent)``: render all lights to accumulation RT
        4. ``get_light_texture_view()``: returns the image view for compositing
    """

    def __init__(self, engine: Any):
        self._engine = engine
        # Pipeline resources
        self._pipeline: Any = None
        self._pipeline_layout: Any = None
        self._vert_module: Any = None
        self._frag_module: Any = None
        # Offscreen render target
        self._rt_image: Any = None
        self._rt_memory: Any = None
        self._rt_view: Any = None
        self._rt_render_pass: Any = None
        self._rt_framebuffer: Any = None
        self._rt_sampler: Any = None
        self._rt_width = 0
        self._rt_height = 0
        # Per-frame submissions
        self._lights: list[dict] = []
        self._occluders: list[list[tuple[float, float]]] = []
        # Flat 2D ambient floor (rgb added to lit fragments by Draw2DPass).
        # Synced from WorldEnvironment.ambient_light_2d via env_sync; defaults
        # to the dark-neutral floor scenes had before that field existed.
        self.ambient_light_2d: tuple[float, float, float, float] = (0.2, 0.2, 0.2, 1.0)
        # --- Shadow (S1/S2/S3) GPU state -----------------------------------
        # Persistent (setup) infra: shaders, pipelines, render passes, samplers,
        # the set-1 descriptor layout/pool and a 1x1 dummy atlas bound when no
        # light casts a shadow. Lazy (per-need) resources: the screen-sized occ
        # mask, the R16F atlas, their framebuffers, and the occluder VB - only
        # allocated when a frame actually has a shadow light + an occluder.
        self._occ_vert_module: Any = None
        self._occ_frag_module: Any = None
        self._sm_vert_module: Any = None
        self._sm_frag_module: Any = None
        self._occ_pipeline: Any = None
        self._occ_pipeline_layout: Any = None
        self._sm_pipeline: Any = None
        self._sm_pipeline_layout: Any = None
        self._occ_render_pass: Any = None      # R8 coverage pass
        self._atlas_render_pass: Any = None    # R16F atlas pass
        self._occ_sampler: Any = None          # nearest-clamp (S2 reads mask)
        self._atlas_sampler: Any = None        # linear, wrap-u/clamp-v (S3 point)
        self._atlas_clamp_sampler: Any = None  # linear, clamp-u/clamp-v (S3 directional)
        self._shadow_set_layout: Any = None    # set 1 (2 combined image samplers: wrap + clamp)
        self._occ_set_layout: Any = None       # S2 set 0 (single combined image sampler)
        self._shadow_pool: Any = None
        self._shadow_dummy_image: Any = None
        self._shadow_dummy_memory: Any = None
        self._shadow_dummy_view: Any = None
        self._shadow_dummy_set: Any = None     # bound when no shadow this frame
        self._shadow_atlas_set: Any = None     # points at the live atlas view (set 1)
        self._occ_set: Any = None              # points at the occ mask (S2 set 0)
        # Lazy occ mask (R8, screen-sized).
        self._occ_image: Any = None
        self._occ_memory: Any = None
        self._occ_view: Any = None
        self._occ_framebuffer: Any = None
        self._occ_w = 0
        self._occ_h = 0
        # Lazy shadow atlas (R16F, W x MAX_SHADOW_LIGHTS; screen-independent).
        self._atlas_image: Any = None
        self._atlas_memory: Any = None
        self._atlas_view: Any = None
        self._atlas_framebuffer: Any = None
        # Grow-on-demand occluder vertex buffer (host-visible vec2 triangles).
        self._occ_vb: Any = None
        self._occ_vb_memory: Any = None
        self._occ_vb_capacity = 0
        # Gradient cookie LUTs: a combined-image-sampler descriptor set (set 0,
        # binding 0) is bound before every light. A 1x1 white dummy serves
        # analytic lights; gradient lights upload a 1-row R8 LUT and update one
        # of MAX_GRADIENT_SETS descriptor sets, cached by LUT identity so an
        # unchanging gradient uploads once.
        self._grad_set_layout: Any = None
        self._grad_pool: Any = None
        self._grad_sampler: Any = None
        self._grad_dummy_image: Any = None
        self._grad_dummy_memory: Any = None
        self._grad_dummy_view: Any = None
        self._grad_dummy_set: Any = None
        # Pre-allocated gradient descriptor sets (reused every frame).
        self._grad_sets: list[Any] = []
        self._grad_next_set = 0   # cursor into _grad_sets for the current frame
        # Per-frame: content-key -> descriptor set (dedupes identical LUTs in a
        # frame). ``_grad_live`` holds this frame's uploaded (image, memory,
        # view) for destruction once the frame's GPU work has retired.
        self._grad_frame_cache: dict[bytes, Any] = {}
        self._grad_live: list[tuple[Any, Any, Any]] = []
        self._grad_retiring: list[list[tuple[Any, Any, Any]]] = []
        self._ready = False


[docs]
    def setup(self) -> None:
        """Create GPU resources: shaders, render target, descriptors, pipeline.

        The creation order is significant: the gradient (set 0) and shadow
        (set 1) descriptor-set layouts have to exist before the disk pipeline
        is built, because its layout is created from both of them.
        """
        engine = self._engine
        ctx = engine.ctx
        device, phys = ctx.device, ctx.physical_device
        width, height = engine.extent

        # Compile both stages first, then wrap the results in shader modules.
        vert_spv, frag_spv = (
            compile_shader(SHADER_DIR / stage) for stage in ("light2d.vert", "light2d.frag")
        )
        self._vert_module = create_shader_module(device, vert_spv)
        self._frag_module = create_shader_module(device, frag_spv)

        # Offscreen target the lights accumulate into.
        self._create_render_target(device, phys, width, height)

        # Descriptor infrastructure for the gradient cookie (set 0) and the
        # shadow atlas (set 1), in that order.
        self._setup_gradient_resources(device, phys, engine)
        self._setup_shadow_resources(device, phys, engine)

        # Disk pipeline: set 0 = gradient cookie, set 1 = shadow atlas.
        self._pipeline, self._pipeline_layout = _create_light2d_pipeline(
            device,
            self._vert_module,
            self._frag_module,
            self._rt_render_pass,
            (width, height),
            self._grad_set_layout,
            self._shadow_set_layout,
        )

        # Linear, edge-clamped sampler used to read the accumulation texture.
        clamp = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE
        self._rt_sampler = vk.vkCreateSampler(device, vk.VkSamplerCreateInfo(
            magFilter=vk.VK_FILTER_LINEAR,
            minFilter=vk.VK_FILTER_LINEAR,
            addressModeU=clamp,
            addressModeV=clamp,
            addressModeW=clamp,
            anisotropyEnable=vk.VK_FALSE,
            unnormalizedCoordinates=vk.VK_FALSE,
            compareEnable=vk.VK_FALSE,
            mipmapMode=vk.VK_SAMPLER_MIPMAP_MODE_LINEAR,
        ), None)

        # Seed the target's layout as SHADER_READ_ONLY_OPTIMAL so Draw2DPass can
        # sample it on the very first frame, before any light has been drawn.
        # Later frames rely on the render pass to move the image to the
        # attachment layout on begin and back to shader-read on end.
        from ..gpu.memory import transition_image_layout
        transition_image_layout(
            device, ctx.graphics_queue, ctx.command_pool,
            self._rt_image,
            vk.VK_IMAGE_LAYOUT_UNDEFINED,
            vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        )

        self._ready = True
        log.debug("Light2DPass setup complete (%dx%d)", width, height)


    def _setup_gradient_resources(self, device: Any, phys: Any, e: Any) -> None:
        """Build the gradient-cookie descriptor infrastructure.

        Creates the set layout (one fragment-stage combined image sampler at
        binding 0), a pool holding one dummy set plus ``MAX_GRADIENT_SETS``
        gradient sets, the linear clamp sampler, and the 1x1 white dummy LUT
        that analytic (non-gradient) lights bind.
        """
        sampled_image = vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER
        clamp = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE

        # Layout: a single combined image sampler, fragment stage only.
        lut_binding = vk.VkDescriptorSetLayoutBinding(
            binding=0,
            descriptorType=sampled_image,
            descriptorCount=1,
            stageFlags=vk.VK_SHADER_STAGE_FRAGMENT_BIT,
        )
        layout_info = vk.VkDescriptorSetLayoutCreateInfo(bindingCount=1, pBindings=[lut_binding])
        self._grad_set_layout = vk.vkCreateDescriptorSetLayout(device, layout_info, None)

        # Pool sized for the dummy set plus every gradient set; all are
        # allocated up front and the first one becomes the dummy.
        set_count = MAX_GRADIENT_SETS + 1
        pool_info = vk.VkDescriptorPoolCreateInfo(
            maxSets=set_count,
            poolSizeCount=1,
            pPoolSizes=[vk.VkDescriptorPoolSize(type=sampled_image, descriptorCount=set_count)],
        )
        self._grad_pool = vk.vkCreateDescriptorPool(device, pool_info, None)
        alloc_info = vk.VkDescriptorSetAllocateInfo(
            descriptorPool=self._grad_pool,
            descriptorSetCount=set_count,
            pSetLayouts=[self._grad_set_layout] * set_count,
        )
        allocated = list(vk.vkAllocateDescriptorSets(device, alloc_info))
        self._grad_dummy_set, self._grad_sets = allocated[0], allocated[1:]

        # Linear filtering so neighbouring LUT entries interpolate; clamped on
        # every axis.
        self._grad_sampler = vk.vkCreateSampler(device, vk.VkSamplerCreateInfo(
            magFilter=vk.VK_FILTER_LINEAR, minFilter=vk.VK_FILTER_LINEAR,
            addressModeU=clamp, addressModeV=clamp, addressModeW=clamp,
            anisotropyEnable=vk.VK_FALSE, unnormalizedCoordinates=vk.VK_FALSE,
            compareEnable=vk.VK_FALSE, mipmapMode=vk.VK_SAMPLER_MIPMAP_MODE_LINEAR,
        ), None)

        # 1x1 white R8 texel for analytic lights. The shader's has_gradient
        # flag is meant to skip the sample, so the stored value is irrelevant.
        white_texel = np.array([[255]], dtype=np.uint8)
        image, memory = upload_image_data(
            device, phys, e.ctx.graphics_queue, e.ctx.command_pool,
            white_texel, 1, 1, fmt=vk.VK_FORMAT_R8_UNORM,
        )
        self._grad_dummy_image, self._grad_dummy_memory = image, memory
        self._grad_dummy_view = _r8_view(device, self._grad_dummy_image)
        self._write_grad_set(device, self._grad_dummy_set, self._grad_dummy_view)

    def _write_grad_set(self, device: Any, dset: Any, view: Any) -> None:
        """Update binding 0 of ``dset`` to sample ``view`` through the gradient
        sampler, with the image expected in SHADER_READ_ONLY_OPTIMAL layout."""
        write = vk.VkWriteDescriptorSet(
            dstSet=dset,
            dstBinding=0,
            dstArrayElement=0,
            descriptorCount=1,
            descriptorType=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
            pImageInfo=[vk.VkDescriptorImageInfo(
                sampler=self._grad_sampler,
                imageView=view,
                imageLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
            )],
        )
        vk.vkUpdateDescriptorSets(device, 1, [write], 0, None)

    # ---- Shadow (S1/S2/S3) --------------------------------------------------
    def _setup_shadow_resources(self, device: Any, phys: Any, e: Any) -> None:
        """Build the shadow infrastructure that lives as long as the pass.

        That is: the four S1/S2 shader modules, the R8 and R16F offscreen
        render passes, three samplers, the set-1 (atlas) and S2 set-0 (occ
        mask) descriptor layouts with their pool and sets, a 1x1 dummy atlas,
        and the occluder / shadow-map pipelines.

        Screen-sized or per-frame resources (occ mask, atlas, framebuffers, the
        occluder VB) are NOT created here; :meth:`_ensure_shadow_targets`
        allocates them the first time a frame actually casts a shadow.
        """
        sampled_image = vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER
        clamp = vk.VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE

        def load_module(shader_name):
            # Compile one shader file and wrap it in a shader module.
            return create_shader_module(device, compile_shader(SHADER_DIR / shader_name))

        def make_sampler(texel_filter, address_u, mip_mode):
            # Same filter for mag/min; only the u axis address mode varies.
            return vk.vkCreateSampler(device, vk.VkSamplerCreateInfo(
                magFilter=texel_filter, minFilter=texel_filter,
                addressModeU=address_u,
                addressModeV=clamp,
                addressModeW=clamp,
                anisotropyEnable=vk.VK_FALSE, unnormalizedCoordinates=vk.VK_FALSE,
                compareEnable=vk.VK_FALSE, mipmapMode=mip_mode,
            ), None)

        def fragment_sampler_binding(slot):
            # One fragment-stage combined image sampler at binding ``slot``.
            return vk.VkDescriptorSetLayoutBinding(
                binding=slot, descriptorType=sampled_image,
                descriptorCount=1, stageFlags=vk.VK_SHADER_STAGE_FRAGMENT_BIT)

        # Shader modules for S1 (occluder) and S2 (shadow-map build).
        self._occ_vert_module = load_module("light2d_occluder.vert")
        self._occ_frag_module = load_module("light2d_occluder.frag")
        self._sm_vert_module = load_module("light2d_shadowmap.vert")
        self._sm_frag_module = load_module("light2d_shadowmap.frag")

        # Colour-only offscreen passes: R8 coverage mask and R16F atlas.
        self._occ_render_pass = create_offscreen_pass(device, vk.VK_FORMAT_R8_UNORM, depth_format=0)
        self._atlas_render_pass = create_offscreen_pass(device, vk.VK_FORMAT_R16_SFLOAT, depth_format=0)

        # S2 reads the occ mask with nearest + clamp so coverage stays binary.
        self._occ_sampler = make_sampler(
            vk.VK_FILTER_NEAREST, clamp, vk.VK_SAMPLER_MIPMAP_MODE_NEAREST)
        # S3 reads the atlas with linear filtering and a clamped row (v) axis.
        # The column (u) axis depends on light type per the frozen spec: point
        # rows repeat (angular wrap), directional rows clamp. Both samplers
        # view the SAME atlas image; the disk shader picks by light_type
        # (set 1 binding 0 = wrap, binding 1 = clamp).
        self._atlas_sampler = make_sampler(
            vk.VK_FILTER_LINEAR, vk.VK_SAMPLER_ADDRESS_MODE_REPEAT, vk.VK_SAMPLER_MIPMAP_MODE_LINEAR)
        self._atlas_clamp_sampler = make_sampler(
            vk.VK_FILTER_LINEAR, clamp, vk.VK_SAMPLER_MIPMAP_MODE_LINEAR)

        # Set 1 (disk pass): binding 0 = wrap-u, binding 1 = clamp-u.
        self._shadow_set_layout = vk.vkCreateDescriptorSetLayout(
            device,
            vk.VkDescriptorSetLayoutCreateInfo(bindingCount=2, pBindings=[
                fragment_sampler_binding(0),
                fragment_sampler_binding(1),
            ]),
            None,
        )
        # S2 set 0: just the occ mask.
        self._occ_set_layout = vk.vkCreateDescriptorSetLayout(
            device,
            vk.VkDescriptorSetLayoutCreateInfo(bindingCount=1, pBindings=[
                fragment_sampler_binding(0),
            ]),
            None,
        )

        # Pool for three sets: dummy atlas (2 samplers) + live atlas (2) + occ
        # mask (1) = 5 combined-image-sampler descriptors.
        self._shadow_pool = vk.vkCreateDescriptorPool(device, vk.VkDescriptorPoolCreateInfo(
            maxSets=3, poolSizeCount=1,
            pPoolSizes=[vk.VkDescriptorPoolSize(type=sampled_image, descriptorCount=5)],
        ), None)
        atlas_alloc = vk.VkDescriptorSetAllocateInfo(
            descriptorPool=self._shadow_pool, descriptorSetCount=2,
            pSetLayouts=[self._shadow_set_layout] * 2,
        )
        dummy_set, atlas_set = list(vk.vkAllocateDescriptorSets(device, atlas_alloc))
        occ_alloc = vk.VkDescriptorSetAllocateInfo(
            descriptorPool=self._shadow_pool, descriptorSetCount=1,
            pSetLayouts=[self._occ_set_layout],
        )
        occ_set = list(vk.vkAllocateDescriptorSets(device, occ_alloc))[0]
        self._shadow_dummy_set = dummy_set
        self._shadow_atlas_set = atlas_set
        self._occ_set = occ_set

        # 1x1 R16F placeholder atlas, so set 1 always has something valid bound
        # even when no light casts a shadow.
        placeholder = np.ones((1, 1), dtype=np.float16)
        self._shadow_dummy_image, self._shadow_dummy_memory = upload_image_data(
            device, phys, e.ctx.graphics_queue, e.ctx.command_pool,
            placeholder, 1, 1, fmt=vk.VK_FORMAT_R16_SFLOAT,
        )
        self._shadow_dummy_view = _r16f_view(device, self._shadow_dummy_image)
        self._write_shadow_set(device, self._shadow_dummy_set, self._shadow_dummy_view)

        # S1 occluder pipeline and S2 shadow-map build pipeline.
        self._occ_pipeline, self._occ_pipeline_layout = _create_occluder_pipeline(
            device, self._occ_vert_module, self._occ_frag_module, self._occ_render_pass,
        )
        self._sm_pipeline, self._sm_pipeline_layout = _create_shadowmap_pipeline(
            device, self._sm_vert_module, self._sm_frag_module, self._atlas_render_pass,
            self._occ_set_layout,
        )

    def _write_shadow_set(self, device: Any, dset: Any, view: Any) -> None:
        """Make both bindings of a set-1 descriptor set sample ``view``.

        Binding 0 goes through the wrap-u sampler (point lights) and binding 1
        through the clamp-u sampler (directional lights).
        """
        samplers_by_binding = (
            (0, self._atlas_sampler),
            (1, self._atlas_clamp_sampler),
        )
        writes = [
            vk.VkWriteDescriptorSet(
                dstSet=dset, dstBinding=slot, dstArrayElement=0, descriptorCount=1,
                descriptorType=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
                pImageInfo=[vk.VkDescriptorImageInfo(
                    sampler=sampler, imageView=view,
                    imageLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)],
            )
            for slot, sampler in samplers_by_binding
        ]
        vk.vkUpdateDescriptorSets(device, len(writes), writes, 0, None)

    def _ensure_shadow_targets(self, device: Any, phys: Any, w: int, h: int) -> None:
        """Make sure the occ mask and the shadow atlas exist.

        The occ mask is ``w`` x ``h`` and is torn down and rebuilt whenever the
        requested size differs from the current one. The atlas has a fixed
        ``SHADOW_MAP_RESOLUTION`` x ``MAX_SHADOW_LIGHTS`` size and is created
        only once. Each target gets its framebuffer, and the descriptor set
        that samples it is updated to the new view.
        """
        attach_and_sample = vk.VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | vk.VK_IMAGE_USAGE_SAMPLED_BIT

        # Drop a mask that no longer matches the requested size.
        have_mask = self._occ_image is not None
        if have_mask and (self._occ_w, self._occ_h) != (w, h):
            self._destroy_occ_target(device)

        if self._occ_image is None:
            self._occ_w, self._occ_h = w, h
            self._occ_image, self._occ_memory = create_image(
                device, phys, w, h, vk.VK_FORMAT_R8_UNORM, attach_and_sample,
            )
            self._occ_view = _r8_view(device, self._occ_image)
            mask_fb_info = vk.VkFramebufferCreateInfo(
                renderPass=self._occ_render_pass, attachmentCount=1, pAttachments=[self._occ_view],
                width=w, height=h, layers=1,
            )
            self._occ_framebuffer = vk.vkCreateFramebuffer(device, mask_fb_info, None)
            # Re-point the S2 occ-mask descriptor (set 0, binding 0) at the new
            # view, sampled with the nearest-clamp sampler.
            mask_write = vk.VkWriteDescriptorSet(
                dstSet=self._occ_set, dstBinding=0, dstArrayElement=0, descriptorCount=1,
                descriptorType=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
                pImageInfo=[vk.VkDescriptorImageInfo(
                    sampler=self._occ_sampler, imageView=self._occ_view,
                    imageLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)],
            )
            vk.vkUpdateDescriptorSets(device, 1, [mask_write], 0, None)

        if self._atlas_image is not None:
            return

        # First shadow-casting frame: create the fixed-size atlas.
        self._atlas_image, self._atlas_memory = create_image(
            device, phys, SHADOW_MAP_RESOLUTION, MAX_SHADOW_LIGHTS, vk.VK_FORMAT_R16_SFLOAT,
            attach_and_sample,
        )
        self._atlas_view = _r16f_view(device, self._atlas_image)
        atlas_fb_info = vk.VkFramebufferCreateInfo(
            renderPass=self._atlas_render_pass, attachmentCount=1, pAttachments=[self._atlas_view],
            width=SHADOW_MAP_RESOLUTION, height=MAX_SHADOW_LIGHTS, layers=1,
        )
        self._atlas_framebuffer = vk.vkCreateFramebuffer(device, atlas_fb_info, None)
        self._write_shadow_set(device, self._shadow_atlas_set, self._atlas_view)

    def _upload_occluders(self, device: Any, phys: Any, tris: np.ndarray) -> None:
        """Copy ``tris`` into the occluder vertex buffer, growing it first if
        it is too small.

        ``tris`` is a float32 ``(V, 2)`` triangle list in pixel space. When the
        current buffer cannot hold it, the old buffer and its memory are
        released and a new host-visible, host-coherent buffer of at least
        4096 bytes is created.
        """
        byte_count = tris.nbytes
        if byte_count > self._occ_vb_capacity:
            # Too small: free the previous allocation (if any) and replace it.
            if self._occ_vb:
                vk.vkDestroyBuffer(device, self._occ_vb, None)
                vk.vkFreeMemory(device, self._occ_vb_memory, None)
            host_flags = (
                vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
            )
            capacity = max(byte_count, 4096)
            self._occ_vb, self._occ_vb_memory = create_buffer(
                device, phys, capacity, vk.VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, host_flags,
            )
            self._occ_vb_capacity = capacity
        upload_numpy(device, self._occ_vb_memory, tris)

    def _gradient_descriptor(self, lut: np.ndarray) -> Any:
        """Return a descriptor set bound to ``lut`` for THIS frame (uploaded as a
        1-row R8 texture), or the dummy set if the per-frame set budget is
        exhausted. Identical LUTs in the same frame share one upload."""
        pixels = np.ascontiguousarray((np.clip(lut, 0.0, 1.0) * 255.0).astype(np.uint8)).reshape(1, -1)
        key = pixels.tobytes()
        cached = self._grad_frame_cache.get(key)
        if cached is not None:
            return cached
        if self._grad_next_set >= len(self._grad_sets):
            return self._grad_dummy_set
        e = self._engine
        device = e.ctx.device
        width = pixels.shape[1]
        image, memory = upload_image_data(
            device, e.ctx.physical_device, e.ctx.graphics_queue, e.ctx.command_pool,
            pixels, width, 1, fmt=vk.VK_FORMAT_R8_UNORM,
        )
        view = _r8_view(device, image)
        dset = self._grad_sets[self._grad_next_set]
        self._grad_next_set += 1
        self._write_grad_set(device, dset, view)
        self._grad_frame_cache[key] = dset
        self._grad_live.append((image, memory, view))
        return dset

    def _rotate_gradient_frame(self) -> None:
        """Advance the gradient double-buffer: destroy textures uploaded two
        frames back (safely retired), then arm a fresh per-frame slot."""
        device = self._engine.ctx.device
        # _grad_retiring[0] is two frames old: safe to destroy now.
        while len(self._grad_retiring) >= 2:
            self._destroy_grad_textures(device, self._grad_retiring.pop(0))
        # This frame's prior uploads become "retiring".
        if self._grad_live:
            self._grad_retiring.append(self._grad_live)
            self._grad_live = []
        self._grad_frame_cache = {}
        self._grad_next_set = 0

    @staticmethod
    def _destroy_grad_textures(device: Any, items: list[tuple[Any, Any, Any]]) -> None:
        for image, memory, view in items:
            vk.vkDestroyImageView(device, view, None)
            vk.vkDestroyImage(device, image, None)
            vk.vkFreeMemory(device, memory, None)

    def _create_render_target(self, device: Any, phys: Any, w: int, h: int) -> None:
        """Create the ``w`` x ``h`` light accumulation target.

        Builds, in order: the RGBA16F image (usable as colour attachment and
        as sampled texture), its 2D colour view, a colour-only offscreen
        render pass, and the framebuffer tying the view to that pass. The
        size is recorded in ``_rt_width`` / ``_rt_height``.
        """
        hdr_format = vk.VK_FORMAT_R16G16B16A16_SFLOAT  # HDR accumulation
        self._rt_width, self._rt_height = w, h

        usage = vk.VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | vk.VK_IMAGE_USAGE_SAMPLED_BIT
        self._rt_image, self._rt_memory = create_image(device, phys, w, h, hdr_format, usage)

        # View over the single colour mip level / array layer.
        colour_range = vk.VkImageSubresourceRange(
            aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
            baseMipLevel=0,
            levelCount=1,
            baseArrayLayer=0,
            layerCount=1,
        )
        view_info = vk.VkImageViewCreateInfo(
            image=self._rt_image,
            viewType=vk.VK_IMAGE_VIEW_TYPE_2D,
            format=hdr_format,
            subresourceRange=colour_range,
        )
        self._rt_view = vk.vkCreateImageView(device, view_info, None)

        # Colour-only pass; expected to clear to black and leave the image
        # shader-readable (behaviour comes from create_offscreen_pass).
        self._rt_render_pass = create_offscreen_pass(device, hdr_format, depth_format=0)

        framebuffer_info = vk.VkFramebufferCreateInfo(
            renderPass=self._rt_render_pass,
            attachmentCount=1,
            pAttachments=[self._rt_view],
            width=w,
            height=h,
            layers=1,
        )
        self._rt_framebuffer = vk.vkCreateFramebuffer(device, framebuffer_info, None)


[docs]
    def submit_light(
        self,
        position: tuple[float, float],
        colour: tuple[float, float, float],
        energy: float,
        light_range: float,
        falloff: float = 1.0,
        inner_radius: float = 0.0,
        falloff_gradient: Any = None,
        blend_mode: str = "add",
        shadow_enabled: bool = False,
        shadow_colour: tuple[float, ...] = DEFAULT_SHADOW_COLOUR,
        shadow_softness: float = 1.0,
        light_type: str = "point",
        direction: tuple[float, float] = (0.0, -1.0),
    ) -> None:
        """Append one light to this frame's submission list.

        ``falloff_gradient``, when given, is a 1-D intensity LUT (floats in
        ``[0, 1]``) that replaces the analytic falloff. It is stored as a flat,
        contiguous float32 array; it is later uploaded as a 1-row texture and
        sampled by the remapped distance.

        ``light_type`` and ``direction`` describe the light for shadow casting:
        ``"point"`` builds/samples a radial shadow map, while ``"directional"``
        builds an orthographic 1D map along ``perp(direction)`` and renders a
        flat fullscreen contribution.

        All other arguments are recorded unchanged.
        """
        if falloff_gradient is None:
            lut = None
        else:
            # Flatten to 1-D float32 and make sure the storage is contiguous.
            flat = np.asarray(falloff_gradient, dtype=np.float32).reshape(-1)
            lut = np.ascontiguousarray(flat)

        entry = {
            "position": position,
            "colour": colour,
            "energy": energy,
            "range": light_range,
            "falloff": falloff,
            "inner_radius": inner_radius,
            "falloff_gradient": lut,
            "blend_mode": blend_mode,
            "shadow_enabled": shadow_enabled,
            "shadow_colour": shadow_colour,
            "shadow_softness": shadow_softness,
            "light_type": light_type,
            "direction": direction,
        }
        self._lights.append(entry)



[docs]
    def submit_occluder(self, polygon_vertices: list[tuple[float, float]]) -> None:
        """Queue an occluder polygon for shadow casting this frame."""
        if len(polygon_vertices) >= 2:
            self._occluders.append(polygon_vertices)



[docs]
    def begin_frame(self) -> None:
        """Clear per-frame submission lists."""
        self._lights.clear()
        self._occluders.clear()



[docs]
    def render(
        self,
        cmd: Any,
        extent: tuple[int, int],
        lights: list[dict] | None = None,
        occluders: list[list[tuple[float, float]]] | None = None,
    ) -> None:
        """Render all queued lights to the accumulation render target.

        Must be called outside the main render pass (in pre_render phase).

        ``lights`` / ``occluders`` default to the live ``self._lights`` /
        ``self._occluders`` (the synchronous path). In pipelined mode the render
        thread passes the packet's OWNED snapshots so it never reads the live
        lists the main thread is concurrently rebuilding.
        """
        lights = self._lights if lights is None else lights
        occluders = self._occluders if occluders is None else occluders
        if not self._ready or not lights:
            return

        # Rotate gradient-cookie state for this frame: retire the textures that
        # were live two frames ago (their GPU work has retired by now), reset the
        # per-frame dedup cache + set cursor, and queue this frame's uploads for
        # retirement. Keeping one frame of slack avoids destroying a texture the
        # GPU may still be sampling.
        self._rotate_gradient_frame()

        w, h = extent

        # Resize render target if window changed
        if w != self._rt_width or h != self._rt_height:
            self._destroy_render_target()
            self._create_render_target(
                self._engine.ctx.device,
                self._engine.ctx.physical_device,
                w,
                h,
            )
            # Recreate pipeline for new extent
            device = self._engine.ctx.device
            if self._pipeline:
                vk.vkDestroyPipeline(device, self._pipeline, None)
            if self._pipeline_layout:
                vk.vkDestroyPipelineLayout(device, self._pipeline_layout, None)
            self._pipeline, self._pipeline_layout = _create_light2d_pipeline(
                device,
                self._vert_module,
                self._frag_module,
                self._rt_render_pass,
                (w, h),
                self._grad_set_layout,
                self._shadow_set_layout,
            )

        # --- Shadow build (S1 + S2). Assign atlas rows to the first
        # MAX_SHADOW_LIGHTS shadow-casting lights; anything past that (or when no
        # occluders exist) gets shadow_row = -1 and is byte-identical to the
        # pre-shadow path. Only allocate GPU targets when work actually exists. ---
        shadow_rows = self._build_shadow_atlas(cmd, lights, occluders, w, h)

        # Bind set 1 (the shadow atlas, or the 1x1 dummy when nothing casts a
        # shadow this frame) once for the whole disk pass.
        has_shadow = any(r >= 0 for r in shadow_rows)
        shadow_set = self._shadow_atlas_set if has_shadow else self._shadow_dummy_set

        # Begin offscreen render pass (clears to black)
        clear_value = vk.VkClearValue(
            color=vk.VkClearColorValue(float32=[0.0, 0.0, 0.0, 0.0]),
        )
        begin_info = vk.VkRenderPassBeginInfo(
            renderPass=self._rt_render_pass,
            framebuffer=self._rt_framebuffer,
            renderArea=vk.VkRect2D(
                offset=vk.VkOffset2D(x=0, y=0),
                extent=vk.VkExtent2D(width=w, height=h),
            ),
            clearValueCount=1,
            pClearValues=[clear_value],
        )
        vk.vkCmdBeginRenderPass(cmd, begin_info, vk.VK_SUBPASS_CONTENTS_INLINE)

        vk_viewport = vk.VkViewport(
            x=0.0,
            y=0.0,
            width=float(w),
            height=float(h),
            minDepth=0.0,
            maxDepth=1.0,
        )
        scissor = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=w, height=h),
        )

        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline)
        vk.vkCmdSetViewport(cmd, 0, 1, [vk_viewport])
        vk.vkCmdSetScissor(cmd, 0, 1, [scissor])
        # Set 1 (shadow atlas) is constant for the whole disk pass.
        vk.vkCmdBindDescriptorSets(
            cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline_layout,
            1, 1, [shadow_set], 0, None,
        )

        # Render each light as a fullscreen quad with push constants
        for light, row in zip(lights, shadow_rows, strict=True):
            px, py = light["position"]
            r, g, b = light["colour"]
            energy = light["energy"]
            lr = light["range"]
            falloff = light.get("falloff", 1.0)
            inner_radius = light.get("inner_radius", 0.0)

            # Gradient cookie: bind its descriptor set (or the dummy) and signal
            # the shader via light_colour.a (1 = sample LUT, 0 = analytic curve).
            lut = light.get("falloff_gradient")
            if lut is not None and getattr(lut, "shape", (0,))[0] >= 1:
                dset = self._gradient_descriptor(lut)
                has_gradient = 1.0 if dset is not self._grad_dummy_set else 0.0
            else:
                dset = self._grad_dummy_set
                has_gradient = 0.0
            vk.vkCmdBindDescriptorSets(
                cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline_layout,
                0, 1, [dset], 0, None,
            )

            # Shadow (S3) fields: row < 0 => skip; shadow_colour a = strength,
            # rgb = residual tint (frozen fold).
            sc = light.get("shadow_colour", DEFAULT_SHADOW_COLOUR)
            strength = float(sc[3]) if len(sc) >= 4 else 0.5
            tint_r, tint_g, tint_b = (float(sc[0]), float(sc[1]), float(sc[2])) if len(sc) >= 3 else (0.0, 0.0, 0.0)
            softness = float(light.get("shadow_softness", 1.0))
            is_dir = light.get("light_type", "point") == "directional"
            dx, dy = light.get("direction", (0.0, -1.0))

            # Pack push constants: must match LightPush struct layout (88 bytes).
            push_data = struct.pack(
                "ff ff ffff ffff ffff ffff ff",
                float(w), float(h),            # screen_size
                px, py,                        # light_pos
                lr, falloff, energy, inner_radius,
                r, g, b, has_gradient,         # light_colour (a = has_gradient)
                float(row), strength, softness, 1.0 if is_dir else 0.0,  # shadow_row/strength/softness/light_type
                tint_r, tint_g, tint_b, 0.0,   # shadow_tint (rgb)
                float(dx), float(dy),          # light_dir (directional)
            )
            self._engine.push_constants(cmd, self._pipeline_layout, push_data)
            vk.vkCmdDraw(cmd, 6, 1, 0, 0)  # 6 verts = fullscreen quad

        vk.vkCmdEndRenderPass(cmd)


    def _build_shadow_atlas(
        self,
        cmd: Any,
        lights: list[dict],
        occluders: list[list[tuple[float, float]]],
        w: int,
        h: int,
    ) -> list[int]:
        """Run S1 (occluder coverage) + S2 (1D atlas build) for this frame.

        Returns a list parallel to ``lights`` giving each light's atlas row, or
        ``-1`` when it casts no shadow (disabled, no occluders, or past
        ``MAX_SHADOW_LIGHTS``). When no row is assigned no GPU target is
        allocated and the returned list is all ``-1`` (byte-identical fast path).
        """
        rows: list[int] = []
        next_row = 0
        shadow_lights: list[tuple[int, dict]] = []   # (row, light)
        have_occ = bool(occluders)
        for light in lights:
            # Both POINT and DIRECTIONAL shadow-casting lights get an atlas row;
            # the S2 build + S3 sample branch on light_type.
            if (have_occ and light.get("shadow_enabled")
                    and light.get("light_type", "point") in ("point", "directional")
                    and next_row < MAX_SHADOW_LIGHTS):
                rows.append(next_row)
                shadow_lights.append((next_row, light))
                next_row += 1
            else:
                rows.append(-1)

        if not shadow_lights:
            return rows  # all -1: skip S1/S2, allocate nothing

        device = self._engine.ctx.device
        phys = self._engine.ctx.physical_device
        self._ensure_shadow_targets(device, phys, w, h)

        # Fan-triangulate every occluder polygon into one (V, 2) pixel-space
        # triangle list.
        tri: list[tuple[float, float]] = []
        for poly in occluders:
            if len(poly) < 3:
                continue
            v0 = poly[0]
            for i in range(1, len(poly) - 1):
                tri.append(v0)
                tri.append(poly[i])
                tri.append(poly[i + 1])
        if not tri:
            # Occluders present but all degenerate: nothing to rasterise. Clear
            # the atlas to "fully lit" so every sampled row reads 1.0.
            self._clear_atlas_lit(cmd)
            return rows
        tris = np.ascontiguousarray(np.asarray(tri, dtype=np.float32).reshape(-1, 2))
        self._upload_occluders(device, phys, tris)

        # --- S1: rasterise occluder triangles into the R8 coverage mask. ---
        self._run_occluder_pass(cmd, w, h, len(tris))
        _shader_read_barrier(cmd, self._occ_image)   # mask visible to S2

        # --- S2: build each shadow light's atlas row. ---
        self._run_atlas_pass(cmd, w, h, shadow_lights)
        _shader_read_barrier(cmd, self._atlas_image)  # atlas visible to S3
        return rows

    def _run_occluder_pass(self, cmd: Any, w: int, h: int, vertex_count: int) -> None:
        """S1: record the occluder coverage pass.

        Clears the mask attachment to 0 and draws ``vertex_count`` vertices
        from the occluder vertex buffer over a ``w`` x ``h`` area.
        """
        empty_mask = vk.VkClearValue(color=vk.VkClearColorValue(float32=[0.0, 0.0, 0.0, 0.0]))
        render_area = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=w, height=h),
        )
        begin_info = vk.VkRenderPassBeginInfo(
            renderPass=self._occ_render_pass,
            framebuffer=self._occ_framebuffer,
            renderArea=render_area,
            clearValueCount=1,
            pClearValues=[empty_mask],
        )
        vk.vkCmdBeginRenderPass(cmd, begin_info, vk.VK_SUBPASS_CONTENTS_INLINE)

        viewport = vk.VkViewport(
            x=0.0, y=0.0, width=float(w), height=float(h), minDepth=0.0, maxDepth=1.0,
        )
        scissor = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=w, height=h),
        )
        vk.vkCmdSetViewport(cmd, 0, 1, [viewport])
        vk.vkCmdSetScissor(cmd, 0, 1, [scissor])

        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._occ_pipeline)
        vk.vkCmdBindVertexBuffers(cmd, 0, 1, [self._occ_vb], [0])
        # The push block carries the screen size as two floats.
        screen_size = struct.pack("ff", float(w), float(h))
        self._engine.push_constants(cmd, self._occ_pipeline_layout, screen_size)
        vk.vkCmdDraw(cmd, vertex_count, 1, 0, 0)
        vk.vkCmdEndRenderPass(cmd)

    def _run_atlas_pass(self, cmd: Any, w: int, h: int, shadow_lights: list[tuple[int, dict]]) -> None:
        """S2: record one draw per shadow light into its own atlas row.

        The atlas is cleared to 1.0, then for each ``(row, light)`` pair the
        viewport and scissor are narrowed to the 1-px-tall strip at ``row`` and
        a 6-vertex quad is drawn with that light's parameters as push constants.
        """
        lit = vk.VkClearValue(color=vk.VkClearColorValue(float32=[1.0, 1.0, 1.0, 1.0]))
        atlas_area = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=SHADOW_MAP_RESOLUTION, height=MAX_SHADOW_LIGHTS),
        )
        begin_info = vk.VkRenderPassBeginInfo(
            renderPass=self._atlas_render_pass,
            framebuffer=self._atlas_framebuffer,
            renderArea=atlas_area,
            clearValueCount=1,
            pClearValues=[lit],
        )
        vk.vkCmdBeginRenderPass(cmd, begin_info, vk.VK_SUBPASS_CONTENTS_INLINE)
        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._sm_pipeline)
        # Set 0, binding 0 is the occluder mask that S2 samples.
        vk.vkCmdBindDescriptorSets(
            cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._sm_pipeline_layout,
            0, 1, [self._occ_set], 0, None,
        )

        for row, light in shadow_lights:
            strip = vk.VkViewport(
                x=0.0,
                y=float(row),
                width=float(SHADOW_MAP_RESOLUTION),
                height=1.0,
                minDepth=0.0,
                maxDepth=1.0,
            )
            clip = vk.VkRect2D(
                offset=vk.VkOffset2D(x=0, y=row),
                extent=vk.VkExtent2D(width=SHADOW_MAP_RESOLUTION, height=1),
            )
            vk.vkCmdSetViewport(cmd, 0, 1, [strip])
            vk.vkCmdSetScissor(cmd, 0, 1, [clip])

            px, py = light["position"]
            type_flag = 1.0 if light.get("light_type", "point") == "directional" else 0.0
            dx, dy = light.get("direction", (0.0, -1.0))
            # SMPush: screen_size, light_pos, direction, range, light_type (32 B).
            push = struct.pack(
                "ff ff ff f f",
                float(w), float(h),
                float(px), float(py),
                float(dx), float(dy),
                float(light["range"]),
                type_flag,
            )
            self._engine.push_constants(cmd, self._sm_pipeline_layout, push)
            vk.vkCmdDraw(cmd, 6, 1, 0, 0)

        vk.vkCmdEndRenderPass(cmd)

    def _clear_atlas_lit(self, cmd: Any) -> None:
        """Clear the whole shadow atlas to the fully-lit value (1.0).

        Records an atlas render pass that contains no draws, so only the clear
        takes effect, followed by the barrier that makes the atlas readable by
        a later fragment shader. Used when every occluder is degenerate.
        """
        lit = vk.VkClearValue(color=vk.VkClearColorValue(float32=[1.0, 1.0, 1.0, 1.0]))
        atlas_area = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=SHADOW_MAP_RESOLUTION, height=MAX_SHADOW_LIGHTS),
        )
        begin_info = vk.VkRenderPassBeginInfo(
            renderPass=self._atlas_render_pass,
            framebuffer=self._atlas_framebuffer,
            renderArea=atlas_area,
            clearValueCount=1,
            pClearValues=[lit],
        )
        vk.vkCmdBeginRenderPass(cmd, begin_info, vk.VK_SUBPASS_CONTENTS_INLINE)
        vk.vkCmdEndRenderPass(cmd)
        _shader_read_barrier(cmd, self._atlas_image)


[docs]
    def get_light_texture_view(self) -> Any:
        """Return the light accumulation image view for compositing."""
        return self._rt_view



[docs]
    def get_light_sampler(self) -> Any:
        """Return the sampler for the light accumulation texture."""
        return self._rt_sampler



[docs]
    @property
    def has_lights(self) -> bool:
        """True if any lights were submitted this frame."""
        return len(self._lights) > 0


    def _destroy_render_target(self) -> None:
        """Destroy offscreen RT resources (for resize)."""
        device = self._engine.ctx.device
        if self._rt_framebuffer:
            vk.vkDestroyFramebuffer(device, self._rt_framebuffer, None)
        if self._rt_render_pass:
            vk.vkDestroyRenderPass(device, self._rt_render_pass, None)
        if self._rt_view:
            vk.vkDestroyImageView(device, self._rt_view, None)
        if self._rt_image:
            vk.vkDestroyImage(device, self._rt_image, None)
        if self._rt_memory:
            vk.vkFreeMemory(device, self._rt_memory, None)

    def _destroy_occ_target(self, device: Any) -> None:
        """Destroy the screen-sized occ mask (for resize / cleanup)."""
        if self._occ_framebuffer:
            vk.vkDestroyFramebuffer(device, self._occ_framebuffer, None)
        if self._occ_view:
            vk.vkDestroyImageView(device, self._occ_view, None)
        if self._occ_image:
            vk.vkDestroyImage(device, self._occ_image, None)
        if self._occ_memory:
            vk.vkFreeMemory(device, self._occ_memory, None)
        self._occ_framebuffer = self._occ_view = self._occ_image = self._occ_memory = None
        self._occ_w = self._occ_h = 0

    def _destroy_shadow_resources(self, device: Any) -> None:
        """Release every GPU object owned by the shadow stages (S1/S2/S3).

        Order: occluder mask target, atlas target, occluder vertex buffer,
        dummy shadow texture, then pipelines, layouts, shader modules, render
        passes, samplers, the descriptor pool and the descriptor set layouts.
        Handles that are falsy are skipped.
        """

        def release(handle: Any, destroy: Any) -> None:
            # Destroy `handle` with `destroy` unless it was never created.
            if handle:
                destroy(device, handle, None)

        self._destroy_occ_target(device)

        # Shadow atlas render target.
        release(self._atlas_framebuffer, vk.vkDestroyFramebuffer)
        release(self._atlas_view, vk.vkDestroyImageView)
        release(self._atlas_image, vk.vkDestroyImage)
        release(self._atlas_memory, vk.vkFreeMemory)

        # Occluder vertex buffer: its memory is freed together with the buffer.
        if self._occ_vb:
            vk.vkDestroyBuffer(device, self._occ_vb, None)
            vk.vkFreeMemory(device, self._occ_vb_memory, None)

        # Dummy shadow texture.
        release(self._shadow_dummy_view, vk.vkDestroyImageView)
        release(self._shadow_dummy_image, vk.vkDestroyImage)
        release(self._shadow_dummy_memory, vk.vkFreeMemory)

        # Pipelines, shaders, render passes, samplers and descriptor objects.
        release(self._occ_pipeline, vk.vkDestroyPipeline)
        release(self._occ_pipeline_layout, vk.vkDestroyPipelineLayout)
        release(self._sm_pipeline, vk.vkDestroyPipeline)
        release(self._sm_pipeline_layout, vk.vkDestroyPipelineLayout)
        release(self._occ_vert_module, vk.vkDestroyShaderModule)
        release(self._occ_frag_module, vk.vkDestroyShaderModule)
        release(self._sm_vert_module, vk.vkDestroyShaderModule)
        release(self._sm_frag_module, vk.vkDestroyShaderModule)
        release(self._occ_render_pass, vk.vkDestroyRenderPass)
        release(self._atlas_render_pass, vk.vkDestroyRenderPass)
        release(self._occ_sampler, vk.vkDestroySampler)
        release(self._atlas_sampler, vk.vkDestroySampler)
        release(self._atlas_clamp_sampler, vk.vkDestroySampler)
        release(self._shadow_pool, vk.vkDestroyDescriptorPool)
        release(self._shadow_set_layout, vk.vkDestroyDescriptorSetLayout)
        release(self._occ_set_layout, vk.vkDestroyDescriptorSetLayout)


[docs]
    def cleanup(self) -> None:
        """Destroy all GPU resources."""
        if not self._ready:
            return
        device = self._engine.ctx.device
        self._destroy_render_target()
        self._destroy_shadow_resources(device)
        # Gradient cookie textures (live + retiring), dummy LUT and descriptor
        # infra. Descriptor sets are freed with the pool.
        for batch in [self._grad_live, *self._grad_retiring]:
            self._destroy_grad_textures(device, batch)
        self._grad_live = []
        self._grad_retiring = []
        if self._grad_dummy_view:
            vk.vkDestroyImageView(device, self._grad_dummy_view, None)
        if self._grad_dummy_image:
            vk.vkDestroyImage(device, self._grad_dummy_image, None)
        if self._grad_dummy_memory:
            vk.vkFreeMemory(device, self._grad_dummy_memory, None)
        for obj, fn in [
            (self._pipeline, vk.vkDestroyPipeline),
            (self._pipeline_layout, vk.vkDestroyPipelineLayout),
            (self._vert_module, vk.vkDestroyShaderModule),
            (self._frag_module, vk.vkDestroyShaderModule),
            (self._rt_sampler, vk.vkDestroySampler),
            (self._grad_sampler, vk.vkDestroySampler),
            (self._grad_pool, vk.vkDestroyDescriptorPool),
            (self._grad_set_layout, vk.vkDestroyDescriptorSetLayout),
        ]:
            if obj:
                fn(device, obj, None)
        self._ready = False



def _r8_view(device: Any, image: Any) -> Any:
    """Create a 2D ``R8_UNORM`` colour view of ``image`` (gradient LUT texture).

    The view covers mip level 0 and array layer 0 only.
    """
    single_level = vk.VkImageSubresourceRange(
        aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
        baseMipLevel=0,
        levelCount=1,
        baseArrayLayer=0,
        layerCount=1,
    )
    create_info = vk.VkImageViewCreateInfo(
        image=image,
        viewType=vk.VK_IMAGE_VIEW_TYPE_2D,
        format=vk.VK_FORMAT_R8_UNORM,
        subresourceRange=single_level,
    )
    return vk.vkCreateImageView(device, create_info, None)

def _r16f_view(device: Any, image: Any) -> Any:
    """Create a 2D ``R16_SFLOAT`` colour view of ``image`` (shadow atlas / dummy).

    The view covers mip level 0 and array layer 0 only.
    """
    single_level = vk.VkImageSubresourceRange(
        aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
        baseMipLevel=0,
        levelCount=1,
        baseArrayLayer=0,
        layerCount=1,
    )
    create_info = vk.VkImageViewCreateInfo(
        image=image,
        viewType=vk.VK_IMAGE_VIEW_TYPE_2D,
        format=vk.VK_FORMAT_R16_SFLOAT,
        subresourceRange=single_level,
    )
    return vk.vkCreateImageView(device, create_info, None)

def _shader_read_barrier(cmd: Any, image: Any) -> None:
    """Record a colour-attachment-write -> fragment-shader-read barrier on ``image``.

    The image is expected to be in SHADER_READ_ONLY already (the render pass
    performs that transition), so old and new layout are identical; the barrier
    only adds the execution + memory dependency that the render pass's
    conservative store transition does not provide.
    """
    whole_colour_image = vk.VkImageSubresourceRange(
        aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
        baseMipLevel=0,
        levelCount=1,
        baseArrayLayer=0,
        layerCount=1,
    )
    barrier = vk.VkImageMemoryBarrier(
        srcAccessMask=vk.VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
        dstAccessMask=vk.VK_ACCESS_SHADER_READ_BIT,
        oldLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        newLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
        image=image,
        subresourceRange=whole_colour_image,
    )
    vk.vkCmdPipelineBarrier(
        cmd,
        vk.VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,   # source stage
        vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,           # destination stage
        0,                                                  # dependency flags
        0, None,                                            # no global memory barriers
        0, None,                                            # no buffer barriers
        1, [barrier],                                       # one image barrier
    )

def _create_light2d_pipeline(
    device: Any,
    vert_module: Any,
    frag_module: Any,
    render_pass: Any,
    extent: tuple[int, int],
    grad_set_layout: Any,
    shadow_set_layout: Any,
) -> tuple[Any, Any]:
    """Create additive-blend pipeline for 2D light rendering (no vertex buffer).

    ``grad_set_layout`` is the gradient-cookie descriptor set layout (set 0);
    ``shadow_set_layout`` is the shadow-atlas set layout (set 1). Both are a
    single combined image sampler the fragment shader samples.

    Args:
        device: Logical device the objects are created on.
        vert_module: Vertex shader module (entry point ``main``).
        frag_module: Fragment shader module (entry point ``main``).
        render_pass: Render pass the pipeline is created against.
        extent: ``(width, height)`` written into the static viewport/scissor;
            viewport and scissor are also declared as dynamic state.
        grad_set_layout: Descriptor set layout bound as set 0.
        shadow_set_layout: Descriptor set layout bound as set 1.

    Returns:
        ``(pipeline, pipeline_layout)``.

    Raises:
        RuntimeError: If the layout or the pipeline cannot be created. A layout
            that was already created is destroyed before the error propagates,
            so a failed call leaves nothing behind.
    """
    ffi = vk.ffi

    # NOTE: every ffi.new() result below stays bound to a local until the
    # create call returns; cffi releases the memory when its owning Python
    # object is garbage collected.

    # Push constant range: PUSH_SIZE bytes (LightPush struct)
    push_range = ffi.new("VkPushConstantRange*")
    push_range.stageFlags = vk.VK_SHADER_STAGE_VERTEX_BIT | vk.VK_SHADER_STAGE_FRAGMENT_BIT
    push_range.offset = 0
    push_range.size = PUSH_SIZE

    set_layouts = ffi.new("VkDescriptorSetLayout[2]", [grad_set_layout, shadow_set_layout])
    layout_ci = ffi.new("VkPipelineLayoutCreateInfo*")
    layout_ci.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO
    layout_ci.setLayoutCount = 2
    layout_ci.pSetLayouts = set_layouts
    layout_ci.pushConstantRangeCount = 1
    layout_ci.pPushConstantRanges = push_range

    layout_out = ffi.new("VkPipelineLayout*")
    result = vk._vulkan._callApi(
        vk._vulkan.lib.vkCreatePipelineLayout,
        device,
        layout_ci,
        ffi.NULL,
        layout_out,
    )
    if result != vk.VK_SUCCESS:
        raise RuntimeError(f"vkCreatePipelineLayout failed: {result}")
    pipeline_layout = layout_out[0]

    pi = ffi.new("VkGraphicsPipelineCreateInfo*")
    pi.sType = vk.VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO

    # Shader stages
    stages = ffi.new("VkPipelineShaderStageCreateInfo[2]")
    main_name = ffi.new("char[]", b"main")
    stages[0].sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO
    stages[0].stage = vk.VK_SHADER_STAGE_VERTEX_BIT
    stages[0].module = vert_module
    stages[0].pName = main_name
    stages[1].sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO
    stages[1].stage = vk.VK_SHADER_STAGE_FRAGMENT_BIT
    stages[1].module = frag_module
    stages[1].pName = main_name
    pi.stageCount = 2
    pi.pStages = stages

    # No vertex input (positions generated in vertex shader from gl_VertexIndex)
    vi = ffi.new("VkPipelineVertexInputStateCreateInfo*")
    vi.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO
    pi.pVertexInputState = vi

    # Input assembly: TRIANGLE_LIST (6 verts per quad)
    ia = ffi.new("VkPipelineInputAssemblyStateCreateInfo*")
    ia.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO
    ia.topology = vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST
    pi.pInputAssemblyState = ia

    # Viewport state (static values; both are also listed as dynamic state below)
    vps = ffi.new("VkPipelineViewportStateCreateInfo*")
    vps.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO
    vps.viewportCount = 1
    viewport = ffi.new("VkViewport*")
    viewport.width = float(extent[0])
    viewport.height = float(extent[1])
    viewport.maxDepth = 1.0
    vps.pViewports = viewport
    scissor = ffi.new("VkRect2D*")
    scissor.extent.width = extent[0]
    scissor.extent.height = extent[1]
    vps.scissorCount = 1
    vps.pScissors = scissor
    pi.pViewportState = vps

    # Rasterization
    rs = ffi.new("VkPipelineRasterizationStateCreateInfo*")
    rs.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO
    rs.polygonMode = vk.VK_POLYGON_MODE_FILL
    rs.lineWidth = 1.0
    rs.cullMode = vk.VK_CULL_MODE_NONE
    pi.pRasterizationState = rs

    # Multisample
    ms = ffi.new("VkPipelineMultisampleStateCreateInfo*")
    ms.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO
    ms.rasterizationSamples = vk.VK_SAMPLE_COUNT_1_BIT
    pi.pMultisampleState = ms

    # No depth test
    dss = ffi.new("VkPipelineDepthStencilStateCreateInfo*")
    dss.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO
    dss.depthTestEnable = 0
    dss.depthWriteEnable = 0
    pi.pDepthStencilState = dss

    # Additive blending: src + dst
    cba = ffi.new("VkPipelineColorBlendAttachmentState*")
    cba.blendEnable = 1
    cba.srcColorBlendFactor = vk.VK_BLEND_FACTOR_ONE
    cba.dstColorBlendFactor = vk.VK_BLEND_FACTOR_ONE
    cba.colorBlendOp = vk.VK_BLEND_OP_ADD
    cba.srcAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE
    cba.dstAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE
    cba.alphaBlendOp = vk.VK_BLEND_OP_ADD
    cba.colorWriteMask = (
        vk.VK_COLOR_COMPONENT_R_BIT
        | vk.VK_COLOR_COMPONENT_G_BIT
        | vk.VK_COLOR_COMPONENT_B_BIT
        | vk.VK_COLOR_COMPONENT_A_BIT
    )
    cb = ffi.new("VkPipelineColorBlendStateCreateInfo*")
    cb.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO
    cb.attachmentCount = 1
    cb.pAttachments = cba
    pi.pColorBlendState = cb

    # Dynamic state
    dyn_states = ffi.new(
        "VkDynamicState[2]",
        [
            vk.VK_DYNAMIC_STATE_VIEWPORT,
            vk.VK_DYNAMIC_STATE_SCISSOR,
        ],
    )
    ds = ffi.new("VkPipelineDynamicStateCreateInfo*")
    ds.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO
    ds.dynamicStateCount = 2
    ds.pDynamicStates = dyn_states
    pi.pDynamicState = ds

    pi.layout = pipeline_layout
    pi.renderPass = render_pass

    pipeline_out = ffi.new("VkPipeline*")
    result = vk._vulkan._callApi(
        vk._vulkan.lib.vkCreateGraphicsPipelines,
        device,
        pipeline_cache_for(device),
        1,
        pi,
        ffi.NULL,
        pipeline_out,
    )
    if result != vk.VK_SUCCESS:
        # The layout was created above and is not returned on this path, so
        # release it here rather than leak it.
        vk.vkDestroyPipelineLayout(device, pipeline_layout, None)
        raise RuntimeError(f"vkCreateGraphicsPipelines failed: {result}")

    log.debug("Light2D pipeline created (additive blend)")
    return pipeline_out[0], pipeline_layout

def _create_occluder_pipeline(
    device: Any, vert_module: Any, frag_module: Any, render_pass: Any,
) -> tuple[Any, Any]:
    """Build the S1 occluder-coverage pipeline.

    Consumes vec2 pixel-space vertices (8-byte stride, one R32G32_SFLOAT
    attribute at location 0) and writes the R8 mask with opaque (overwrite)
    blending, no culling and no depth. Returns whatever ``build_pipeline``
    returns for the spec.
    """
    build_extent = (SHADOW_MAP_RESOLUTION, SHADOW_MAP_RESOLUTION)
    position_attr = (0, vk.VK_FORMAT_R32G32_SFLOAT, 0)
    occluder_spec = PipelineSpec(
        name="light2d_occluder",
        topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
        vertex_stride=8,   # one vec2 of float32
        vertex_attrs=(position_attr,),
        cull_mode=vk.VK_CULL_MODE_NONE,
        depth_test=False,
        depth_write=False,
        blend="opaque",   # coverage overwrites, it is never blended
        # The push range keeps the default VERT|FRAG stages so that the fixed
        # stage flags used by engine.push_constants match, even though only
        # the vertex shader reads the block.
        push_size=OCC_PUSH_SIZE,
    )
    return build_pipeline(
        device,
        occluder_spec,
        render_pass,
        build_extent,
        vert_module=vert_module,
        frag_module=frag_module,
    )

def _create_shadowmap_pipeline(
    device: Any, vert_module: Any, frag_module: Any, render_pass: Any, occ_set_layout: Any,
) -> tuple[Any, Any]:
    """Build the S2 shadow-map pipeline (shader-generated quad -> R16F atlas row).

    ``occ_set_layout`` is bound as set 0 and gives the shader access to the
    occluder mask. The atlas row is selected at draw time through the dynamic
    viewport/scissor. Returns whatever ``build_pipeline`` returns for the spec.
    """
    atlas_extent = (SHADOW_MAP_RESOLUTION, MAX_SHADOW_LIGHTS)
    shadowmap_spec = PipelineSpec(
        name="light2d_shadowmap",
        topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
        cull_mode=vk.VK_CULL_MODE_NONE,
        depth_test=False,
        depth_write=False,
        blend="opaque",
        set_layouts=(occ_set_layout,),
        # The push range keeps the default VERT|FRAG stages so that the fixed
        # stage flags used by engine.push_constants match, even though only
        # the fragment shader reads the block.
        push_size=SM_PUSH_SIZE,
    )
    return build_pipeline(
        device,
        shadowmap_spec,
        render_pass,
        atlas_extent,
        vert_module=vert_module,
        frag_module=frag_module,
    )