Source code for simvx.graphics.renderer.draw2d_pass

"""2D drawing pass: renders Draw2D geometry using ui.vert + draw2d_fill.frag, plus MSDF text.

Fill fragments are modulated by the ``Light2DPass`` accumulation texture when
any ``PointLight2D`` is present in the scene (multiply blend with an ambient
floor). With no lights submitted, the composite is a bypass and fills render
with their unlit vertex colour.
"""

import logging
from dataclasses import dataclass
from typing import Any

import numpy as np
import vulkan as vk

from ..types import SHADER_DIR
from ..draw2d_vertex import UI_VERTEX_DTYPE
from ..gpu.memory import create_buffer, upload_numpy
from .pass_helpers import create_sampler_descriptor_pool

__all__ = ["Draw2DPass"]

log = logging.getLogger(__name__)

# Pre-allocate buffers: sized for full terminal rendering (100x30 terminal
# with bitmap font can generate ~100K verts for character pixels + backgrounds)
MAX_FILL_VERTS = 131072   # 128K verts (4 MB)
MAX_LINE_VERTS = 32768    # 32K verts (1 MB)
VERTEX_STRIDE = 32        # pos(vec2) + uv(vec2) + colour(vec4)
MAX_FILL_INDICES = 196608  # 192K indices (768 KB)
MAX_TEXT_VERTS = 32768
MAX_TEXT_INDICES = 49152
MAX_TEX_VERTS = 16384
MAX_TEX_INDICES = 24576

# Fill push constant layout: vec2 screen_size + vec2 pad + vec4 ambient + ivec4 flags = 48 bytes
FILL_PUSH_SIZE = 48
# Ambient floor applied to unlit regions when any PointLight2D is submitted.
# Matches the dark-neutral brightness Godot's Light2D defaults to when the
# canvas_modulate ambient is left unspecified.
_DEFAULT_AMBIENT = (0.2, 0.2, 0.2, 1.0)


[docs]
class Draw2DPass:
    """GPU pass that renders 2D fills (triangles), lines, textured quads, and MSDF text from Draw2D buffers.

    Text rendering shares the TextPass's pipeline, descriptor set, and atlas: only the
    text vertex/index buffers are owned here (needed for per-batch scissor clipping).
    Fills, lines and textured quads each get their own pipeline, pipeline layout and
    host-visible vertex buffer (plus an index buffer for fills and textured quads).
    """

    # Slot groups, in order: collaborators; fill pipeline + its light-accumulation
    # descriptor objects; line pipeline; shader modules; fill / line / text
    # buffers with their memory; textured-quad pipeline, shader and buffers;
    # lifecycle flag and the per-frame draw counter.
    __slots__ = (
        "_engine", "_text_pass", "_light2d_pass",
        "_fill_pipeline", "_fill_pipeline_layout",
        "_fill_desc_layout", "_fill_desc_pool", "_fill_desc_set", "_fill_desc_view",
        "_line_pipeline", "_line_pipeline_layout",
        "_vert_module", "_frag_module", "_line_frag_module",
        "_fill_vb", "_fill_vb_mem", "_fill_ib", "_fill_ib_mem",
        "_line_vb", "_line_vb_mem",
        "_text_vb", "_text_vb_mem", "_text_ib", "_text_ib_mem",
        "_tex_pipeline", "_tex_pipeline_layout", "_tex_frag_module",
        "_tex_vb", "_tex_vb_mem", "_tex_ib", "_tex_ib_mem",
        "_ready", "last_frame_draw_count",
    )

    def __init__(self, engine: Any, text_pass: Any = None, light2d_pass: Any = None):
        for slot in self.__slots__:
            object.__setattr__(self, slot, None)
        self._engine = engine
        self._text_pass = text_pass
        self._light2d_pass = light2d_pass
        self._ready = False
        self.last_frame_draw_count = 0


[docs]
    def setup(self, render_pass: Any = None, extent: tuple[int, int] | None = None) -> None:
        """Compile shaders, build the three owned pipelines and allocate geometry buffers.

        Args:
            render_pass: Vulkan render pass the pipelines are compiled against.
                A falsy value selects the engine's ``render_pass``.
            extent: ``(width, height)`` passed to pipeline creation. A falsy
                value selects the engine's ``extent``.
        """
        engine = self._engine
        device = engine.ctx.device
        phys = engine.ctx.physical_device
        target_pass = render_pass or engine.render_pass
        target_extent = extent or engine.extent

        from ..gpu.pipeline import create_shader_module
        from ..materials.shader_compiler import compile_shader

        def load_shader(filename: str) -> Any:
            """Compile one shader from SHADER_DIR and wrap it in a shader module."""
            return create_shader_module(device, compile_shader(SHADER_DIR / filename))

        # Fills go through draw2d_fill.frag (modulated by the Light2D accumulation
        # texture); lines use the plain solid-colour fragment shader and stay unlit.
        self._vert_module = load_shader("ui.vert")
        self._frag_module = load_shader("draw2d_fill.frag")
        self._line_frag_module = load_shader("ui_solid.frag")

        # Set 0 of the fill pipeline is the light-accumulation sampler2D. The
        # descriptor is only written when a Light2DPass was supplied; the view is
        # remembered so render() can detect when that pass recreates its target.
        self._fill_desc_layout = _create_fill_descriptor_layout(device)
        self._fill_desc_pool, allocated_sets = create_sampler_descriptor_pool(
            device, self._fill_desc_layout,
        )
        self._fill_desc_set = allocated_sets[0]
        light_pass = self._light2d_pass
        if light_pass is not None:
            light_view = light_pass.get_light_texture_view()
            light_sampler = light_pass.get_light_sampler()
            _write_fill_descriptor(device, self._fill_desc_set, light_view, light_sampler)
            self._fill_desc_view = light_view

        # Triangle-list fill pipeline: descriptor set 0 + 48-byte push block.
        fill_objects = _create_fill_pipeline(
            device, self._vert_module, self._frag_module,
            target_pass, target_extent, self._fill_desc_layout,
        )
        self._fill_pipeline, self._fill_pipeline_layout = fill_objects

        # Line-list pipeline: same vertex shader, solid-colour fragment shader.
        line_objects = _create_line2d_pipeline(
            device, self._vert_module, self._line_frag_module,
            target_pass, target_extent,
        )
        self._line_pipeline, self._line_pipeline_layout = line_objects

        # Every geometry buffer lives in host-visible, host-coherent memory.
        host_flags = (
            vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
            | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT
        )

        def vertex_buffer(max_verts: int) -> tuple[Any, Any]:
            """Allocate a (buffer, memory) pair holding ``max_verts`` UI vertices."""
            return create_buffer(
                device, phys, max_verts * VERTEX_STRIDE,
                vk.VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, host_flags,
            )

        def index_buffer(max_indices: int) -> tuple[Any, Any]:
            """Allocate a (buffer, memory) pair holding ``max_indices`` 4-byte indices."""
            return create_buffer(
                device, phys, max_indices * 4,
                vk.VK_BUFFER_USAGE_INDEX_BUFFER_BIT, host_flags,
            )

        self._fill_vb, self._fill_vb_mem = vertex_buffer(MAX_FILL_VERTS)
        self._fill_ib, self._fill_ib_mem = index_buffer(MAX_FILL_INDICES)
        self._line_vb, self._line_vb_mem = vertex_buffer(MAX_LINE_VERTS)

        # Text geometry is assembled per batch here (for scissor clipping), so the
        # pass owns text vertex/index buffers of its own.
        self._text_vb, self._text_vb_mem = vertex_buffer(MAX_TEXT_VERTS)
        self._text_ib, self._text_ib_mem = index_buffer(MAX_TEXT_INDICES)

        # Textured-quad pipeline: bound against the engine's texture descriptor layout.
        self._tex_frag_module = load_shader("ui.frag")
        bindless_layout = engine.texture_descriptor_layout
        tex_objects = _create_textured_ui_pipeline(
            device, self._vert_module, self._tex_frag_module,
            target_pass, target_extent, bindless_layout,
        )
        self._tex_pipeline, self._tex_pipeline_layout = tex_objects
        self._tex_vb, self._tex_vb_mem = vertex_buffer(MAX_TEX_VERTS)
        self._tex_ib, self._tex_ib_mem = index_buffer(MAX_TEX_INDICES)

        self._ready = True



[docs]
    def render(self, cmd: Any, width: int, height: int,
               ui_width: int = 0, ui_height: int = 0,
               ops: list | None = None) -> None:
        """Render 2D geometry in submission order.

        Walks ``Draw2D._ops`` (or the provided ``ops`` list, e.g. an isolated
        play-mode tree), coalesces adjacent same-(kind, clip, tex_id) ops into
        one GPU draw, and emits draws in submission order, so the order in
        which ``draw_rect`` / ``draw_text`` / ``draw_texture`` calls happen in
        ``on_draw`` is the order they hit the framebuffer.

        Args:
            ops: Pre-extracted op list. If None, pulls from Draw2D singleton.
        """
        if not self._ready:
            self.last_frame_draw_count = 0
            return

        from ..draw2d import Draw2D
        from ..draw2d_ops import OpKind

        if ops is None:
            ops = Draw2D._ops

        if not ops:
            self.last_frame_draw_count = 0
            return

        device = self._engine.ctx.device
        # UI coordinates may differ from framebuffer pixels (HiDPI / window vs framebuffer)
        uw = ui_width or width
        uh = ui_height or height
        screen = np.array([uw, uh], dtype=np.float32)

        # Refresh the light-accumulation descriptor if Light2DPass recreated its RT
        # (happens on window resize). Rare, so waitIdle is acceptable here.
        if self._light2d_pass is not None:
            current_view = self._light2d_pass.get_light_texture_view()
            if current_view != self._fill_desc_view:
                vk.vkDeviceWaitIdle(device)
                _write_fill_descriptor(
                    device, self._fill_desc_set,
                    current_view, self._light2d_pass.get_light_sampler(),
                )
                self._fill_desc_view = current_view

        # Build fill push constants: vec2 screen + vec2 pad + vec4 ambient + ivec4(has_lights, 0, 0, 0)
        has_lights = 1 if (
            self._light2d_pass is not None and self._light2d_pass.has_lights
        ) else 0
        fill_push = _build_fill_push(uw, uh, _DEFAULT_AMBIENT, has_lights)

        vk_viewport = vk.VkViewport(
            x=0.0, y=0.0,
            width=float(width), height=float(height),
            minDepth=0.0, maxDepth=1.0,
        )
        full_scissor = vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0),
            extent=vk.VkExtent2D(width=width, height=height),
        )

        # Pass 1: coalesce ops into runs by (kind, clip, tex_id). Each run is
        # one GPU draw. Accumulate raw vertex tuples + integer indices into
        # per-kind staging lists; one numpy conversion happens per kind at
        # upload time (Pass 2): not per op.
        kind_verts: dict[int, list[tuple]] = {k: [] for k in (0, 1, 2, 3)}
        kind_indices: dict[int, list[int]] = {k: [] for k in (0, 1, 2, 3)}
        kind_vert_cursor = [0, 0, 0, 0]
        kind_idx_cursor = [0, 0, 0, 0]

        # (kind, clip, tex_id, vert_start, vert_count, idx_start, idx_count)
        draws: list[tuple[int, tuple | None, int, int, int, int, int]] = []

        sentinel = object()
        run_kind: int | None = None
        run_clip: Any = sentinel
        run_tex = -1
        run_vert_start = 0
        run_idx_start = 0
        run_vert_count = 0
        run_idx_count = 0
        run_local_vert_off = 0

        for op in ops:
            k = int(op.kind)
            same_run = (
                k == run_kind
                and op.clip == run_clip
                and (k != OpKind.TEX or op.tex_id == run_tex)
            )
            if not same_run:
                if run_kind is not None and (run_vert_count or run_idx_count):
                    draws.append((
                        run_kind, run_clip, run_tex,
                        run_vert_start, run_vert_count,
                        run_idx_start, run_idx_count,
                    ))
                run_kind = k
                run_clip = op.clip
                run_tex = op.tex_id
                run_vert_start = kind_vert_cursor[k]
                run_idx_start = kind_idx_cursor[k]
                run_vert_count = 0
                run_idx_count = 0
                run_local_vert_off = 0

            nv = len(op.verts)
            kind_verts[k].extend(op.verts)
            kind_vert_cursor[k] += nv
            run_vert_count += nv

            if op.indices is not None:
                # Indices in the per-op payload are local to the op (start at 0).
                # Within a run we offset them by the run-local vertex count so the
                # concatenated index stream is valid relative to ``run_vert_start``,
                # which becomes vkCmdDrawIndexed's vertexOffset.
                if run_local_vert_off:
                    kind_indices[k].extend(i + run_local_vert_off for i in op.indices)
                else:
                    kind_indices[k].extend(op.indices)
                kind_idx_cursor[k] += len(op.indices)
                run_idx_count += len(op.indices)

            run_local_vert_off += nv

        if run_kind is not None and (run_vert_count or run_idx_count):
            draws.append((
                run_kind, run_clip, run_tex,
                run_vert_start, run_vert_count,
                run_idx_start, run_idx_count,
            ))

        if not draws:
            self.last_frame_draw_count = 0
            return

        # Pass 2: upload each kind's concatenated buffer once (clamped to capacity).
        # The numpy conversion happens here, not per-op.
        def _upload(k: int, vb_mem: Any, ib_mem: Any | None,
                    max_v: int, max_i: int | None) -> None:
            verts = kind_verts[k]
            if not verts:
                return
            arr = np.asarray(verts, dtype=np.float32).reshape(-1, 8)
            if arr.shape[0] > max_v:
                log.warning("Draw2D overflow (kind %d): %d verts (max %d)", k, arr.shape[0], max_v)
                arr = arr[:max_v]
            v = np.empty(arr.shape[0], dtype=UI_VERTEX_DTYPE)
            v["position"] = arr[:, :2]
            v["uv"] = arr[:, 2:4]
            v["colour"] = arr[:, 4:8]
            upload_numpy(device, vb_mem, v)
            if ib_mem is not None and kind_indices[k]:
                i = np.asarray(kind_indices[k], dtype=np.uint32)
                if max_i is not None and len(i) > max_i:
                    i = i[:max_i]
                upload_numpy(device, ib_mem, i)

        _upload(OpKind.FILL, self._fill_vb_mem, self._fill_ib_mem,
                MAX_FILL_VERTS, MAX_FILL_INDICES)
        _upload(OpKind.LINE, self._line_vb_mem, None, MAX_LINE_VERTS, None)
        _upload(OpKind.TEXT, self._text_vb_mem, self._text_ib_mem,
                MAX_TEXT_VERTS, MAX_TEXT_INDICES)
        _upload(OpKind.TEX, self._tex_vb_mem, self._tex_ib_mem,
                MAX_TEX_VERTS, MAX_TEX_INDICES)

        # Pass 3: issue draws in submission order. Rebind pipeline only when
        # the kind transitions; reset scissor only when the clip transitions;
        # update tex_id push-constant only when the bound texture changes.
        clip_sx = width / uw if uw > 0 else 1.0
        clip_sy = height / uh if uh > 0 else 1.0

        last_kind = -1
        last_clip: Any = sentinel
        tex_desc = self._engine.texture_descriptor_set if self._tex_pipeline else None
        draw_count = 0
        for kind, clip, tex_id, vert_off, vert_count, idx_off, idx_count in draws:
            # Skip text runs when no atlas is ready (mirrors prior behaviour)
            if kind == OpKind.TEXT and not (self._text_pass and self._text_pass.atlas_version > 0):
                continue
            if kind == OpKind.TEX and not tex_desc:
                continue

            # Pipeline bind on kind transition
            if kind != last_kind:
                if kind == OpKind.FILL:
                    vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._fill_pipeline)
                    vk.vkCmdBindDescriptorSets(
                        cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._fill_pipeline_layout,
                        0, 1, [self._fill_desc_set], 0, None,
                    )
                    self._engine.push_constants(cmd, self._fill_pipeline_layout, fill_push)
                    vk.vkCmdBindVertexBuffers(cmd, 0, 1, [self._fill_vb], [0])
                    vk.vkCmdBindIndexBuffer(cmd, self._fill_ib, 0, vk.VK_INDEX_TYPE_UINT32)
                elif kind == OpKind.LINE:
                    vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._line_pipeline)
                    self._engine.push_constants(cmd, self._line_pipeline_layout, screen.tobytes())
                    vk.vkCmdBindVertexBuffers(cmd, 0, 1, [self._line_vb], [0])
                elif kind == OpKind.TEXT:
                    tp = self._text_pass
                    vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, tp.pipeline)
                    vk.vkCmdBindDescriptorSets(
                        cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, tp.pipeline_layout,
                        0, 1, [tp.descriptor_set], 0, None,
                    )
                    text_pc = np.array([uw, uh, tp.px_range], dtype=np.float32)
                    self._engine.push_constants(cmd, tp.pipeline_layout, text_pc.tobytes())
                    vk.vkCmdBindVertexBuffers(cmd, 0, 1, [self._text_vb], [0])
                    vk.vkCmdBindIndexBuffer(cmd, self._text_ib, 0, vk.VK_INDEX_TYPE_UINT32)
                elif kind == OpKind.TEX:
                    vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._tex_pipeline)
                    vk.vkCmdBindDescriptorSets(
                        cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._tex_pipeline_layout,
                        0, 1, [tex_desc], 0, None,
                    )
                    vk.vkCmdBindVertexBuffers(cmd, 0, 1, [self._tex_vb], [0])
                    vk.vkCmdBindIndexBuffer(cmd, self._tex_ib, 0, vk.VK_INDEX_TYPE_UINT32)
                # Viewport must be set after every pipeline bind (dynamic state)
                vk.vkCmdSetViewport(cmd, 0, 1, [vk_viewport])
                last_clip = sentinel  # force scissor refresh after pipeline bind
                last_kind = kind

            # Scissor on clip transition
            if clip != last_clip:
                if clip is not None:
                    scissor = vk.VkRect2D(
                        offset=vk.VkOffset2D(
                            x=int(clip[0] * clip_sx), y=int(clip[1] * clip_sy),
                        ),
                        extent=vk.VkExtent2D(
                            width=int(clip[2] * clip_sx), height=int(clip[3] * clip_sy),
                        ),
                    )
                else:
                    scissor = full_scissor
                vk.vkCmdSetScissor(cmd, 0, 1, [scissor])
                last_clip = clip

            # Per-texture push constants for TEX (tex_id changes within a TEX
            # pipeline binding because the run boundary already accounts for
            # tex_id transitions: each TEX draw has one tex_id).
            if kind == OpKind.TEX:
                tex_pc = np.array([uw, uh], dtype=np.float32).tobytes() \
                    + np.array([tex_id], dtype=np.int32).tobytes()
                self._engine.push_constants(cmd, self._tex_pipeline_layout, tex_pc)

            # Issue draw
            if kind == OpKind.LINE:
                vk.vkCmdDraw(cmd, vert_count, 1, vert_off, 0)
            else:
                vk.vkCmdDrawIndexed(cmd, idx_count, 1, idx_off, vert_off, 0)
            draw_count += 1

        self.last_frame_draw_count = draw_count

        # Draw2D._reset() is called at start of frame in app.py, before tree.render()


[docs]
    def cleanup(self) -> None:
        if not self._ready:
            return
        device = self._engine.ctx.device
        for obj, fn in [
            (self._fill_pipeline, vk.vkDestroyPipeline),
            (self._fill_pipeline_layout, vk.vkDestroyPipelineLayout),
            (self._line_pipeline, vk.vkDestroyPipeline),
            (self._line_pipeline_layout, vk.vkDestroyPipelineLayout),
            (self._tex_pipeline, vk.vkDestroyPipeline),
            (self._tex_pipeline_layout, vk.vkDestroyPipelineLayout),
            (self._vert_module, vk.vkDestroyShaderModule),
            (self._frag_module, vk.vkDestroyShaderModule),
            (self._line_frag_module, vk.vkDestroyShaderModule),
            (self._tex_frag_module, vk.vkDestroyShaderModule),
            (self._fill_vb, vk.vkDestroyBuffer),
            (self._fill_ib, vk.vkDestroyBuffer),
            (self._line_vb, vk.vkDestroyBuffer),
            (self._text_vb, vk.vkDestroyBuffer),
            (self._text_ib, vk.vkDestroyBuffer),
            (self._tex_vb, vk.vkDestroyBuffer),
            (self._tex_ib, vk.vkDestroyBuffer),
            (self._fill_desc_pool, vk.vkDestroyDescriptorPool),
            (self._fill_desc_layout, vk.vkDestroyDescriptorSetLayout),
        ]:
            if obj:
                fn(device, obj, None)
        for mem in [
            self._fill_vb_mem, self._fill_ib_mem, self._line_vb_mem,
            self._text_vb_mem, self._text_ib_mem,
            self._tex_vb_mem, self._tex_ib_mem,
        ]:
            if mem:
                vk.vkFreeMemory(device, mem, None)
        self._ready = False



@dataclass(frozen=True)
class _Draw2DPipelineSpec:
    """Immutable description of what distinguishes one Draw2D pipeline from another.

    ``_build_draw2d_pipeline`` fixes everything that is not a field here:
    the vertex layout (32-byte pos/uv/colour vertices), alpha blending, depth
    test and write disabled, no culling, dynamic viewport + scissor, and a
    single colour attachment. Changing one of those shared settings is an edit
    in that one function rather than in every pipeline factory.
    """
    name: str                      # label used in error and debug-log messages
    topology: int                  # VK_PRIMITIVE_TOPOLOGY_* for input assembly
    push_size: int                 # push-constant range in bytes; 0 = no range
    set_layouts: tuple = ()        # VkDescriptorSetLayout handles, in set order


def _build_draw2d_pipeline(
    device: Any,
    vert_module: Any,
    frag_module: Any,
    render_pass: Any,
    extent: tuple[int, int],
    spec: _Draw2DPipelineSpec,
) -> tuple[Any, Any]:
    """Create a Draw2D graphics pipeline and its pipeline layout from a *spec*.

    Holds the VkGraphicsPipelineCreateInfo plumbing shared by the fill, line
    and textured pipeline factories in this module. Only the values carried by
    ``_Draw2DPipelineSpec`` (topology, push-constant size, descriptor set
    layouts, name) differ between them.

    Args:
        device: Vulkan device handle.
        vert_module: Vertex shader module (entry point ``main``).
        frag_module: Fragment shader module (entry point ``main``).
        render_pass: Render pass the pipeline is compiled against.
        extent: ``(width, height)`` used for the create-time viewport and scissor.
        spec: Per-pipeline differences.

    Returns:
        ``(pipeline, pipeline_layout)``.

    Raises:
        RuntimeError: If layout or pipeline creation does not return
            ``VK_SUCCESS``. The layout is destroyed before raising when the
            pipeline itself fails, so nothing is leaked.
    """
    ffi = vk.ffi

    # NOTE: every ffi.new() object below must stay referenced by a local until
    # the Vulkan call that consumes it has returned; cffi frees the memory
    # as soon as the owning Python object is collected.

    # ---- Pipeline layout ------------------------------------------------
    layout_ci = ffi.new("VkPipelineLayoutCreateInfo*")
    layout_ci.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO

    set_layouts_arr = None
    if spec.set_layouts:
        set_layouts_arr = ffi.new(f"VkDescriptorSetLayout[{len(spec.set_layouts)}]",
                                  list(spec.set_layouts))
        layout_ci.setLayoutCount = len(spec.set_layouts)
        layout_ci.pSetLayouts = set_layouts_arr

    push_range = None
    if spec.push_size > 0:
        # One range starting at offset 0, visible to both shader stages.
        push_range = ffi.new("VkPushConstantRange*")
        push_range.stageFlags = vk.VK_SHADER_STAGE_VERTEX_BIT | vk.VK_SHADER_STAGE_FRAGMENT_BIT
        push_range.offset = 0
        push_range.size = spec.push_size
        layout_ci.pushConstantRangeCount = 1
        layout_ci.pPushConstantRanges = push_range

    layout_out = ffi.new("VkPipelineLayout*")
    result = vk._vulkan._callApi(
        vk._vulkan.lib.vkCreatePipelineLayout, device, layout_ci, ffi.NULL, layout_out,
    )
    if result != vk.VK_SUCCESS:
        raise RuntimeError(f"vkCreatePipelineLayout({spec.name}) failed: {result}")
    pipeline_layout = layout_out[0]

    # ---- Pipeline create info ------------------------------------------
    pi = ffi.new("VkGraphicsPipelineCreateInfo*")
    pi.sType = vk.VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO

    # Shader stages (vert + frag, both entry point "main")
    stages = ffi.new("VkPipelineShaderStageCreateInfo[2]")
    main_name = ffi.new("char[]", b"main")
    stages[0].sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO
    stages[0].stage = vk.VK_SHADER_STAGE_VERTEX_BIT
    stages[0].module = vert_module
    stages[0].pName = main_name
    stages[1].sType = vk.VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO
    stages[1].stage = vk.VK_SHADER_STAGE_FRAGMENT_BIT
    stages[1].module = frag_module
    stages[1].pName = main_name
    pi.stageCount = 2
    pi.pStages = stages

    # Vertex input: pos(vec2) + uv(vec2) + colour(vec4) = 32 bytes
    binding_desc = ffi.new("VkVertexInputBindingDescription*")
    binding_desc.binding = 0
    binding_desc.stride = VERTEX_STRIDE
    binding_desc.inputRate = vk.VK_VERTEX_INPUT_RATE_VERTEX
    attr_descs = ffi.new("VkVertexInputAttributeDescription[3]")
    # (shader location, format, byte offset inside one vertex)
    for i, (loc, fmt, off) in enumerate((
        (0, vk.VK_FORMAT_R32G32_SFLOAT, 0),
        (1, vk.VK_FORMAT_R32G32_SFLOAT, 8),
        (2, vk.VK_FORMAT_R32G32B32A32_SFLOAT, 16),
    )):
        attr_descs[i].location = loc
        attr_descs[i].binding = 0
        attr_descs[i].format = fmt
        attr_descs[i].offset = off
    vi = ffi.new("VkPipelineVertexInputStateCreateInfo*")
    vi.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO
    vi.vertexBindingDescriptionCount = 1
    vi.pVertexBindingDescriptions = binding_desc
    vi.vertexAttributeDescriptionCount = 3
    vi.pVertexAttributeDescriptions = attr_descs
    pi.pVertexInputState = vi

    # Input assembly: the only fixed-function stage that varies per spec
    ia = ffi.new("VkPipelineInputAssemblyStateCreateInfo*")
    ia.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO
    ia.topology = spec.topology
    pi.pInputAssemblyState = ia

    # Viewport (dynamic, but a dummy is still required at create time)
    vps = ffi.new("VkPipelineViewportStateCreateInfo*")
    vps.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO
    vps.viewportCount = 1
    viewport = ffi.new("VkViewport*")
    viewport.width = float(extent[0])
    viewport.height = float(extent[1])
    viewport.maxDepth = 1.0
    vps.pViewports = viewport
    scissor = ffi.new("VkRect2D*")
    scissor.extent.width = extent[0]
    scissor.extent.height = extent[1]
    vps.scissorCount = 1
    vps.pScissors = scissor
    pi.pViewportState = vps

    # Rasterisation: fill, no cull, lineWidth fixed at 1 (thicker lines
    # ride the FILL pipeline via Draw2D._emit_rect_outline / draw_line).
    rs = ffi.new("VkPipelineRasterizationStateCreateInfo*")
    rs.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO
    rs.polygonMode = vk.VK_POLYGON_MODE_FILL
    rs.lineWidth = 1.0
    rs.cullMode = vk.VK_CULL_MODE_NONE
    pi.pRasterizationState = rs

    # Single-sample rendering
    ms = ffi.new("VkPipelineMultisampleStateCreateInfo*")
    ms.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO
    ms.rasterizationSamples = vk.VK_SAMPLE_COUNT_1_BIT
    pi.pMultisampleState = ms

    # Depth test and depth write both disabled
    dss = ffi.new("VkPipelineDepthStencilStateCreateInfo*")
    dss.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO
    dss.depthTestEnable = 0
    dss.depthWriteEnable = 0
    pi.pDepthStencilState = dss

    # Alpha blend.
    #   colour: src * src_alpha + dst * (1 - src_alpha)
    #   alpha:  src * 1         + dst * (1 - src_alpha)
    cba = ffi.new("VkPipelineColorBlendAttachmentState*")
    cba.blendEnable = 1
    cba.srcColorBlendFactor = vk.VK_BLEND_FACTOR_SRC_ALPHA
    cba.dstColorBlendFactor = vk.VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA
    cba.colorBlendOp = vk.VK_BLEND_OP_ADD
    cba.srcAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE
    cba.dstAlphaBlendFactor = vk.VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA
    cba.alphaBlendOp = vk.VK_BLEND_OP_ADD
    cba.colorWriteMask = (
        vk.VK_COLOR_COMPONENT_R_BIT | vk.VK_COLOR_COMPONENT_G_BIT
        | vk.VK_COLOR_COMPONENT_B_BIT | vk.VK_COLOR_COMPONENT_A_BIT
    )
    cb = ffi.new("VkPipelineColorBlendStateCreateInfo*")
    cb.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO
    cb.attachmentCount = 1
    cb.pAttachments = cba
    pi.pColorBlendState = cb

    # Dynamic state: viewport and scissor are supplied at record time
    dyn_states = ffi.new("VkDynamicState[2]", [
        vk.VK_DYNAMIC_STATE_VIEWPORT, vk.VK_DYNAMIC_STATE_SCISSOR,
    ])
    ds = ffi.new("VkPipelineDynamicStateCreateInfo*")
    ds.sType = vk.VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO
    ds.dynamicStateCount = 2
    ds.pDynamicStates = dyn_states
    pi.pDynamicState = ds

    pi.layout = pipeline_layout
    pi.renderPass = render_pass

    pipeline_out = ffi.new("VkPipeline*")
    result = vk._vulkan._callApi(
        vk._vulkan.lib.vkCreateGraphicsPipelines,
        device, ffi.NULL, 1, pi, ffi.NULL, pipeline_out,
    )
    if result != vk.VK_SUCCESS:
        # The layout was created above and the caller never receives it on
        # this path, so release it here instead of leaking it.
        vk.vkDestroyPipelineLayout(device, pipeline_layout, None)
        raise RuntimeError(f"vkCreateGraphicsPipelines({spec.name}) failed: {result}")

    log.debug("Draw2D %s pipeline created", spec.name)
    return pipeline_out[0], pipeline_layout


def _create_line2d_pipeline(device, vert_module, frag_module, render_pass, extent):
    """Build the line pipeline: LINE_LIST topology, no descriptor sets, 8-byte push.

    The push block is the vec2 screen size. Returns ``(pipeline, pipeline_layout)``.
    """
    line_spec = _Draw2DPipelineSpec(
        name="line",
        topology=vk.VK_PRIMITIVE_TOPOLOGY_LINE_LIST,
        push_size=8,
    )
    return _build_draw2d_pipeline(
        device, vert_module, frag_module, render_pass, extent, line_spec,
    )


def _create_textured_ui_pipeline(
    device, vert_module, frag_module, render_pass, extent, tex_descriptor_layout,
):
    """Build the textured-quad pipeline: TRIANGLE_LIST with the texture set at set 0.

    The 12-byte push block is vec2 screen_size followed by an int texture_id.
    Returns ``(pipeline, pipeline_layout)``.
    """
    textured_spec = _Draw2DPipelineSpec(
        name="textured",
        topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
        push_size=12,  # 8 bytes of screen size + 4 bytes of texture id
        set_layouts=(tex_descriptor_layout,),
    )
    return _build_draw2d_pipeline(
        device, vert_module, frag_module, render_pass, extent, textured_spec,
    )

def _create_fill_descriptor_layout(device: Any) -> Any:
    """Create the fill pipeline's set-0 layout.

    The layout has one binding: a combined image sampler at binding 0,
    visible to the fragment stage only (the Light2DPass accumulation RT).
    """
    sampler_binding = vk.VkDescriptorSetLayoutBinding(
        binding=0,
        descriptorType=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        descriptorCount=1,
        stageFlags=vk.VK_SHADER_STAGE_FRAGMENT_BIT,
    )
    create_info = vk.VkDescriptorSetLayoutCreateInfo(
        bindingCount=1, pBindings=[sampler_binding],
    )
    return vk.vkCreateDescriptorSetLayout(device, create_info, None)

def _write_fill_descriptor(device: Any, desc_set: Any, view: Any, sampler: Any) -> None:
    """Bind ``sampler`` + ``view`` to binding 0 of the fill descriptor set.

    Both handles must be non-null. The image is declared to be in
    SHADER_READ_ONLY_OPTIMAL layout when sampled.
    """
    light_image = vk.VkDescriptorImageInfo(
        sampler=sampler,
        imageView=view,
        imageLayout=vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    )
    write = vk.VkWriteDescriptorSet(
        dstSet=desc_set,
        dstBinding=0,
        dstArrayElement=0,
        descriptorCount=1,
        descriptorType=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        pImageInfo=[light_image],
    )
    vk.vkUpdateDescriptorSets(device, 1, [write], 0, None)

def _build_fill_push(
    screen_w: float, screen_h: float,
    ambient: tuple[float, float, float, float], has_lights: int,
) -> bytes:
    """Serialise the push-constant block consumed by draw2d_fill.frag.

    Byte layout (48 bytes for a 4-component ``ambient``), mirroring Draw2DPush:
      float32 x2  screen_size
      float32 x2  padding (zero)
      float32 x4  ambient
      int32   x4  flags: (has_lights, 0, 0, 0)
    """
    float_words = (screen_w, screen_h, 0.0, 0.0) + tuple(ambient)
    int_words = (has_lights, 0, 0, 0)
    float_part = np.asarray(float_words, dtype=np.float32).tobytes()
    int_part = np.asarray(int_words, dtype=np.int32).tobytes()
    return b"".join((float_part, int_part))

def _create_fill_pipeline(device, vert_module, frag_module, render_pass, extent, desc_layout):
    """Build the fill pipeline: TRIANGLE_LIST with the light-accumulation sampler at set 0.

    The push-constant range is ``FILL_PUSH_SIZE`` bytes (the Draw2DPush block).
    Returns ``(pipeline, pipeline_layout)``.
    """
    fill_spec = _Draw2DPipelineSpec(
        name="fill",
        topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
        push_size=FILL_PUSH_SIZE,
        set_layouts=(desc_layout,),
    )
    return _build_draw2d_pipeline(
        device, vert_module, frag_module, render_pass, extent, fill_spec,
    )