Source code for simvx.graphics.renderer.taa_pass

"""Temporal anti-aliasing resolve pass (desktop Vulkan).

Full-screen HDR pass that runs AFTER the forward pass (and volumetric fog) and
BEFORE tonemap. Each frame it:

  * samples the current (jittered) HDR colour + depth,
  * reconstructs the per-pixel CAMERA motion vector from depth + the previous
    view-projection (depth -> world -> prev-clip -> prev_uv), covering camera +
    static-geometry motion,
  * reprojects the previous resolved frame (history) through that motion vector,
  * applies a 3x3 YCoCg neighborhood AABB clamp to the reprojected history
    (Karis/Lottes), falling back to the current colour where the reprojection
    lands offscreen (disocclusion),
  * blends current vs clamped-history (~0.9 history weight),
  * writes the resolved HDR into the fixed ``_output`` target, then copies it
    into the ``_history`` target so the next frame can reproject it.

The renderer re-points the tonemap HDR input at this pass's resolved output (the
same descriptor-swap mechanism volumetric fog uses) ONCE, when TAA is toggled.
Two persistent R16G16B16A16_SFLOAT targets: a fixed ``_output`` target the
tonemap always samples (so its descriptor never flips), and a ``_history``
target sampled as last frame's resolved result. After each resolve the output is
blitted into the history target for next frame -- this keeps the tonemap binding
stable (no per-frame descriptor rewrite / device-wait stall) while still
accumulating temporally.

Matches the web ``taa.wgsl`` resolve math (YCoCg clamp + blend) so both backends
behave alike; the velocity here is reconstructed from depth rather than sampled
from a precomputed velocity texture.

SCOPE: camera + static-geometry motion only. Skinned meshes / GPU particles
have no prev-frame joint/particle state at this boundary, so they reproject as
static (mild self-motion ghosting; camera motion still resolves). Per-object
velocity (MRT + prev-transform SSBO + prev-joint buffer) is a flagged follow-up.
"""

import logging
from typing import Any

import numpy as np
import vulkan as vk

from ..gpu.descriptors import (
    DescriptorWriteBatch,
    allocate_descriptor_set,
    create_descriptor_set_layout,
)
from ..gpu.memory import create_buffer, create_sampler, upload_image_data, upload_numpy
from ..gpu.pipeline import PipelineSpec, build_pipeline, create_shader_module
from ..materials.shader_compiler import compile_shader
from .render_target import RenderTarget

# Public API of this module: only the pass class is exported.
__all__ = ["TAAPass"]

# Module-level logger (named after this module).
log = logging.getLogger(__name__)

# Uniform layout (see taa.frag ``Uniforms``):
#   mat4 inv_vp(64) + mat4 prev_vp(64) + vec4 params(16) + vec4 params2(16) = 160.
# params  = (blend_factor, feedback_only, viewport_inv.x, viewport_inv.y)
# params2 = (has_velocity, reserved, reserved, reserved)
# Size is in bytes; the upload path packs it as ``_UBO_SIZE // 4`` float32 values.
_UBO_SIZE = 160

# RG16F per-object velocity buffer format (must match VelocityPass). The sentinel
# clear value (< -1.5) marks "no per-object velocity here -> depth fallback".
# The sentinel is also the fill value of the 1x1 dummy velocity texture that is
# bound whenever no real velocity view is available.
_VELOCITY_FORMAT = vk.VK_FORMAT_R16G16_SFLOAT
_VELOCITY_SENTINEL = -2.0

# History blend weight: fraction taken from clamped history each frame. 0.9 is
# the classic TAA accumulation weight (matches the web default). Used as the
# initial value of ``TAAPass.blend_factor``.
_DEFAULT_BLEND = 0.9



[docs]
class TAAPass:
    """Temporal AA resolve over the post-forward HDR target."""

    def __init__(self, engine: Any) -> None:
        """Remember the engine and reset all state; GPU objects come from :meth:`setup`.

        Nothing is allocated here: every Vulkan handle starts as ``None`` and
        the pass reports itself as not ready and disabled.
        """
        self._engine = engine

        # Lifecycle flags: ``_ready`` flips in setup()/cleanup(); ``enabled`` is
        # the user-facing on/off toggle checked by render().
        self._ready = False
        self.enabled = False

        # Tunables. ``blend_factor`` is the history feedback weight (driven from
        # WorldEnvironment only via the enable toggle, but exposed for tuning).
        # ``feedback_only`` makes the resolve pass the current frame through
        # unchanged (e.g. for a paused scene); off by default.
        self.blend_factor = _DEFAULT_BLEND
        self.feedback_only = False

        # Target dimensions and colour format; real values arrive in setup().
        self._width = self._height = 0
        self._colour_format = vk.VK_FORMAT_R16G16B16A16_SFLOAT

        # Matrices supplied by the renderer ahead of each render() call.
        self._inv_vp = np.eye(4, dtype=np.float32)
        self._prev_vp = np.eye(4, dtype=np.float32)

        # Persistent HDR targets with fixed roles:
        #   _output  -- the frame's resolved result; the tonemap always samples
        #               this view so its descriptor never changes per frame.
        #   _history -- last frame's resolved result (binding 1). _output is
        #               copied into it after every resolve, which keeps the
        #               tonemap binding stable while still accumulating.
        # ``_has_history`` remains False until one resolve has filled history,
        # so the very first frame does not blend against stale data.
        self._output: RenderTarget | None = None
        self._history: RenderTarget | None = None
        self._has_history = False

        # Inputs sampled by the resolve: current-frame HDR colour (binding 0 --
        # the fog output when fog is active, otherwise the raw HDR colour) and
        # scene depth (binding 2). Set by the renderer.
        self._cur_colour_view: Any = None
        self._depth_view: Any = None

        # Per-object velocity (binding 4). Until the renderer supplies the
        # VelocityPass RG16F view, a 1x1 sentinel-filled dummy is bound so the
        # shader sees "no per-object velocity" and takes the depth-based camera
        # fallback. ``_has_velocity`` mirrors whether a real view is bound.
        self._velocity_view: Any = None
        self._has_velocity = False
        self._dummy_vel_image: Any = None
        self._dummy_vel_mem: Any = None
        self._dummy_vel_view: Any = None

        # Samplers and the uniform buffer with its backing memory.
        self._sampler = self._depth_sampler = None
        self._ubo_buf = self._ubo_mem = None

        # Descriptor objects.
        self._desc_layout = self._desc_pool = self._desc_set = None

        # Pipeline objects and the shader modules they were built from.
        self._pipeline = self._pipeline_layout = None
        self._vert_module = self._frag_module = None


[docs]
    @property
    def output_view(self) -> Any:
        """Stable colour view of the resolved HDR result (tonemap input)."""
        return self._output.colour_view if self._output else None



[docs]
    def setup(self, width: int, height: int, cur_colour_view: Any, depth_view: Any,
              colour_format: int) -> None:
        """Create every GPU object the pass needs and mark it ready.

        Builds the output and history targets, both samplers, the dummy velocity
        texture, the uniform buffer, the descriptor layout/pool/set and the
        resolve pipeline.

        ``cur_colour_view`` is the post-forward HDR colour sampled at binding 0
        and ``depth_view`` the scene depth at binding 2; the renderer can
        re-point both later through :meth:`set_inputs`.
        """
        engine = self._engine
        ctx = engine.ctx
        device = ctx.device

        self._width = width
        self._height = height
        self._colour_format = colour_format
        self._cur_colour_view = cur_colour_view
        self._depth_view = depth_view

        # Resolve destination plus the copy kept for next frame's reprojection.
        self._output = self._make_target(width, height, colour_format)
        self._history = self._make_target(width, height, colour_format)

        # Default sampler for colour/velocity, nearest-filtered sampler for depth.
        self._sampler = create_sampler(device)
        self._depth_sampler = create_sampler(device, filter_mode=vk.VK_FILTER_NEAREST)

        # Binding 4 starts on the sentinel dummy until a real view is supplied.
        self._make_dummy_velocity(device)
        self._velocity_view = self._dummy_vel_view
        self._has_velocity = False

        host_memory = (vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT
                       | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
        self._ubo_buf, self._ubo_mem = create_buffer(
            device, ctx.physical_device, _UBO_SIZE,
            vk.VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, host_memory,
        )

        frag_stage = vk.VK_SHADER_STAGE_FRAGMENT_BIT
        image_sampler = vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER
        uniform_buffer = vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
        bindings = [
            (0, image_sampler, frag_stage, 1),   # current HDR
            (1, image_sampler, frag_stage, 1),   # history
            (2, image_sampler, frag_stage, 1),   # depth
            (3, uniform_buffer, frag_stage, 1),  # uniforms
            (4, image_sampler, frag_stage, 1),   # velocity
        ]
        self._desc_layout = create_descriptor_set_layout(device, bindings)

        # A pool sized for exactly one set: four image samplers + one UBO.
        pool_sizes = [
            vk.VkDescriptorPoolSize(type=image_sampler, descriptorCount=4),
            vk.VkDescriptorPoolSize(type=uniform_buffer, descriptorCount=1),
        ]
        pool_info = vk.VkDescriptorPoolCreateInfo(
            maxSets=1, poolSizeCount=len(pool_sizes), pPoolSizes=pool_sizes,
        )
        self._desc_pool = vk.vkCreateDescriptorPool(device, pool_info, None)
        self._desc_set = allocate_descriptor_set(device, self._desc_pool, self._desc_layout)
        self._write_descriptors()

        # Shaders are compiled at runtime from the engine's shader directory.
        shader_dir = engine.shader_dir
        vert_code = compile_shader(shader_dir / "taa.vert")
        self._vert_module = create_shader_module(device, vert_code)
        frag_code = compile_shader(shader_dir / "taa.frag")
        self._frag_module = create_shader_module(device, frag_code)
        self._create_pipeline(device, self._output.render_pass, (width, height))

        self._ready = True
        # Nothing has been resolved yet, so there is no usable history.
        self._has_history = False
        log.debug("TAA pass initialised (%dx%d)", width, height)


    def _make_target(self, width: int, height: int, colour_format: int) -> RenderTarget:
        """Build a colour-only (no depth) render target of the given size and format."""
        ctx = self._engine.ctx
        target = RenderTarget(
            ctx.device,
            ctx.physical_device,
            width,
            height,
            colour_format=colour_format,
            use_depth=False,
            queue=ctx.graphics_queue,
            command_pool=ctx.command_pool,
        )
        return target

    def _make_dummy_velocity(self, device: Any) -> None:
        """Build the 1x1 RG16F fallback texture holding the velocity sentinel.

        Binding 4 always needs a valid view, including before the velocity pass
        exists and on frames with no moving meshes. Both channels of the single
        texel hold the sentinel (< -1.5), which the shader treats as "use the
        depth-based camera reprojection". Left in SHADER_READ_ONLY by
        ``upload_image_data``.
        """
        ctx = self._engine.ctx
        # One texel, two half-float channels, both set to the sentinel.
        texel = np.full((1, 1, 2), _VELOCITY_SENTINEL, dtype=np.float16)
        image, memory = upload_image_data(
            device, ctx.physical_device, ctx.graphics_queue, ctx.command_pool,
            np.ascontiguousarray(texel), 1, 1, fmt=_VELOCITY_FORMAT,
        )
        self._dummy_vel_image = image
        self._dummy_vel_mem = memory

        whole_colour_image = vk.VkImageSubresourceRange(
            aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
            baseMipLevel=0, levelCount=1,
            baseArrayLayer=0, layerCount=1,
        )
        view_info = vk.VkImageViewCreateInfo(
            image=self._dummy_vel_image,
            viewType=vk.VK_IMAGE_VIEW_TYPE_2D,
            format=_VELOCITY_FORMAT,
            subresourceRange=whole_colour_image,
        )
        self._dummy_vel_view = vk.vkCreateImageView(device, view_info, None)

    def _write_descriptors(self) -> None:
        """(Re)write all five bindings of the resolve descriptor set in one batch.

        Bindings: 0 current (post-forward+fog) HDR colour, 1 history (last
        frame's resolved result), 2 depth, 3 uniform buffer, 4 per-object
        velocity (the VelocityPass RG16F view, or the sentinel dummy when none
        is set).
        """
        device = self._engine.ctx.device
        with DescriptorWriteBatch(device) as writes:
            target_set = self._desc_set
            colour_sampler = self._sampler
            writes.image(target_set, 0, self._cur_colour_view, colour_sampler)
            writes.image(target_set, 1, self._history.colour_view, colour_sampler)
            # Depth is sampled through its own sampler in the read-only depth layout.
            writes.image(
                target_set, 2, self._depth_view, self._depth_sampler,
                image_layout=vk.VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL,
            )
            writes.uniform_buffer(target_set, 3, self._ubo_buf, _UBO_SIZE)
            velocity = self._velocity_view or self._dummy_vel_view
            writes.image(target_set, 4, velocity, colour_sampler)


[docs]
    def set_inputs(self, cur_colour_view: Any, depth_view: Any) -> None:
        """Re-point binding 0 (current HDR) + 2 (depth) at new views.

        Called when the current-frame HDR source changes (fog toggled on/off, so
        the resolve samples the fog output instead of the raw HDR colour) or on
        resize. Cheap: a single batched descriptor update.
        """
        if not self._ready:
            return
        if cur_colour_view is self._cur_colour_view and depth_view is self._depth_view:
            return
        self._cur_colour_view = cur_colour_view
        self._depth_view = depth_view
        self._write_descriptors()



[docs]
    def set_velocity_view(self, velocity_view: Any) -> None:
        """Point binding 4 at the per-object velocity buffer (or clear it).

        Called each TAA frame by the renderer with ``VelocityPass.velocity_view``
        (the RG16F per-object motion target). Passing ``None`` reverts to the
        sentinel dummy (depth fallback everywhere). Only rewrites the descriptor
        when the view actually changes, so the steady state is a flag-only update.
        """
        if not self._ready:
            return
        target = velocity_view or self._dummy_vel_view
        self._has_velocity = velocity_view is not None
        if target is self._velocity_view:
            return
        self._velocity_view = target
        self._write_descriptors()



[docs]
    def set_frame_matrices(self, inv_vp: np.ndarray, prev_vp: np.ndarray) -> None:
        """Stash the current inverse VP + previous VP for the next ``render``."""
        self._inv_vp = np.asarray(inv_vp, dtype=np.float32)
        self._prev_vp = np.asarray(prev_vp, dtype=np.float32)


    def _upload_uniforms(self) -> None:
        """Pack the uniform block as float32 values and upload it to the UBO memory.

        Layout in floats: [0:16] inv_vp, [16:32] prev_vp, [32:36] params =
        (blend_factor, feedback_only, 1/width, 1/height), [36] has_velocity; the
        remaining slots stay zero.
        """
        block = np.zeros(_UBO_SIZE // 4, dtype=np.float32)

        # Numpy matrices are row-major and GLSL expects column-major, so both
        # matrices are flattened column by column.
        block[:16] = self._inv_vp.ravel(order="F")
        block[16:32] = self._prev_vp.ravel(order="F")

        # With no valid history yet (first frame) the pass-through flag is
        # forced on, so the resolve emits the current frame instead of smearing
        # stale data.
        if self.feedback_only or not self._has_history:
            passthrough = 1.0
        else:
            passthrough = 0.0

        # Reciprocal viewport size, guarded against a zero dimension.
        texel_w = 1.0 / float(self._width) if self._width else 0.0
        texel_h = 1.0 / float(self._height) if self._height else 0.0
        block[32:36] = (float(self.blend_factor), passthrough, texel_w, texel_h)

        # params2.x = has_velocity: 1 when a real per-object velocity view is
        # bound, so the shader samples it (and falls back per-pixel on sentinels).
        block[36] = 1.0 if self._has_velocity else 0.0

        upload_numpy(self._engine.ctx.device, self._ubo_mem, block)

    # -- record --------------------------------------------------------------


[docs]
    def render(self, cmd: Any) -> None:
        """Resolve TAA into the fixed output target, then blit it to history.

        No-op when off. The output target's view is stable (tonemap always reads
        it), so this pass owns no per-frame descriptor rewrite. After resolve the
        output is copied into the history target so next frame can reproject it.
        """
        if not self._ready or not self.enabled or not self._pipeline:
            return
        self._upload_uniforms()

        rt = self._output
        rp_begin = vk.VkRenderPassBeginInfo(
            renderPass=rt.render_pass, framebuffer=rt.framebuffer,
            renderArea=vk.VkRect2D(offset=vk.VkOffset2D(x=0, y=0),
                                   extent=vk.VkExtent2D(width=rt.width, height=rt.height)),
            clearValueCount=1,
            pClearValues=[vk.VkClearValue(color=vk.VkClearColorValue(float32=[0.0, 0.0, 0.0, 1.0]))],
        )
        vk.vkCmdBeginRenderPass(cmd, rp_begin, vk.VK_SUBPASS_CONTENTS_INLINE)
        vk.vkCmdSetViewport(cmd, 0, 1, [vk.VkViewport(
            x=0.0, y=0.0, width=float(rt.width), height=float(rt.height), minDepth=0.0, maxDepth=1.0)])
        vk.vkCmdSetScissor(cmd, 0, 1, [vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0), extent=vk.VkExtent2D(width=rt.width, height=rt.height))])
        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline)
        vk.vkCmdBindDescriptorSets(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS,
                                   self._pipeline_layout, 0, 1, [self._desc_set], 0, None)
        vk.vkCmdDraw(cmd, 3, 1, 0, 0)
        vk.vkCmdEndRenderPass(cmd)

        # Copy the resolved output into the history target so next frame's
        # reprojection samples it. Both images leave the render pass / start in
        # SHADER_READ_ONLY_OPTIMAL; transition to transfer layouts, copy, restore.
        self._blit_output_to_history(cmd)

        # History is valid from the next frame onward.
        self._has_history = True


    def _blit_output_to_history(self, cmd: Any) -> None:
        """Record a full-image copy from the output image to the history image.

        Both images are taken from SHADER_READ_ONLY_OPTIMAL to the matching
        transfer layout, copied (mip 0, layer 0, full ``_width`` x ``_height``
        extent) and returned to SHADER_READ_ONLY_OPTIMAL for sampling.
        """
        resolved = self._output.colour_image
        history = self._history.colour_image
        width, height = self._width, self._height

        sampled = vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
        copy_src = vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
        copy_dst = vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
        shader_read = vk.VK_ACCESS_SHADER_READ_BIT
        transfer_read = vk.VK_ACCESS_TRANSFER_READ_BIT
        transfer_write = vk.VK_ACCESS_TRANSFER_WRITE_BIT
        fragment_stage = vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT
        transfer_stage = vk.VK_PIPELINE_STAGE_TRANSFER_BIT

        def transition(image, old_layout, new_layout, access_before, access_after):
            # Layout change for colour mip 0 / layer 0, no queue ownership transfer.
            whole_image = vk.VkImageSubresourceRange(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
                baseMipLevel=0, levelCount=1, baseArrayLayer=0, layerCount=1)
            return vk.VkImageMemoryBarrier(
                srcAccessMask=access_before, dstAccessMask=access_after,
                oldLayout=old_layout, newLayout=new_layout,
                srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
                dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
                image=image,
                subresourceRange=whole_image,
            )

        def colour_layer():
            return vk.VkImageSubresourceLayers(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0,
                baseArrayLayer=0, layerCount=1)

        def origin():
            return vk.VkOffset3D(x=0, y=0, z=0)

        # 1) Sampled layouts -> transfer layouts.
        to_transfer = [
            transition(resolved, sampled, copy_src, shader_read, transfer_read),
            transition(history, sampled, copy_dst, shader_read, transfer_write),
        ]
        vk.vkCmdPipelineBarrier(
            cmd, fragment_stage, transfer_stage, 0,
            0, None, 0, None, 2, to_transfer)

        # 2) Whole-image copy, output -> history.
        copy_region = vk.VkImageCopy(
            srcSubresource=colour_layer(),
            srcOffset=origin(),
            dstSubresource=colour_layer(),
            dstOffset=origin(),
            extent=vk.VkExtent3D(width=width, height=height, depth=1),
        )
        vk.vkCmdCopyImage(cmd, resolved, copy_src, history, copy_dst, 1, [copy_region])

        # 3) Back to sampled layouts: the tonemap reads the output and next
        #    frame's resolve reads the history.
        to_sampled = [
            transition(resolved, copy_src, sampled, transfer_read, shader_read),
            transition(history, copy_dst, sampled, transfer_write, shader_read),
        ]
        vk.vkCmdPipelineBarrier(
            cmd, transfer_stage, fragment_stage, 0,
            0, None, 0, None, 2, to_sampled)

    # -- pipeline ------------------------------------------------------------

    def _create_pipeline(self, device: Any, render_pass: Any, extent: tuple[int, int]) -> None:
        """Build the resolve pipeline and store it with its layout.

        The fixed-function state is described by a :class:`PipelineSpec` and
        handed to :func:`build_pipeline`, which does the actual Vulkan struct
        work. The spec asks for a triangle list with zero vertex stride (the
        fullscreen triangle comes from the shader), no culling, no depth
        test/write, opaque blending and this pass's single descriptor set
        layout. The shader modules already exist (compiled at runtime), so they
        are passed in directly instead of as SPIR-V paths in the spec.
        """
        resolve_spec = PipelineSpec(
            name="TAA",
            topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
            vertex_stride=0,  # no vertex buffer: geometry is shader-generated
            cull_mode=vk.VK_CULL_MODE_NONE,
            depth_test=False,
            depth_write=False,
            blend="opaque",
            set_layouts=(self._desc_layout,),
        )
        pipeline, layout = build_pipeline(
            device,
            resolve_spec,
            render_pass,
            extent,
            vert_module=self._vert_module,
            frag_module=self._frag_module,
        )
        self._pipeline = pipeline
        self._pipeline_layout = layout

    # -- resize / cleanup ----------------------------------------------------


[docs]
    def resize(self, width: int, height: int, cur_colour_view: Any, depth_view: Any,
               colour_format: int) -> None:
        if not self._ready:
            return
        self._width, self._height = width, height
        self._colour_format = colour_format
        self._cur_colour_view = cur_colour_view
        self._depth_view = depth_view
        if self._output:
            self._output.destroy()
        if self._history:
            self._history.destroy()
        self._output = self._make_target(width, height, colour_format)
        self._history = self._make_target(width, height, colour_format)
        # History is gone after a resize: start fresh so we don't reproject from
        # a stale-resolution buffer.
        self._has_history = False
        # The velocity pass resizes too (its view changes): revert to the dummy
        # until the renderer re-binds the new velocity view next TAA frame.
        self._velocity_view = self._dummy_vel_view
        self._has_velocity = False
        self._write_descriptors()



[docs]
    def cleanup(self) -> None:
        if not self._ready:
            return
        device = self._engine.ctx.device
        if self._pipeline:
            vk.vkDestroyPipeline(device, self._pipeline, None)
        if self._pipeline_layout:
            vk.vkDestroyPipelineLayout(device, self._pipeline_layout, None)
        if self._vert_module:
            vk.vkDestroyShaderModule(device, self._vert_module, None)
        if self._frag_module:
            vk.vkDestroyShaderModule(device, self._frag_module, None)
        if self._desc_pool:
            vk.vkDestroyDescriptorPool(device, self._desc_pool, None)
        if self._desc_layout:
            vk.vkDestroyDescriptorSetLayout(device, self._desc_layout, None)
        if self._sampler:
            vk.vkDestroySampler(device, self._sampler, None)
        if self._depth_sampler:
            vk.vkDestroySampler(device, self._depth_sampler, None)
        if self._dummy_vel_view:
            vk.vkDestroyImageView(device, self._dummy_vel_view, None)
        if self._dummy_vel_image:
            vk.vkDestroyImage(device, self._dummy_vel_image, None)
        if self._dummy_vel_mem:
            vk.vkFreeMemory(device, self._dummy_vel_mem, None)
        if self._ubo_buf:
            vk.vkDestroyBuffer(device, self._ubo_buf, None)
        if self._ubo_mem:
            vk.vkFreeMemory(device, self._ubo_mem, None)
        if self._output:
            self._output.destroy()
        if self._history:
            self._history.destroy()
        self._output = self._history = None
        self._ready = False