# Source code for simvx.graphics.renderer.taa_pass

"""Temporal anti-aliasing resolve pass (desktop Vulkan).

Full-screen HDR pass that runs AFTER the forward pass (and volumetric fog) and
BEFORE tonemap. Each frame it:

  * samples the current (jittered) HDR colour + depth,
  * reconstructs the per-pixel CAMERA motion vector from depth + the previous
    view-projection (depth -> world -> prev-clip -> prev_uv), covering camera +
    static-geometry motion,
  * reprojects the previous resolved frame (history) through that motion vector,
  * applies a 3x3 YCoCg neighborhood AABB clamp to the reprojected history
    (Karis/Lottes), falling back to the current colour where the reprojection
    lands offscreen (disocclusion),
  * blends current vs clamped-history (~0.9 history weight),
  * writes the resolved HDR into the fixed ``_output`` target, which is then
    copied into the ``_history`` target for the next frame.

The renderer re-points the tonemap HDR input at this pass's resolved output (the
same descriptor-swap mechanism volumetric fog uses) ONCE, when TAA is toggled.
Two persistent R16G16B16A16_SFLOAT targets: a fixed ``_output`` target the
tonemap always samples (so its descriptor never flips), and a ``_history``
target sampled as last frame's resolved result. After each resolve the output is
blitted into the history target for next frame -- this keeps the tonemap binding
stable (no per-frame descriptor rewrite / device-wait stall) while still
accumulating temporally.

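Matches the web ``taa.wgsl`` resolve math (YCoCg clamp + blend) so both backends
behave alike. The velocity is reconstructed from depth by default; when the
renderer binds a per-object velocity view (binding 4) the shader samples that
instead and falls back to the depth path wherever it reads the sentinel.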

SCOPE: without a bound per-object velocity view, camera + static-geometry motion
only. Skinned meshes / GPU particles have no prev-frame joint/particle state at
this boundary, so they reproject as static (mild self-motion ghosting; camera
motion still resolves). Producing per-object velocity (MRT + prev-transform SSBO
+ prev-joint buffer) is a flagged follow-up; the consuming side (binding 4, see
``set_velocity_view``) is already wired in this pass.
"""

import logging
from typing import Any

import numpy as np
import vulkan as vk

from ..gpu.descriptors import (
    DescriptorWriteBatch,
    allocate_descriptor_set,
    create_descriptor_set_layout,
)
from ..gpu.memory import create_buffer, create_sampler, upload_image_data, upload_numpy
from ..gpu.pipeline import PipelineSpec, build_pipeline, create_shader_module
from ..materials.shader_compiler import compile_shader
from .render_target import RenderTarget

# Public API of this module: only the pass class is exported.
__all__ = ["TAAPass"]

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Uniform layout (see taa.frag ``Uniforms``):
#   mat4 inv_vp(64) + mat4 prev_vp(64) + vec4 params(16) + vec4 params2(16) = 160.
# params  = (blend_factor, feedback_only, viewport_inv.x, viewport_inv.y)
# params2 = (has_velocity, reserved, reserved, reserved)
# The size is in bytes; the CPU-side staging array built for upload holds
# ``_UBO_SIZE // 4`` float32 values.
_UBO_SIZE = 160

# RG16F per-object velocity buffer format (must match VelocityPass). The sentinel
# clear value (< -1.5) marks "no per-object velocity here -> depth fallback".
# The same format and sentinel are used to build the 1x1 dummy velocity texture
# that is bound whenever no real velocity view is available.
_VELOCITY_FORMAT = vk.VK_FORMAT_R16G16_SFLOAT
_VELOCITY_SENTINEL = -2.0

# History blend weight: fraction taken from clamped history each frame. 0.9 is
# the classic TAA accumulation weight (matches the web default).
_DEFAULT_BLEND = 0.9


class TAAPass:
    """Temporal AA resolve over the post-forward HDR target."""

    def __init__(self, engine: Any) -> None:
        """Remember the engine and reset every field to its idle value.

        No GPU object is created here; :meth:`setup` allocates them.
        """
        self._engine = engine

        # Lifecycle: ``_ready`` is set at the end of setup(); ``enabled`` is the
        # on/off toggle that render() checks before recording anything.
        self._ready = False
        self.enabled = False

        # History feedback weight. Driven from WorldEnvironment only via the
        # enable toggle; exposed here for tuning.
        self.blend_factor = _DEFAULT_BLEND
        # When set (e.g. paused scene) the resolve passes the current frame
        # through unchanged. Off by default.
        self.feedback_only = False

        self._width = self._height = 0
        self._colour_format = vk.VK_FORMAT_R16G16B16A16_SFLOAT

        # Per-frame matrices, replaced by the renderer before render(). Two
        # separate identity arrays, never a shared one.
        self._inv_vp = np.eye(4, dtype=np.float32)
        self._prev_vp = np.eye(4, dtype=np.float32)

        # Two persistent HDR targets with fixed roles:
        #   _output  : this frame's resolved result. The tonemap always samples
        #              this view, so its descriptor never changes per frame.
        #   _history : last frame's resolved result (binding 1). After each
        #              resolve _output is copied into _history for the next
        #              frame, which keeps the tonemap binding stable while
        #              still accumulating temporally.
        self._output: RenderTarget | None = None
        self._history: RenderTarget | None = None
        # False until the first resolve has populated history, so frame 0
        # blends toward the current frame instead of stale data.
        self._has_history = False

        # Current-frame HDR colour sampled at binding 0 (the fog output when fog
        # is active, else the raw HDR colour) and the depth view at binding 2.
        self._cur_colour_view: Any = None
        self._depth_view: Any = None

        # Per-object velocity input (binding 4). Until the renderer supplies the
        # VelocityPass RG16F view, binding 4 samples a 1x1 sentinel-filled dummy
        # so the shader takes the depth-based camera fallback. ``_has_velocity``
        # mirrors whether a real view is bound and feeds the UBO flag.
        self._velocity_view: Any = None
        self._has_velocity = False
        self._dummy_vel_image = self._dummy_vel_mem = self._dummy_vel_view = None

        # Samplers and the uniform buffer.
        self._sampler = self._depth_sampler = None
        self._ubo_buf = self._ubo_mem = None

        # Descriptor objects.
        self._desc_layout = self._desc_pool = self._desc_set = None

        # Pipeline objects and the shader modules they were built from.
        self._pipeline = self._pipeline_layout = None
        self._vert_module = self._frag_module = None
[docs] @property def output_view(self) -> Any: """Stable colour view of the resolved HDR result (tonemap input).""" return self._output.colour_view if self._output else None
[docs] def setup(self, width: int, height: int, cur_colour_view: Any, depth_view: Any, colour_format: int) -> None: """Allocate the two history targets, UBO, descriptors and pipeline. ``cur_colour_view`` is the post-forward HDR colour the resolve samples (binding 0); the renderer updates it per frame via :meth:`set_inputs`. """ e = self._engine device = e.ctx.device self._width, self._height = width, height self._cur_colour_view = cur_colour_view self._depth_view = depth_view self._colour_format = colour_format self._output = self._make_target(width, height, colour_format) self._history = self._make_target(width, height, colour_format) self._sampler = create_sampler(device) self._depth_sampler = create_sampler(device, filter_mode=vk.VK_FILTER_NEAREST) self._make_dummy_velocity(device) # Until the renderer binds a real velocity view, sample the sentinel dummy. self._velocity_view = self._dummy_vel_view self._has_velocity = False self._ubo_buf, self._ubo_mem = create_buffer( device, e.ctx.physical_device, _UBO_SIZE, vk.VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ) fs = vk.VK_SHADER_STAGE_FRAGMENT_BIT self._desc_layout = create_descriptor_set_layout(device, [ (0, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1), # current HDR (1, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1), # history (2, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1), # depth (3, vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, fs, 1), # uniforms (4, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1), # velocity ]) pool_sizes = [ vk.VkDescriptorPoolSize(type=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, descriptorCount=4), vk.VkDescriptorPoolSize(type=vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, descriptorCount=1), ] self._desc_pool = vk.vkCreateDescriptorPool(device, vk.VkDescriptorPoolCreateInfo( maxSets=1, poolSizeCount=len(pool_sizes), pPoolSizes=pool_sizes, ), None) self._desc_set = allocate_descriptor_set(device, 
self._desc_pool, self._desc_layout) self._write_descriptors() shader_dir = e.shader_dir self._vert_module = create_shader_module(device, compile_shader(shader_dir / "taa.vert")) self._frag_module = create_shader_module(device, compile_shader(shader_dir / "taa.frag")) self._create_pipeline(device, self._output.render_pass, (width, height)) self._ready = True self._has_history = False log.debug("TAA pass initialised (%dx%d)", width, height)
def _make_target(self, width: int, height: int, colour_format: int) -> RenderTarget: e = self._engine return RenderTarget( e.ctx.device, e.ctx.physical_device, width, height, colour_format=colour_format, use_depth=False, queue=e.ctx.graphics_queue, command_pool=e.ctx.command_pool, ) def _make_dummy_velocity(self, device: Any) -> None: """Create a 1x1 RG16F texture pre-filled with the velocity sentinel. Binding 4 must always reference a valid view (even before the velocity pass exists / on a frame with no moving meshes). Sampling this dummy yields the sentinel (< -1.5), so the shader falls back to the depth-based camera reprojection. Left in SHADER_READ_ONLY by ``upload_image_data``. """ e = self._engine pixels = np.array([[_VELOCITY_SENTINEL, _VELOCITY_SENTINEL]], dtype=np.float16).reshape(1, 1, 2) self._dummy_vel_image, self._dummy_vel_mem = upload_image_data( device, e.ctx.physical_device, e.ctx.graphics_queue, e.ctx.command_pool, np.ascontiguousarray(pixels), 1, 1, fmt=_VELOCITY_FORMAT, ) self._dummy_vel_view = vk.vkCreateImageView(device, vk.VkImageViewCreateInfo( image=self._dummy_vel_image, viewType=vk.VK_IMAGE_VIEW_TYPE_2D, format=_VELOCITY_FORMAT, subresourceRange=vk.VkImageSubresourceRange( aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, baseMipLevel=0, levelCount=1, baseArrayLayer=0, layerCount=1), ), None) def _write_descriptors(self) -> None: """Wire the resolve descriptor set. Binding 0 = current (post-forward+fog) HDR colour, 1 = history (last frame's resolved result), 2 = depth, 3 = uniforms, 4 = per-object velocity (the VelocityPass RG16F view, or the sentinel dummy). 
""" with DescriptorWriteBatch(self._engine.ctx.device) as batch: ds = self._desc_set batch.image(ds, 0, self._cur_colour_view, self._sampler) batch.image(ds, 1, self._history.colour_view, self._sampler) batch.image(ds, 2, self._depth_view, self._depth_sampler, image_layout=vk.VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL) batch.uniform_buffer(ds, 3, self._ubo_buf, _UBO_SIZE) batch.image(ds, 4, self._velocity_view or self._dummy_vel_view, self._sampler)
[docs] def set_inputs(self, cur_colour_view: Any, depth_view: Any) -> None: """Re-point binding 0 (current HDR) + 2 (depth) at new views. Called when the current-frame HDR source changes (fog toggled on/off, so the resolve samples the fog output instead of the raw HDR colour) or on resize. Cheap: a single batched descriptor update. """ if not self._ready: return if cur_colour_view is self._cur_colour_view and depth_view is self._depth_view: return self._cur_colour_view = cur_colour_view self._depth_view = depth_view self._write_descriptors()
[docs] def set_velocity_view(self, velocity_view: Any) -> None: """Point binding 4 at the per-object velocity buffer (or clear it). Called each TAA frame by the renderer with ``VelocityPass.velocity_view`` (the RG16F per-object motion target). Passing ``None`` reverts to the sentinel dummy (depth fallback everywhere). Only rewrites the descriptor when the view actually changes, so the steady state is a flag-only update. """ if not self._ready: return target = velocity_view or self._dummy_vel_view self._has_velocity = velocity_view is not None if target is self._velocity_view: return self._velocity_view = target self._write_descriptors()
[docs] def set_frame_matrices(self, inv_vp: np.ndarray, prev_vp: np.ndarray) -> None: """Stash the current inverse VP + previous VP for the next ``render``.""" self._inv_vp = np.asarray(inv_vp, dtype=np.float32) self._prev_vp = np.asarray(prev_vp, dtype=np.float32)
def _upload_uniforms(self) -> None: data = np.zeros(_UBO_SIZE // 4, dtype=np.float32) # mat4 inv_vp + mat4 prev_vp: row-major numpy -> column-major GLSL. data[0:16] = self._inv_vp.T.ravel() data[16:32] = self._prev_vp.T.ravel() # On the first frame there is no valid history yet: force feedback_only # so the resolve passes the current frame through (no stale-data smear). feedback = 1.0 if (self.feedback_only or not self._has_history) else 0.0 inv_w = 1.0 / float(self._width) if self._width else 0.0 inv_h = 1.0 / float(self._height) if self._height else 0.0 data[32:36] = [float(self.blend_factor), feedback, inv_w, inv_h] # params2.x = has_velocity: 1 when a real per-object velocity view is # bound, so the shader samples it (and falls back per-pixel on sentinels). data[36] = 1.0 if self._has_velocity else 0.0 upload_numpy(self._engine.ctx.device, self._ubo_mem, data) # -- record --------------------------------------------------------------
[docs] def render(self, cmd: Any) -> None: """Resolve TAA into the fixed output target, then blit it to history. No-op when off. The output target's view is stable (tonemap always reads it), so this pass owns no per-frame descriptor rewrite. After resolve the output is copied into the history target so next frame can reproject it. """ if not self._ready or not self.enabled or not self._pipeline: return self._upload_uniforms() rt = self._output rp_begin = vk.VkRenderPassBeginInfo( renderPass=rt.render_pass, framebuffer=rt.framebuffer, renderArea=vk.VkRect2D(offset=vk.VkOffset2D(x=0, y=0), extent=vk.VkExtent2D(width=rt.width, height=rt.height)), clearValueCount=1, pClearValues=[vk.VkClearValue(color=vk.VkClearColorValue(float32=[0.0, 0.0, 0.0, 1.0]))], ) vk.vkCmdBeginRenderPass(cmd, rp_begin, vk.VK_SUBPASS_CONTENTS_INLINE) vk.vkCmdSetViewport(cmd, 0, 1, [vk.VkViewport( x=0.0, y=0.0, width=float(rt.width), height=float(rt.height), minDepth=0.0, maxDepth=1.0)]) vk.vkCmdSetScissor(cmd, 0, 1, [vk.VkRect2D( offset=vk.VkOffset2D(x=0, y=0), extent=vk.VkExtent2D(width=rt.width, height=rt.height))]) vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline) vk.vkCmdBindDescriptorSets(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline_layout, 0, 1, [self._desc_set], 0, None) vk.vkCmdDraw(cmd, 3, 1, 0, 0) vk.vkCmdEndRenderPass(cmd) # Copy the resolved output into the history target so next frame's # reprojection samples it. Both images leave the render pass / start in # SHADER_READ_ONLY_OPTIMAL; transition to transfer layouts, copy, restore. self._blit_output_to_history(cmd) # History is valid from the next frame onward. self._has_history = True
def _blit_output_to_history(self, cmd: Any) -> None: """Copy the resolved output image into the history image (same size/fmt).""" src = self._output.colour_image dst = self._history.colour_image w, h = self._width, self._height def barrier(image, old, new, src_access, dst_access): return vk.VkImageMemoryBarrier( srcAccessMask=src_access, dstAccessMask=dst_access, oldLayout=old, newLayout=new, srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED, dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED, image=image, subresourceRange=vk.VkImageSubresourceRange( aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, baseMipLevel=0, levelCount=1, baseArrayLayer=0, layerCount=1), ) # SHADER_READ_ONLY -> transfer layouts. vk.vkCmdPipelineBarrier( cmd, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, None, 0, None, 2, [ barrier(src, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_READ_BIT), barrier(dst, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_WRITE_BIT), ]) region = vk.VkImageCopy( srcSubresource=vk.VkImageSubresourceLayers( aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0, baseArrayLayer=0, layerCount=1), srcOffset=vk.VkOffset3D(x=0, y=0, z=0), dstSubresource=vk.VkImageSubresourceLayers( aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0, baseArrayLayer=0, layerCount=1), dstOffset=vk.VkOffset3D(x=0, y=0, z=0), extent=vk.VkExtent3D(width=w, height=h, depth=1), ) vk.vkCmdCopyImage( cmd, src, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, [region]) # Restore both to SHADER_READ_ONLY for sampling (tonemap reads output, # next-frame resolve reads history). 
vk.vkCmdPipelineBarrier( cmd, vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, None, 0, None, 2, [ barrier(src, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_ACCESS_TRANSFER_READ_BIT, vk.VK_ACCESS_SHADER_READ_BIT), barrier(dst, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, vk.VK_ACCESS_TRANSFER_WRITE_BIT, vk.VK_ACCESS_SHADER_READ_BIT), ]) # -- pipeline ------------------------------------------------------------ def _create_pipeline(self, device: Any, render_pass: Any, extent: tuple[int, int]) -> None: """Create the fullscreen-triangle resolve pipeline. Declares its fixed-function state via :class:`PipelineSpec` and defers all cffi sub-struct plumbing (and lifetime management) to :func:`build_pipeline`. Geometry is shader-generated (no vertex input), depth test/write are off, opaque colour write, no push constants. The shaders are compiled at runtime, so the pre-created modules are passed directly rather than via SPIR-V paths in the spec. """ spec = PipelineSpec( name="TAA", topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, vertex_stride=0, # fullscreen triangle generated in the shader cull_mode=vk.VK_CULL_MODE_NONE, depth_test=False, depth_write=False, blend="opaque", set_layouts=(self._desc_layout,), ) self._pipeline, self._pipeline_layout = build_pipeline( device, spec, render_pass, extent, vert_module=self._vert_module, frag_module=self._frag_module, ) # -- resize / cleanup ----------------------------------------------------
[docs] def resize(self, width: int, height: int, cur_colour_view: Any, depth_view: Any, colour_format: int) -> None: if not self._ready: return self._width, self._height = width, height self._colour_format = colour_format self._cur_colour_view = cur_colour_view self._depth_view = depth_view if self._output: self._output.destroy() if self._history: self._history.destroy() self._output = self._make_target(width, height, colour_format) self._history = self._make_target(width, height, colour_format) # History is gone after a resize: start fresh so we don't reproject from # a stale-resolution buffer. self._has_history = False # The velocity pass resizes too (its view changes): revert to the dummy # until the renderer re-binds the new velocity view next TAA frame. self._velocity_view = self._dummy_vel_view self._has_velocity = False self._write_descriptors()
[docs] def cleanup(self) -> None: if not self._ready: return device = self._engine.ctx.device if self._pipeline: vk.vkDestroyPipeline(device, self._pipeline, None) if self._pipeline_layout: vk.vkDestroyPipelineLayout(device, self._pipeline_layout, None) if self._vert_module: vk.vkDestroyShaderModule(device, self._vert_module, None) if self._frag_module: vk.vkDestroyShaderModule(device, self._frag_module, None) if self._desc_pool: vk.vkDestroyDescriptorPool(device, self._desc_pool, None) if self._desc_layout: vk.vkDestroyDescriptorSetLayout(device, self._desc_layout, None) if self._sampler: vk.vkDestroySampler(device, self._sampler, None) if self._depth_sampler: vk.vkDestroySampler(device, self._depth_sampler, None) if self._dummy_vel_view: vk.vkDestroyImageView(device, self._dummy_vel_view, None) if self._dummy_vel_image: vk.vkDestroyImage(device, self._dummy_vel_image, None) if self._dummy_vel_mem: vk.vkFreeMemory(device, self._dummy_vel_mem, None) if self._ubo_buf: vk.vkDestroyBuffer(device, self._ubo_buf, None) if self._ubo_mem: vk.vkFreeMemory(device, self._ubo_mem, None) if self._output: self._output.destroy() if self._history: self._history.destroy() self._output = self._history = None self._ready = False