"""Temporal anti-aliasing resolve pass (desktop Vulkan).
Full-screen HDR pass that runs AFTER the forward pass (and volumetric fog) and
BEFORE tonemap. Each frame it:
* samples the current (jittered) HDR colour + depth,
* reconstructs the per-pixel CAMERA motion vector from depth + the previous
view-projection (depth -> world -> prev-clip -> prev_uv), covering camera +
static-geometry motion,
* reprojects the previous resolved frame (history) through that motion vector,
* applies a 3x3 YCoCg neighborhood AABB clamp to the reprojected history
(Karis/Lottes), falling back to the current colour where the reprojection
lands offscreen (disocclusion),
* blends current vs clamped-history (~0.9 history weight),
* writes the resolved HDR into the fixed ``_output`` target, then copies it into
the ``_history`` target so the next frame can reproject it.
The renderer re-points the tonemap HDR input at this pass's resolved output (the
same descriptor-swap mechanism volumetric fog uses) ONCE, when TAA is toggled.
Two persistent R16G16B16A16_SFLOAT targets: a fixed ``_output`` target the
tonemap always samples (so its descriptor never flips), and a ``_history``
target sampled as last frame's resolved result. After each resolve the output is
blitted into the history target for next frame -- this keeps the tonemap binding
stable (no per-frame descriptor rewrite / device-wait stall) while still
accumulating temporally.
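Matches the web ``taa.wgsl`` resolve math (YCoCg clamp + blend) so both backends
behave alike. By default the velocity here is reconstructed from depth rather
than sampled from a precomputed velocity texture; when the renderer binds a
per-object velocity view (``set_velocity_view``) the shader is told to sample it
and falls back to the depth reconstruction per pixel on sentinel texels.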
SCOPE: camera + static-geometry motion only. Skinned meshes / GPU particles
have no prev-frame joint/particle state at this boundary, so they reproject as
static (mild self-motion ghosting; camera motion still resolves). Per-object
velocity (MRT + prev-transform SSBO + prev-joint buffer) is a flagged follow-up.
"""
import logging
from typing import Any
import numpy as np
import vulkan as vk
from ..gpu.descriptors import (
DescriptorWriteBatch,
allocate_descriptor_set,
create_descriptor_set_layout,
)
from ..gpu.memory import create_buffer, create_sampler, upload_image_data, upload_numpy
from ..gpu.pipeline import PipelineSpec, build_pipeline, create_shader_module
from ..materials.shader_compiler import compile_shader
from .render_target import RenderTarget
__all__ = ["TAAPass"]
log = logging.getLogger(__name__)
# Uniform layout (see taa.frag ``Uniforms``); sizes are in bytes:
# mat4 inv_vp(64) + mat4 prev_vp(64) + vec4 params(16) + vec4 params2(16) = 160.
# params = (blend_factor, feedback_only, viewport_inv.x, viewport_inv.y)
# params2 = (has_velocity, reserved, reserved, reserved)
# ``TAAPass._upload_uniforms`` packs this as 160 // 4 = 40 float32 values in the
# same order, so any change here must be mirrored there and in the shader.
_UBO_SIZE = 160
# RG16F per-object velocity buffer format (must match VelocityPass). The sentinel
# clear value (< -1.5) marks "no per-object velocity here -> depth fallback".
# The same format/sentinel pair is used to build the 1x1 dummy velocity texture
# that binding 4 samples while no real velocity view is bound.
_VELOCITY_FORMAT = vk.VK_FORMAT_R16G16_SFLOAT
_VELOCITY_SENTINEL = -2.0
# History blend weight: fraction taken from clamped history each frame. 0.9 is
# the classic TAA accumulation weight (matches the web default). This is only the
# initial value of ``TAAPass.blend_factor``; the attribute can be tuned at runtime.
_DEFAULT_BLEND = 0.9
class TAAPass:
    """Temporal AA resolve over the post-forward HDR target.

    Owns two persistent HDR targets (``_output`` / ``_history``), a uniform
    buffer, one descriptor set and the fullscreen resolve pipeline.

    Typical use: :meth:`setup` once; per frame :meth:`set_frame_matrices` (and
    optionally :meth:`set_velocity_view`) followed by :meth:`render`;
    :meth:`resize` when the target size or format changes; :meth:`cleanup` at
    teardown.

    Attributes:
        enabled: When False, :meth:`render` records nothing.
        blend_factor: History weight uploaded as ``params.x``.
        feedback_only: When True the shader is asked (``params.y``) to pass the
            current frame through unchanged.
    """

    # Raw Vulkan handles owned by the pass, in teardown order, each paired with
    # the name of the ``vk`` function that releases it. Names (not function
    # objects) are stored so the table can be built without touching ``vk``.
    _OWNED_HANDLES = (
        ("_pipeline", "vkDestroyPipeline"),
        ("_pipeline_layout", "vkDestroyPipelineLayout"),
        ("_vert_module", "vkDestroyShaderModule"),
        ("_frag_module", "vkDestroyShaderModule"),
        ("_desc_pool", "vkDestroyDescriptorPool"),
        ("_desc_layout", "vkDestroyDescriptorSetLayout"),
        ("_sampler", "vkDestroySampler"),
        ("_depth_sampler", "vkDestroySampler"),
        ("_dummy_vel_view", "vkDestroyImageView"),
        ("_dummy_vel_image", "vkDestroyImage"),
        ("_dummy_vel_mem", "vkFreeMemory"),
        ("_ubo_buf", "vkDestroyBuffer"),
        ("_ubo_mem", "vkFreeMemory"),
    )

    def __init__(self, engine: Any) -> None:
        """Record the owning engine and initialise every handle to "empty".

        No GPU resources are created here; that happens in :meth:`setup`.
        """
        self._engine = engine
        self._ready = False
        self.enabled = False
        # History blend weight (history feedback). Driven from WorldEnvironment
        # only via the enable toggle; exposed here for tuning.
        self.blend_factor = _DEFAULT_BLEND
        # When set (e.g. paused scene) the resolve passes the current frame
        # through unchanged. Off by default.
        self.feedback_only = False
        self._width = 0
        self._height = 0
        self._colour_format = vk.VK_FORMAT_R16G16B16A16_SFLOAT
        # Per-frame matrices (set by the renderer before render()).
        self._inv_vp = np.eye(4, dtype=np.float32)
        self._prev_vp = np.eye(4, dtype=np.float32)
        # Two persistent HDR targets with fixed roles:
        #   _output  : this frame's resolved result. The tonemap ALWAYS samples
        #              this view, so its descriptor never changes per frame.
        #   _history : last frame's resolved result (sampled as binding 1). After
        #              each resolve, _output is copied into _history for the next
        #              frame -- keeping the tonemap binding stable (no per-frame
        #              descriptor rewrite + device-wait stall) while accumulating.
        # ``_has_history`` is False until the first resolve has populated history,
        # so frame 0 blends toward current (no stale data).
        self._output: RenderTarget | None = None
        self._history: RenderTarget | None = None
        self._has_history = False
        # The current-frame HDR colour the resolve samples (binding 0). Within
        # this class it is only assigned by setup() and resize(); per the
        # renderer contract it is the fog output when fog is active, else the
        # raw HDR target colour.
        self._cur_colour_view: Any = None
        self._depth_view: Any = None
        # Per-object velocity buffer (binding 4). The renderer points this at the
        # VelocityPass's RG16F view each TAA frame once that pass exists; until
        # then binding 4 samples a 1x1 sentinel-filled dummy so the shader sees
        # "no per-object velocity" and uses the depth-based camera fallback. The
        # ``has_velocity`` UBO flag follows whether a real view is bound.
        self._velocity_view: Any = None
        self._has_velocity = False
        self._dummy_vel_image: Any = None
        self._dummy_vel_mem: Any = None
        self._dummy_vel_view: Any = None
        self._sampler: Any = None
        self._depth_sampler: Any = None
        self._ubo_buf: Any = None
        self._ubo_mem: Any = None
        self._desc_layout: Any = None
        self._desc_pool: Any = None
        self._desc_set: Any = None
        self._pipeline: Any = None
        self._pipeline_layout: Any = None
        self._vert_module: Any = None
        self._frag_module: Any = None

    @property
    def output_view(self) -> Any:
        """Stable colour view of the resolved HDR result (tonemap input).

        Returns ``None`` while no output target exists (before :meth:`setup` or
        after :meth:`cleanup`).
        """
        return self._output.colour_view if self._output else None

    def setup(self, width: int, height: int, cur_colour_view: Any, depth_view: Any,
              colour_format: int) -> None:
        """Allocate the two history targets, UBO, descriptors and pipeline.

        Args:
            width: Target width in pixels.
            height: Target height in pixels.
            cur_colour_view: Post-forward HDR colour view the resolve samples
                (binding 0). It is re-pointed only by :meth:`resize`.
            depth_view: Depth view sampled as binding 2.
            colour_format: Vulkan format used for both HDR targets.

        ``_ready`` is only set once every step succeeded. If a step raises, the
        resources created so far stay on the instance and can be released with
        :meth:`cleanup`.
        """
        e = self._engine
        device = e.ctx.device
        self._width, self._height = width, height
        self._cur_colour_view = cur_colour_view
        self._depth_view = depth_view
        self._colour_format = colour_format
        self._output = self._make_target(width, height, colour_format)
        self._history = self._make_target(width, height, colour_format)
        self._sampler = create_sampler(device)
        self._depth_sampler = create_sampler(device, filter_mode=vk.VK_FILTER_NEAREST)
        self._make_dummy_velocity(device)
        # Until the renderer binds a real velocity view, sample the sentinel dummy.
        self._velocity_view = self._dummy_vel_view
        self._has_velocity = False
        self._ubo_buf, self._ubo_mem = create_buffer(
            device, e.ctx.physical_device, _UBO_SIZE,
            vk.VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
            vk.VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | vk.VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
        )
        fs = vk.VK_SHADER_STAGE_FRAGMENT_BIT
        self._desc_layout = create_descriptor_set_layout(device, [
            (0, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1),  # current HDR
            (1, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1),  # history
            (2, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1),  # depth
            (3, vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, fs, 1),          # uniforms
            (4, vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, fs, 1),  # velocity
        ])
        # One set only: four combined image samplers (bindings 0, 1, 2, 4) plus
        # one uniform buffer (binding 3).
        pool_sizes = [
            vk.VkDescriptorPoolSize(type=vk.VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, descriptorCount=4),
            vk.VkDescriptorPoolSize(type=vk.VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, descriptorCount=1),
        ]
        self._desc_pool = vk.vkCreateDescriptorPool(device, vk.VkDescriptorPoolCreateInfo(
            maxSets=1, poolSizeCount=len(pool_sizes), pPoolSizes=pool_sizes,
        ), None)
        self._desc_set = allocate_descriptor_set(device, self._desc_pool, self._desc_layout)
        self._write_descriptors()
        shader_dir = e.shader_dir
        self._vert_module = create_shader_module(device, compile_shader(shader_dir / "taa.vert"))
        self._frag_module = create_shader_module(device, compile_shader(shader_dir / "taa.frag"))
        self._create_pipeline(device, self._output.render_pass, (width, height))
        self._ready = True
        self._has_history = False
        log.debug("TAA pass initialised (%dx%d)", width, height)

    def _make_target(self, width: int, height: int, colour_format: int) -> "RenderTarget":
        """Create one colour-only HDR render target of the given size/format."""
        e = self._engine
        return RenderTarget(
            e.ctx.device, e.ctx.physical_device, width, height,
            colour_format=colour_format, use_depth=False,
            queue=e.ctx.graphics_queue, command_pool=e.ctx.command_pool,
        )

    def _make_dummy_velocity(self, device: Any) -> None:
        """Create a 1x1 RG16F texture pre-filled with the velocity sentinel.

        Binding 4 must always reference a valid view (even before the velocity
        pass exists / on a frame with no moving meshes). Sampling this dummy
        yields the sentinel (< -1.5), so the shader falls back to the depth-based
        camera reprojection. Left in SHADER_READ_ONLY by ``upload_image_data``.
        """
        e = self._engine
        pixels = np.array([[_VELOCITY_SENTINEL, _VELOCITY_SENTINEL]], dtype=np.float16).reshape(1, 1, 2)
        self._dummy_vel_image, self._dummy_vel_mem = upload_image_data(
            device, e.ctx.physical_device, e.ctx.graphics_queue, e.ctx.command_pool,
            np.ascontiguousarray(pixels), 1, 1, fmt=_VELOCITY_FORMAT,
        )
        self._dummy_vel_view = vk.vkCreateImageView(device, vk.VkImageViewCreateInfo(
            image=self._dummy_vel_image, viewType=vk.VK_IMAGE_VIEW_TYPE_2D, format=_VELOCITY_FORMAT,
            subresourceRange=vk.VkImageSubresourceRange(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, baseMipLevel=0, levelCount=1,
                baseArrayLayer=0, layerCount=1),
        ), None)

    def _write_descriptors(self) -> None:
        """Wire the resolve descriptor set.

        Binding 0 = current (post-forward+fog) HDR colour, 1 = history (last
        frame's resolved result), 2 = depth, 3 = uniforms, 4 = per-object
        velocity (the VelocityPass RG16F view, or the sentinel dummy).
        """
        with DescriptorWriteBatch(self._engine.ctx.device) as batch:
            ds = self._desc_set
            batch.image(ds, 0, self._cur_colour_view, self._sampler)
            batch.image(ds, 1, self._history.colour_view, self._sampler)
            batch.image(ds, 2, self._depth_view, self._depth_sampler,
                        image_layout=vk.VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL)
            batch.uniform_buffer(ds, 3, self._ubo_buf, _UBO_SIZE)
            batch.image(ds, 4, self._velocity_view or self._dummy_vel_view, self._sampler)

    def set_velocity_view(self, velocity_view: Any) -> None:
        """Point binding 4 at the per-object velocity buffer (or clear it).

        Called each TAA frame by the renderer with ``VelocityPass.velocity_view``
        (the RG16F per-object motion target). Passing ``None`` (or any falsy
        handle) reverts to the sentinel dummy (depth fallback everywhere). Only
        rewrites the descriptor when the view actually changes, so the steady
        state is a flag-only update. No-op before :meth:`setup`.
        """
        if not self._ready:
            return
        target = velocity_view or self._dummy_vel_view
        # The flag follows what is actually bound: a falsy non-None handle
        # selects the dummy above, so it must not report "has velocity".
        self._has_velocity = target is not self._dummy_vel_view
        if target is self._velocity_view:
            return
        self._velocity_view = target
        self._write_descriptors()

    def set_frame_matrices(self, inv_vp: np.ndarray, prev_vp: np.ndarray) -> None:
        """Stash the current inverse VP + previous VP for the next ``render``.

        Both inputs are copied and normalised to row-major ``(4, 4)`` float32, so
        later in-place edits of the caller's arrays cannot change what
        ``render`` uploads, and a flat 16-element sequence is accepted.

        Raises:
            ValueError: If either input does not hold exactly 16 values. In that
                case the previously stashed matrices are left untouched.
        """
        # Convert both before assigning so a bad second argument cannot leave
        # the pair half-updated.
        inv_copy = np.array(inv_vp, dtype=np.float32).reshape(4, 4)
        prev_copy = np.array(prev_vp, dtype=np.float32).reshape(4, 4)
        self._inv_vp = inv_copy
        self._prev_vp = prev_copy

    def _upload_uniforms(self) -> None:
        """Pack the uniform block (see ``_UBO_SIZE``) and upload it to the UBO."""
        data = np.zeros(_UBO_SIZE // 4, dtype=np.float32)
        # mat4 inv_vp + mat4 prev_vp: row-major numpy -> column-major GLSL.
        data[0:16] = self._inv_vp.T.ravel()
        data[16:32] = self._prev_vp.T.ravel()
        # On the first frame there is no valid history yet: force feedback_only
        # so the resolve passes the current frame through (no stale-data smear).
        feedback = 1.0 if (self.feedback_only or not self._has_history) else 0.0
        # Guard against a zero-sized target (before setup) to avoid dividing by 0.
        inv_w = 1.0 / float(self._width) if self._width else 0.0
        inv_h = 1.0 / float(self._height) if self._height else 0.0
        data[32:36] = [float(self.blend_factor), feedback, inv_w, inv_h]
        # params2.x = has_velocity: 1 when a real per-object velocity view is
        # bound, so the shader samples it (and falls back per-pixel on sentinels).
        data[36] = 1.0 if self._has_velocity else 0.0
        upload_numpy(self._engine.ctx.device, self._ubo_mem, data)

    # -- record --------------------------------------------------------------

    def render(self, cmd: Any) -> None:
        """Resolve TAA into the fixed output target, then copy it to history.

        Records nothing when the pass is not set up or is disabled. While
        disabled, the history is additionally marked invalid so the first frame
        after re-enabling passes the current frame through instead of
        reprojecting an arbitrarily old image.

        The output target's view is stable (tonemap always reads it), so this
        pass owns no per-frame descriptor rewrite. After resolve the output is
        copied into the history target so next frame can reproject it.

        Args:
            cmd: Command buffer to record into (must be in the recording state
                and outside a render pass).
        """
        if not self._ready or not self._pipeline:
            return
        if not self.enabled:
            # Whatever is in _history stops matching the scene while we are
            # off; treat re-enable like frame 0.
            self._has_history = False
            return
        self._upload_uniforms()
        rt = self._output
        rp_begin = vk.VkRenderPassBeginInfo(
            renderPass=rt.render_pass, framebuffer=rt.framebuffer,
            renderArea=vk.VkRect2D(offset=vk.VkOffset2D(x=0, y=0),
                                   extent=vk.VkExtent2D(width=rt.width, height=rt.height)),
            clearValueCount=1,
            pClearValues=[vk.VkClearValue(color=vk.VkClearColorValue(float32=[0.0, 0.0, 0.0, 1.0]))],
        )
        vk.vkCmdBeginRenderPass(cmd, rp_begin, vk.VK_SUBPASS_CONTENTS_INLINE)
        vk.vkCmdSetViewport(cmd, 0, 1, [vk.VkViewport(
            x=0.0, y=0.0, width=float(rt.width), height=float(rt.height), minDepth=0.0, maxDepth=1.0)])
        vk.vkCmdSetScissor(cmd, 0, 1, [vk.VkRect2D(
            offset=vk.VkOffset2D(x=0, y=0), extent=vk.VkExtent2D(width=rt.width, height=rt.height))])
        vk.vkCmdBindPipeline(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS, self._pipeline)
        vk.vkCmdBindDescriptorSets(cmd, vk.VK_PIPELINE_BIND_POINT_GRAPHICS,
                                   self._pipeline_layout, 0, 1, [self._desc_set], 0, None)
        # Fullscreen triangle: three shader-generated vertices, no vertex buffer.
        vk.vkCmdDraw(cmd, 3, 1, 0, 0)
        vk.vkCmdEndRenderPass(cmd)
        # Copy the resolved output into the history target so next frame's
        # reprojection samples it. Both images leave the render pass / start in
        # SHADER_READ_ONLY_OPTIMAL; transition to transfer layouts, copy, restore.
        self._blit_output_to_history(cmd)
        # History is valid from the next frame onward.
        self._has_history = True

    def _blit_output_to_history(self, cmd: Any) -> None:
        """Copy the resolved output image into the history image (same size/fmt).

        Records: barrier to transfer layouts -> ``vkCmdCopyImage`` -> barrier
        back to SHADER_READ_ONLY_OPTIMAL for both images.
        NOTE(review): the first barrier's source scope is fragment-shader reads
        only; it relies on the render target's render pass making the
        colour-attachment write to ``_output`` available -- confirm against
        ``RenderTarget``.
        """
        src = self._output.colour_image
        dst = self._history.colour_image
        w, h = self._width, self._height

        def barrier(image, old, new, src_access, dst_access):
            # Single-mip, single-layer colour image layout transition.
            return vk.VkImageMemoryBarrier(
                srcAccessMask=src_access, dstAccessMask=dst_access,
                oldLayout=old, newLayout=new,
                srcQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
                dstQueueFamilyIndex=vk.VK_QUEUE_FAMILY_IGNORED,
                image=image,
                subresourceRange=vk.VkImageSubresourceRange(
                    aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT,
                    baseMipLevel=0, levelCount=1, baseArrayLayer=0, layerCount=1),
            )

        # SHADER_READ_ONLY -> transfer layouts.
        vk.vkCmdPipelineBarrier(
            cmd, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, vk.VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
            0, None, 0, None, 2, [
                barrier(src, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                        vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_READ_BIT),
                barrier(dst, vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                        vk.VK_ACCESS_SHADER_READ_BIT, vk.VK_ACCESS_TRANSFER_WRITE_BIT),
            ])
        region = vk.VkImageCopy(
            srcSubresource=vk.VkImageSubresourceLayers(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0,
                baseArrayLayer=0, layerCount=1),
            srcOffset=vk.VkOffset3D(x=0, y=0, z=0),
            dstSubresource=vk.VkImageSubresourceLayers(
                aspectMask=vk.VK_IMAGE_ASPECT_COLOR_BIT, mipLevel=0,
                baseArrayLayer=0, layerCount=1),
            dstOffset=vk.VkOffset3D(x=0, y=0, z=0),
            extent=vk.VkExtent3D(width=w, height=h, depth=1),
        )
        vk.vkCmdCopyImage(
            cmd, src, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
            dst, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, [region])
        # Restore both to SHADER_READ_ONLY for sampling (tonemap reads output,
        # next-frame resolve reads history).
        vk.vkCmdPipelineBarrier(
            cmd, vk.VK_PIPELINE_STAGE_TRANSFER_BIT, vk.VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0,
            0, None, 0, None, 2, [
                barrier(src, vk.VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        vk.VK_ACCESS_TRANSFER_READ_BIT, vk.VK_ACCESS_SHADER_READ_BIT),
                barrier(dst, vk.VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                        vk.VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        vk.VK_ACCESS_TRANSFER_WRITE_BIT, vk.VK_ACCESS_SHADER_READ_BIT),
            ])

    # -- pipeline ------------------------------------------------------------

    def _create_pipeline(self, device: Any, render_pass: Any, extent: tuple[int, int]) -> None:
        """Create the fullscreen-triangle resolve pipeline.

        Declares its fixed-function state via :class:`PipelineSpec` and defers
        all cffi sub-struct plumbing (and lifetime management) to
        :func:`build_pipeline`. Geometry is shader-generated (no vertex input),
        depth test/write are off, opaque colour write, no push constants. The
        shaders are compiled at runtime, so the pre-created modules are passed
        directly rather than via SPIR-V paths in the spec.
        """
        spec = PipelineSpec(
            name="TAA",
            topology=vk.VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
            vertex_stride=0,  # fullscreen triangle generated in the shader
            cull_mode=vk.VK_CULL_MODE_NONE,
            depth_test=False,
            depth_write=False,
            blend="opaque",
            set_layouts=(self._desc_layout,),
        )
        self._pipeline, self._pipeline_layout = build_pipeline(
            device, spec, render_pass, extent,
            vert_module=self._vert_module, frag_module=self._frag_module,
        )

    # -- resize / cleanup ----------------------------------------------------

    def _release(self, device: Any, attr: str, destroy_name: str) -> None:
        """Destroy the handle stored in ``attr`` (if any) and reset it to None.

        ``destroy_name`` is the name of the ``vk`` destroy/free function taking
        ``(device, handle, allocator)``.
        """
        handle = getattr(self, attr, None)
        if handle:
            getattr(vk, destroy_name)(device, handle, None)
        setattr(self, attr, None)

    def resize(self, width: int, height: int, cur_colour_view: Any, depth_view: Any,
               colour_format: int) -> None:
        """Recreate both HDR targets for a new size/format and rewire bindings.

        No-op before :meth:`setup`. History is invalidated and binding 4 falls
        back to the dummy velocity texture until the renderer re-binds a view.
        When ``colour_format`` differs from the previous one the pipeline is
        rebuilt too, because the new targets' render pass is no longer
        compatible with the one the pipeline was created against.
        """
        if not self._ready:
            return
        format_changed = colour_format != self._colour_format
        self._width, self._height = width, height
        self._colour_format = colour_format
        self._cur_colour_view = cur_colour_view
        self._depth_view = depth_view
        if self._output:
            self._output.destroy()
        if self._history:
            self._history.destroy()
        self._output = self._make_target(width, height, colour_format)
        self._history = self._make_target(width, height, colour_format)
        if format_changed:
            # A pipeline may only be used with a render pass compatible with
            # the one given at creation; attachment formats are part of that.
            device = self._engine.ctx.device
            self._release(device, "_pipeline", "vkDestroyPipeline")
            self._release(device, "_pipeline_layout", "vkDestroyPipelineLayout")
            self._create_pipeline(device, self._output.render_pass, (width, height))
        # History is gone after a resize: start fresh so we don't reproject from
        # a stale-resolution buffer.
        self._has_history = False
        # The velocity pass resizes too (its view changes): revert to the dummy
        # until the renderer re-binds the new velocity view next TAA frame.
        self._velocity_view = self._dummy_vel_view
        self._has_velocity = False
        self._write_descriptors()

    def cleanup(self) -> None:
        """Release every GPU resource the pass owns and return to "not set up".

        Safe to call repeatedly, and also after a :meth:`setup` that failed
        part-way: whatever was created is destroyed and each handle is reset to
        ``None``. When nothing is owned this returns without touching the
        engine. The externally owned velocity view is only dereferenced, never
        destroyed.
        """
        owns_handles = any(getattr(self, attr, None) for attr, _ in self._OWNED_HANDLES)
        if not owns_handles and not self._output and not self._history:
            self._ready = False
            return
        device = self._engine.ctx.device
        for attr, destroy_name in self._OWNED_HANDLES:
            self._release(device, attr, destroy_name)
        if self._output:
            self._output.destroy()
        if self._history:
            self._history.destroy()
        self._output = self._history = None
        # The descriptor set was freed together with its pool.
        self._desc_set = None
        self._velocity_view = None
        self._has_velocity = False
        self._has_history = False
        self._ready = False