Files
llimphi/llimphi-raster/examples/spike_gpu_directo.rs
T
Sergio ccab39f140 refresh: stack al día (vello 0.7 / wgpu 27 / parley 0.6) + motor 3D voxel
Re-sincroniza las fuentes desde el monorepo (estaba en vello 0.5/wgpu 24 y con la
estructura vieja de eventloop) y suma el 3D:

- bump del workspace a vello 0.7 / wgpu 27 / parley 0.6, + accesskit 0.24 /
  accesskit_winit 0.33 / vello_hybrid 0.0.9.
- nuevos crates: llimphi-3d (voxels ray-march + mallas en un depth compartido,
  montable dentro de un View 2D vía set_viewport+scissor) y llimphi-voxel
  (world-gen, personajes, director de escenas) + shared/foreign-vox (puente .vox).
- README: sección "Not just 2D — a 3D voxel engine" + GIF (docs/llimphi_voxel.gif).
- excluido modules/allichay (arrastra deps fuera del alcance del front-door).
- cargo check --workspace: verde.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-18 14:40:00 +00:00

392 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Spike Fase 0 — GPU directo vs vello.
//!
//! Compara el tiempo total CPU+GPU por frame para pintar N puntos en una
//! textura `Rgba8Unorm` 1024×1024 con dos estrategias:
//!
//! - **Vello**: una llamada `Scene::fill(Rect 1×1)` por punto, luego
//! `vello::Renderer::render_to_texture`.
//! - **GPU directo**: un pipeline `wgpu` con instanced quad. Cada punto es
//! una instancia `[x:f32, y:f32, rgba:u32]`. Una sola draw call.
//!
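//! Sizes: vello is measured at 25K, 50K, 100K and 200K points (it crashes
//! beyond ~200K paths); the direct pipeline at 100K, 500K, 1M and 5M points.
//! 5 warmup frames + 15 measured frames per size. Reports the median per size
//! and the speed-up factor at 100K, the only size both backends share.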
//!
//! Criterio de aceptación del SDD (`llimphi/SDD.md` §"GPU directo wgpu"):
//! factor ≥ 5× a 500K → seguir con Fase 1. Si no, abortar.
//!
//! Corre con: `cargo run -p llimphi-raster --example spike_gpu_directo --release`.
use std::io::Write;
use std::time::Instant;
use llimphi_hal::{wgpu, Hal};
use llimphi_raster::{
kurbo::{Affine, Rect},
peniko::{color::palette, Color, Fill},
vello,
};
/// Width in pixels of the render target. The WGSL shader hard-codes the same
/// value as `1024.0`, so the two must be kept in sync by hand.
const W: u32 = 1024;
/// Height in pixels of the render target (also mirrored in the WGSL shader).
const H: u32 = 1024;
/// Pixel format of the render target and of the direct pipeline's colour target.
const TARGET_FORMAT: wgpu::TextureFormat = wgpu::TextureFormat::Rgba8Unorm;
/// Frames rendered (and timed) at the start of each run whose samples are discarded.
const WARMUP_FRAMES: usize = 5;
/// Frames per run whose timings are kept and fed into the median.
const MEASURED_FRAMES: usize = 15;
// Vello crashes (SIGSEGV in `vello_encoding::path::flatten`) once the scene
// exceeds ~200K paths with the `Limits::default()` requested by the HAL.
// That is exactly the ceiling described in the SDD §"GPU directo wgpu". We
// measure up to the point vello can handle; the direct side is measured at
// much larger sizes to confirm the post-ceiling regime.
/// Point counts benchmarked on the vello backend.
const VELLO_SIZES: &[usize] = &[25_000, 50_000, 100_000, 200_000];
/// Point counts benchmarked on the direct wgpu pipeline.
const DIRECTO_SIZES: &[usize] = &[100_000, 500_000, 1_000_000, 5_000_000];
/// Runs the spike: benchmarks the vello backend and the direct wgpu pipeline
/// over their respective size tables, prints one result table per backend and
/// finishes with the Phase 0 verdict computed at the size both backends share.
fn main() {
    // Point count measured by both backends; the verdict compares them there.
    const VERDICT_SIZE: usize = 100_000;

    let hal = pollster::block_on(Hal::new(None)).expect("hal");

    // Destination texture shared by both backends: STORAGE_BINDING for vello
    // (compute), RENDER_ATTACHMENT for the direct pipeline. Per the original
    // author's note it matches the `intermediate` texture of `WinitSurface`.
    let (target, target_view) = create_target(&hal.device);

    let renderer_options = vello::RendererOptions {
        use_cpu: false,
        antialiasing_support: vello::AaSupport {
            area: true,
            msaa8: false,
            msaa16: false,
        },
        num_init_threads: None,
        pipeline_cache: None,
    };
    let mut vello_renderer =
        vello::Renderer::new(&hal.device, renderer_options).expect("vello renderer");
    let directo = DirectoPipeline::new(&hal.device);

    // Prints a table title followed by the column header and separator rows.
    let print_table_header = |title: &str| {
        println!("{}", title);
        println!(" {:>10} | {:>14}", "N", "ms / frame");
        println!(" {:->10} + {:->14}", "", "");
    };
    // Prints one result row and flushes so progress is visible while the
    // next (slow) measurement runs; a failed flush is deliberately ignored.
    let print_row = |n: usize, ms: f64| {
        println!(" {:>10} | {:>14.3}", n, ms);
        let _ = std::io::stdout().flush();
    };

    println!();
    println!("spike GPU directo — target {W}×{H} Rgba8Unorm, headless");
    println!("warmup {WARMUP_FRAMES}, measured {MEASURED_FRAMES}");
    println!();

    print_table_header("vello (scene.fill por punto):");
    let mut vello_ref_ms: Option<f64> = None;
    for &count in VELLO_SIZES {
        let cloud = gen_points(count);
        let elapsed_ms = bench_vello(&hal, &mut vello_renderer, &target_view, &cloud);
        print_row(count, elapsed_ms);
        if count == VERDICT_SIZE {
            vello_ref_ms = Some(elapsed_ms);
        }
    }
    println!();

    print_table_header("GPU directo (instanced quad, 1 draw call):");
    let mut directo_ref_ms: Option<f64> = None;
    for &count in DIRECTO_SIZES {
        let cloud = gen_points(count);
        let elapsed_ms = bench_directo(&hal, &directo, &target_view, &cloud);
        print_row(count, elapsed_ms);
        if count == VERDICT_SIZE {
            directo_ref_ms = Some(elapsed_ms);
        }
    }
    println!();

    // The verdict is only printed when both tables contained the shared size.
    if let Some((vello_ms, directo_ms)) = vello_ref_ms.zip(directo_ref_ms) {
        let factor = vello_ms / directo_ms;
        let verdict = match factor {
            f if f >= 5.0 => "PASA",
            _ => "ABORTAR",
        };
        println!(
            "veredicto Fase 0 @ 100K: vello {:.2} ms / directo {:.2} ms = {:.2}×{}",
            vello_ms, directo_ms, factor, verdict
        );
        println!("(SDD pide ≥5× a 500K, pero vello no llega a 500K — techo medido <300K)");
    }
    println!();

    // Explicit drop: keeps the texture alive (and "used") until the very end.
    drop(target);
}
/// Creates the `W`×`H` 2D render target used by both backends and returns the
/// texture together with a default view onto it.
///
/// The texture is single-sampled, has one mip level, uses `TARGET_FORMAT`, and
/// is usable as a storage binding, a render attachment and a sampled texture.
fn create_target(device: &wgpu::Device) -> (wgpu::Texture, wgpu::TextureView) {
    let extent = wgpu::Extent3d {
        width: W,
        height: H,
        depth_or_array_layers: 1,
    };
    let usage = wgpu::TextureUsages::STORAGE_BINDING
        | wgpu::TextureUsages::RENDER_ATTACHMENT
        | wgpu::TextureUsages::TEXTURE_BINDING;
    let descriptor = wgpu::TextureDescriptor {
        label: Some("spike-target"),
        size: extent,
        mip_level_count: 1,
        sample_count: 1,
        dimension: wgpu::TextureDimension::D2,
        format: TARGET_FORMAT,
        usage,
        view_formats: &[],
    };
    let texture = device.create_texture(&descriptor);
    let default_view = texture.create_view(&wgpu::TextureViewDescriptor::default());
    (texture, default_view)
}
/// Generates `n` deterministic pseudo-random points `(x, y, rgba)` using the
/// Numerical Recipes LCG (no external dependencies).
///
/// `x` is an integer-valued float in `0..W`, `y` in `0..H`, and `rgba` is a
/// packed colour with R in the low byte and a fully opaque alpha (`0xFF`) in
/// the high byte. The same `n` always yields the same sequence.
///
/// Coordinates and colour are taken from the *high-order* bits of the LCG
/// state. With a power-of-two modulus the low `k` bits of an LCG only have
/// period `2^k`, and the low bits of the next state are a pure function of the
/// low bits of the current one: using `state % 1024` directly makes `y` a
/// function of `x` and collapses the whole cloud onto 1024 distinct pixels.
fn gen_points(n: usize) -> Vec<(f32, f32, u32)> {
    /// One LCG step; the modulus 2^32 comes from wrapping `u32` arithmetic.
    fn lcg_next(state: u32) -> u32 {
        state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223)
    }

    let mut state: u32 = 0x1234_5678;
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        state = lcg_next(state);
        // Discard the 16 weakest bits before reducing to the target width.
        let x = ((state >> 16) % W) as f32;
        state = lcg_next(state);
        let y = ((state >> 16) % H) as f32;
        state = lcg_next(state);
        // RGBA packed little-endian: R in the low byte, matching how the
        // shader reads it (`rgba & 0xFF` → R). The colour uses the top 24
        // state bits; alpha is forced to 0xFF.
        let rgba = (state >> 8) | 0xFF00_0000;
        out.push((x, y, rgba));
    }
    out
}
/// Benchmarks the vello path: for every frame the scene is rebuilt with one
/// 1×1 `fill` per point and rendered into `target`; returns the median
/// wall-clock time per frame in milliseconds.
///
/// `WARMUP_FRAMES + MEASURED_FRAMES` frames are rendered; only the last
/// `MEASURED_FRAMES` timings are kept. Each timing covers scene construction,
/// submission and the wait for the GPU to finish.
///
/// # Panics
/// Panics if vello fails to render or if waiting on the device fails, since
/// the timing sample would be meaningless in either case.
fn bench_vello(
    hal: &Hal,
    renderer: &mut vello::Renderer,
    target: &wgpu::TextureView,
    points: &[(f32, f32, u32)],
) -> f64 {
    let mut scene = vello::Scene::new();
    let mut samples: Vec<f64> = Vec::with_capacity(MEASURED_FRAMES);
    for frame in 0..(WARMUP_FRAMES + MEASURED_FRAMES) {
        let t0 = Instant::now();
        scene.reset();
        for &(x, y, rgba) in points {
            // R lives in the low byte, so little-endian byte order is R, G, B, A.
            let [r, g, b, a] = rgba.to_le_bytes();
            let xf = f64::from(x);
            let yf = f64::from(y);
            scene.fill(
                Fill::NonZero,
                Affine::IDENTITY,
                Color::from_rgba8(r, g, b, a),
                None,
                &Rect::new(xf, yf, xf + 1.0, yf + 1.0),
            );
        }
        renderer
            .render_to_texture(
                &hal.device,
                &hal.queue,
                &scene,
                target,
                &vello::RenderParams {
                    base_color: palette::css::BLACK,
                    width: W,
                    height: H,
                    antialiasing_method: vello::AaConfig::Area,
                },
            )
            .expect("vello render");
        // Block until the GPU has finished this frame. Without this we would
        // only measure submission + queue building, not the actual work. The
        // result is checked: a failed wait would yield a bogus sample.
        hal.device
            .poll(wgpu::PollType::wait_indefinitely())
            .expect("device poll");
        let dt = t0.elapsed().as_secs_f64() * 1000.0;
        if frame >= WARMUP_FRAMES {
            samples.push(dt);
        }
    }
    median(&mut samples)
}
/// Benchmarks the direct wgpu path: for every frame the points are packed
/// into instance data, uploaded, and drawn with a single instanced draw call
/// into `target`; returns the median wall-clock time per frame in
/// milliseconds.
///
/// `WARMUP_FRAMES + MEASURED_FRAMES` frames are rendered; only the last
/// `MEASURED_FRAMES` timings are kept. Each timing covers packing, upload,
/// submission and the wait for the GPU to finish.
///
/// # Panics
/// Panics if `points.len()` does not fit in the `u32` instance range of
/// `draw`, or if waiting on the device fails.
fn bench_directo(
    hal: &Hal,
    pipe: &DirectoPipeline,
    target: &wgpu::TextureView,
    points: &[(f32, f32, u32)],
) -> f64 {
    // `draw` takes a `u32` instance range; refuse to truncate silently.
    assert!(
        points.len() <= u32::MAX as usize,
        "instance count does not fit in u32"
    );
    let instance_count = points.len() as u32;

    // Instance buffer sized for exactly this run's point count.
    let bytes_per_inst = std::mem::size_of::<[u32; 3]>(); // [x:f32, y:f32, rgba:u32] = 12B
    let inst_buf = hal.device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("spike-directo-inst"),
        size: (points.len() * bytes_per_inst) as u64,
        usage: wgpu::BufferUsages::VERTEX | wgpu::BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
    let mut samples: Vec<f64> = Vec::with_capacity(MEASURED_FRAMES);
    for frame in 0..(WARMUP_FRAMES + MEASURED_FRAMES) {
        let t0 = Instant::now();
        // Pack the instances inside the timed region: this is the counterpart
        // of the "scene build" on the vello side, so the comparison is fair
        // (both start from the same raw points).
        let bytes = pack_instances(points);
        hal.queue.write_buffer(&inst_buf, 0, &bytes);
        let mut encoder = hal.device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("spike-directo-enc"),
        });
        {
            let mut pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
                label: Some("spike-directo-pass"),
                color_attachments: &[Some(wgpu::RenderPassColorAttachment {
                    view: target,
                    resolve_target: None,
                    depth_slice: None,
                    ops: wgpu::Operations {
                        load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
                        store: wgpu::StoreOp::Store,
                    },
                })],
                depth_stencil_attachment: None,
                timestamp_writes: None,
                occlusion_query_set: None,
            });
            pass.set_pipeline(&pipe.pipeline);
            pass.set_vertex_buffer(0, inst_buf.slice(..));
            // 6 vertices per instance (2 triangles = quad), N instances.
            pass.draw(0..6, 0..instance_count);
        }
        hal.queue.submit(std::iter::once(encoder.finish()));
        // Block until the GPU is done so the sample covers the real work; a
        // failed wait would yield a bogus sample, so the result is checked.
        hal.device
            .poll(wgpu::PollType::wait_indefinitely())
            .expect("device poll");
        let dt = t0.elapsed().as_secs_f64() * 1000.0;
        if frame >= WARMUP_FRAMES {
            samples.push(dt);
        }
    }
    median(&mut samples)
}
/// Serialises the points into the tightly packed instance layout consumed by
/// the direct pipeline: per point, `x` then `y` then `rgba`, each as four
/// native-endian bytes (12 bytes per instance).
fn pack_instances(points: &[(f32, f32, u32)]) -> Vec<u8> {
    const INSTANCE_BYTES: usize = 12;
    let mut packed = Vec::with_capacity(points.len() * INSTANCE_BYTES);
    for point in points.iter() {
        // `f32::to_bits` exposes the same bit pattern `to_ne_bytes` would emit.
        let words = [point.0.to_bits(), point.1.to_bits(), point.2];
        for word in words.iter() {
            packed.extend_from_slice(&word.to_ne_bytes());
        }
    }
    packed
}
/// Returns the median of `samples`, sorting the slice in place.
///
/// - Empty input yields `NaN` instead of panicking on an out-of-bounds index.
/// - Ordering uses `f64::total_cmp`, so a `NaN` sample cannot cause a panic.
/// - For an even number of samples the mean of the two middle values is
///   returned; for an odd number, the middle value.
fn median(samples: &mut [f64]) -> f64 {
    if samples.is_empty() {
        return f64::NAN;
    }
    samples.sort_unstable_by(f64::total_cmp);
    let mid = samples.len() / 2;
    if samples.len() % 2 == 1 {
        samples[mid]
    } else {
        (samples[mid - 1] + samples[mid]) / 2.0
    }
}
/// Minimal render pipeline for the benchmark: an instanced quad with no
/// textures and a per-instance colour. Not production code — it is the
/// "mock direct GPU" path that Phase 0 of the SDD asks for in order to
/// measure the achievable ceiling.
struct DirectoPipeline {
    pipeline: wgpu::RenderPipeline,
}

impl DirectoPipeline {
    /// Compiles the `WGSL` shader and builds the render pipeline on `device`.
    ///
    /// The pipeline takes a single per-instance vertex buffer with a 12-byte
    /// stride (`Float32x2` position at offset 0, `Uint32` colour at offset 8),
    /// draws triangle lists without culling, depth or blending, and writes all
    /// channels of a `TARGET_FORMAT` colour target.
    fn new(device: &wgpu::Device) -> Self {
        let shader_module = device.create_shader_module(wgpu::ShaderModuleDescriptor {
            label: Some("spike-directo-shader"),
            source: wgpu::ShaderSource::Wgsl(WGSL.into()),
        });

        // No bind groups and no push constants: everything comes per instance.
        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("spike-directo-layout"),
            bind_group_layouts: &[],
            push_constant_ranges: &[],
        });

        // Instance layout: [x: f32, y: f32, rgba: u32].
        let instance_attributes = [
            wgpu::VertexAttribute {
                format: wgpu::VertexFormat::Float32x2,
                offset: 0,
                shader_location: 0,
            },
            wgpu::VertexAttribute {
                format: wgpu::VertexFormat::Uint32,
                offset: 8,
                shader_location: 1,
            },
        ];
        let instance_buffers = [wgpu::VertexBufferLayout {
            array_stride: 12,
            step_mode: wgpu::VertexStepMode::Instance,
            attributes: &instance_attributes,
        }];
        let color_targets = [Some(wgpu::ColorTargetState {
            format: TARGET_FORMAT,
            blend: None,
            write_mask: wgpu::ColorWrites::ALL,
        })];

        let vertex_stage = wgpu::VertexState {
            module: &shader_module,
            entry_point: Some("vs"),
            compilation_options: Default::default(),
            buffers: &instance_buffers,
        };
        let fragment_stage = wgpu::FragmentState {
            module: &shader_module,
            entry_point: Some("fs"),
            compilation_options: Default::default(),
            targets: &color_targets,
        };
        let primitive_state = wgpu::PrimitiveState {
            topology: wgpu::PrimitiveTopology::TriangleList,
            strip_index_format: None,
            front_face: wgpu::FrontFace::Ccw,
            cull_mode: None,
            unclipped_depth: false,
            polygon_mode: wgpu::PolygonMode::Fill,
            conservative: false,
        };

        let descriptor = wgpu::RenderPipelineDescriptor {
            label: Some("spike-directo-pipeline"),
            layout: Some(&pipeline_layout),
            vertex: vertex_stage,
            primitive: primitive_state,
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            fragment: Some(fragment_stage),
            multiview: None,
            cache: None,
        };

        Self {
            pipeline: device.create_render_pipeline(&descriptor),
        }
    }
}
/// WGSL source for the direct pipeline (vertex entry point `vs`, fragment
/// entry point `fs`).
///
/// `vs` expands every instance into a 6-vertex quad whose corners sit ±0.75 px
/// around the pixel centre `inst.xy + 0.5`, converts pixel coordinates to NDC
/// with the Y axis flipped, and unpacks the `u32` colour byte by byte (R in
/// the low byte) into a normalised `vec4<f32>`. `fs` outputs that colour
/// unchanged.
///
/// The shader hard-codes the target size as `W = H = 1024.0`; it must be kept
/// in sync with the Rust-side `W`/`H` constants by hand.
///
/// NOTE(review): the quad is 1.5 px wide while the vello path fills a 1×1
/// rect per point, so the two backends do not cover exactly the same area —
/// confirm this is intentional.
const WGSL: &str = r#"
struct Inst {
@location(0) xy: vec2<f32>,
@location(1) rgba: u32,
};
struct V2F {
@builtin(position) pos: vec4<f32>,
@location(0) color: vec4<f32>,
};
const W: f32 = 1024.0;
const H: f32 = 1024.0;
@vertex
fn vs(@builtin(vertex_index) vid: u32, inst: Inst) -> V2F {
// Quad 1.5px alrededor de (inst.xy + 0.5). Pixel-centered.
var corners = array<vec2<f32>, 6>(
vec2<f32>(-0.75, -0.75),
vec2<f32>( 0.75, -0.75),
vec2<f32>( 0.75, 0.75),
vec2<f32>(-0.75, -0.75),
vec2<f32>( 0.75, 0.75),
vec2<f32>(-0.75, 0.75),
);
let off = corners[vid];
let px = inst.xy + vec2<f32>(0.5, 0.5) + off;
// pixel → NDC, Y invertido (vello / textura framebuffer).
let ndc = vec2<f32>(px.x / W * 2.0 - 1.0, 1.0 - px.y / H * 2.0);
let r = f32( inst.rgba & 0xFFu) / 255.0;
let g = f32((inst.rgba >> 8u) & 0xFFu) / 255.0;
let b = f32((inst.rgba >> 16u) & 0xFFu) / 255.0;
let a = f32((inst.rgba >> 24u) & 0xFFu) / 255.0;
var out: V2F;
out.pos = vec4<f32>(ndc, 0.0, 1.0);
out.color = vec4<f32>(r, g, b, a);
return out;
}
@fragment
fn fs(in: V2F) -> @location(0) vec4<f32> {
return in.color;
}
"#;