From e0ad7315be6df72431eb7bd621c96f8b84c58755 Mon Sep 17 00:00:00 2001 From: sergio Date: Wed, 20 May 2026 15:53:43 +0000 Subject: [PATCH] =?UTF-8?q?feat(verbo):=20verbo-mock=20=E2=80=94=20backend?= =?UTF-8?q?=20de=20embeddings=20determinista?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend sin modelo real: FNV-1a del texto siembra un LCG que genera el vector. Mismo texto → mismo vector siempre; textos distintos → vectores distintos. Dimensión configurable (default 384d, típica de modelos ligeros). Desbloquea desarrollar y testear los consumidores de verbo (fana-semantic, badu, chasqui) sin descargar modelos ONNX ni pegarle a Cohere. Los backends reales (cohere/bge/fastembed) son swaps de config. 4 tests verdes (determinismo, distinción, dimensión, batch). cargo check --workspace verde. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 9 ++ Cargo.toml | 1 + crates/modules/verbo/verbo-mock/Cargo.toml | 15 +++ crates/modules/verbo/verbo-mock/src/lib.rs | 113 +++++++++++++++++++++ 4 files changed, 138 insertions(+) create mode 100644 crates/modules/verbo/verbo-mock/Cargo.toml create mode 100644 crates/modules/verbo/verbo-mock/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 71d953a..9910d8d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13207,6 +13207,15 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "verbo-mock" +version = "0.1.0" +dependencies = [ + "async-trait", + "tokio", + "verbo-core", +] + [[package]] name = "version_check" version = "0.9.5" diff --git a/Cargo.toml b/Cargo.toml index 0aa2af7..186a6c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,6 +116,7 @@ members = [ # modules/verbo/ — Provider de embeddings model-agnostic # ============================================================ "crates/modules/verbo/verbo-core", + "crates/modules/verbo/verbo-mock", # ============================================================ # modules/nakui/ — ERP matemático (categórico) diff --git 
a/crates/modules/verbo/verbo-mock/Cargo.toml b/crates/modules/verbo/verbo-mock/Cargo.toml
new file mode 100644
index 0000000..d9d0335
--- /dev/null
+++ b/crates/modules/verbo/verbo-mock/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "verbo-mock"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+authors.workspace = true
+publish.workspace = true
+description = "verbo — backend de embeddings determinista (sin modelo real). Mismo texto → mismo vector. Para desarrollar y testear consumidores sin descargar modelos."
+
+[dependencies]
+verbo-core = { path = "../verbo-core" }
+async-trait = { workspace = true }
+
+[dev-dependencies]
+tokio = { workspace = true }
diff --git a/crates/modules/verbo/verbo-mock/src/lib.rs b/crates/modules/verbo/verbo-mock/src/lib.rs
new file mode 100644
index 0000000..9754ffc
--- /dev/null
+++ b/crates/modules/verbo/verbo-mock/src/lib.rs
@@ -0,0 +1,113 @@
+//! `verbo-mock` — deterministic embeddings backend.
+//!
+//! Loads no model: it hashes the text and generates the vector with an LCG
+//! seeded by that hash. Same text → same vector, always. Different texts →
+//! different vectors (barring hash collisions). Useful for developing and
+//! testing the consumers of `verbo` (fana-semantic, badu, chasqui) without
+//! downloading ONNX models or hitting the Cohere API.
+
+#![forbid(unsafe_code)]
+
+use async_trait::async_trait;
+use verbo_core::{EmbedError, EmbeddingVector, ModelId, Provider};
+
+/// Deterministic provider. The dimension is configurable.
+pub struct MockProvider {
+    model: ModelId,
+}
+
+impl MockProvider {
+    /// Creates a mock provider of the given dimension.
+    pub fn new(dimension: usize) -> Self {
+        Self {
+            model: ModelId::new(format!("verbo-mock-{dimension}d"), dimension),
+        }
+    }
+}
+
+impl Default for MockProvider {
+    /// 384-d mock — the typical dimension of lightweight models (MiniLM).
+    fn default() -> Self {
+        Self::new(384)
+    }
+}
+
+/// 64-bit FNV-1a over the bytes of the text.
+fn fnv1a(text: &str) -> u64 {
+    let mut h: u64 = 0xcbf2_9ce4_8422_2325; // FNV-1a 64-bit offset basis
+    for b in text.bytes() {
+        h ^= b as u64;
+        h = h.wrapping_mul(0x100_0000_01b3); // FNV 64-bit prime
+    }
+    h
+}
+
+/// Generates `dim` values in `[-1, 1)` with an LCG seeded by `seed` (same seed → same vector).
+fn lcg_vector(seed: u64, dim: usize) -> Vec<f32> {
+    let mut state = seed;
+    let mut out = Vec::with_capacity(dim);
+    for _ in 0..dim {
+        state = state
+            .wrapping_mul(6364136223846793005) // multiplier of Knuth's MMIX LCG
+            .wrapping_add(1442695040888963407); // increment of Knuth's MMIX LCG
+        // Top 24 bits (exactly representable in f32) → f32 in [0, 1) → rescaled to [-1, 1).
+        let unit = (state >> 40) as f32 / (1u64 << 24) as f32;
+        out.push(unit * 2.0 - 1.0);
+    }
+    out
+}
+
+#[async_trait]
+impl Provider for MockProvider {
+    fn model_id(&self) -> &ModelId {
+        &self.model
+    }
+
+    async fn embed(&self, text: &str) -> Result<EmbeddingVector, EmbedError> {
+        let values = lcg_vector(fnv1a(text), self.model.dimension); // text hash seeds the LCG
+        EmbeddingVector::new(self.model.clone(), values)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn same_text_yields_same_vector() {
+        let p = MockProvider::new(16);
+        let a = p.embed("hola mundo").await.unwrap();
+        let b = p.embed("hola mundo").await.unwrap();
+        assert_eq!(a.values, b.values);
+    }
+
+    #[tokio::test]
+    async fn different_text_yields_different_vector() {
+        let p = MockProvider::new(64);
+        let a = p.embed("alpha").await.unwrap();
+        let b = p.embed("beta").await.unwrap();
+        assert_ne!(a.values, b.values);
+        // And they are comparable (same model).
+        assert!(a.cosine(&b).is_ok());
+    }
+
+    #[tokio::test]
+    async fn vector_has_configured_dimension() {
+        let p = MockProvider::new(384);
+        let v = p.embed("x").await.unwrap();
+        assert_eq!(v.values.len(), 384);
+        assert_eq!(v.model.dimension, 384);
+    }
+
+    #[tokio::test]
+    async fn batch_matches_individual() {
+        let p = MockProvider::new(32);
+        let batch = p
+            .embed_batch(&["uno".into(), "dos".into()])
+            .await
+            .unwrap();
+        let single = p.embed("uno").await.unwrap();
+        assert_eq!(batch[0].values, single.values);
+        assert_eq!(batch.len(), 2);
+    }
+}