diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f7cc2f..0585b6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,96 @@ ratio/diff ver `git show `. ## 2026-05-09 +### feat(minga-core): α-hashing per-language para Python, TypeScript, JavaScript, Go +Cierra el último pendiente fundamentado del CHANGELOG. Cada lenguaje +soportado por `minga` tiene ahora su propio profile α-equivalente — +dos versiones del mismo programa que difieren sólo en nombres de +variables ligadas producen el mismo hash, no importa el lenguaje. +Refactorings tipo "rename variable" no inflan el storage del repo +en ningún dialecto. + +Refactor de `alpha.rs` (639 LOC) a módulo `alpha/`: +- **`alpha/common.rs`**: primitives compartidos (TAG_*, write_kind_and_field, + emit_leaf_marker, emit_binder_body, emit_identifier_ref, push_identifier_name). + Garantiza que el formato wire del hash sea bit-equivalente entre + todos los profiles. +- **`alpha/rust.rs`**: la lógica de Rust (movida desde alpha.rs sin + cambios funcionales). +- **`alpha/python.rs`**: nuevo. +- **`alpha/ecmascript.rs`**: nuevo (cubre TypeScript + JavaScript; + comparten la mayoría de los kinds). +- **`alpha/go.rs`**: nuevo. +- **`alpha/mod.rs`**: re-exporta `hash_node_alpha` (Rust legacy) + + expone `hash_alpha_with(dialect, node)` que despacha al profile + correspondiente. + +Cobertura per-language: + +**Python** (`def`, `lambda`, `for`, comprehensions, `with`): +- `function_definition` y `lambda`: parámetros (incluyendo + typed_parameter, default_parameter, *args, **kwargs) introducen + binders al body. El nombre de la función NO es α-anónimo. +- `for_statement`: el `left` (identifier o tuple) introduce + binder(es) al body. +- `list_comprehension`, `set_comprehension`, `dictionary_comprehension`, + `generator_expression`: cada `for_in_clause` añade binders que + viven en el body + clauses siguientes (semántica de scope + incremental de Python). 
+- `with_statement`: `as` introduce binder al body (recursando en + `as_pattern_target` para llegar al identifier). + +**ECMAScript** (TS + JS): +- `function_declaration`, `function_expression`, `method_definition`, + `generator_function_*`: parameters → body. Soporta TS + `required_parameter` y `optional_parameter` (`x: number`, + `x?: number`). +- `arrow_function`: tanto `(x, y) => body` como shorthand `x => body`. +- `statement_block`: `lexical_declaration` (let/const) y + `variable_declaration` (var) introducen binders al resto del block. +- `for_in_statement` (cubre `for-of` y `for-in`): `left` → body. +- `for_statement` (C-style): initializer (lexical decl) introduce + binders al condition + increment + body. +- `catch_clause`: parameter → body. + +**Go**: +- `function_declaration`, `method_declaration`, `func_literal` (closure): + `parameter_list` → body. `parameter_declaration` con varios names + agrupa varios binders bajo un mismo tipo (`a, b int`). +- `block`: `short_var_declaration` (`x := ...`) introduce binders + al resto. +- `for_statement` con `range_clause` (`for k, v := range m`): los + identifiers del `left` son binders al body. +- `for_statement` con `for_clause` (C-style): initializer → body. +- `if_statement` con `initializer` (`if x := init(); x > 0`): + binders viven en condition + consequence + alternative. + +API: +- `hash_alpha_with(Dialect, &SemanticNode) -> ContentHash` — + despacho per-dialect. +- `hash_node_alpha(&SemanticNode) -> ContentHash` — alias histórico + asume Rust (back-compat). + +Tests: 26 nuevos en `tests/alpha_polyglot.rs`: +- Python (9): def rename, lambda rename, for-loop rename, list comp, + nested comp, with rename, function name matters, iterable name + matters, sanity negativo (operación distinta → hash distinto). +- JS/TS (10): function rename, function name matters, arrow rename, + arrow shorthand rename, let/const rename, for-of rename, classic + for rename, catch rename, TS typed param rename, TS type matters. 
+- Go (6): function rename, function name matters, short var decl + rename, range_clause rename, if-init rename, func_literal closure + rename. +- Cross-language (1): mismos shapes en lenguajes distintos + producen hashes distintos (sanity para evitar colisiones). + +141 tests verdes en minga-core (115 antes; +26 polyglot). Refactor +sin regresión: 36 α-Rust tests siguen pasando. + +Pendientes que quedan en Minga (orden de prioridad): +- `minga-vfs` FUSE (proyecto independiente, scope grande). +- Cobertura adicional por-lenguaje: Python class, JS destructuring, + Go type_switch, etc. — cada uno pequeño, no urgente. + ### feat(minga-core): cierre del α-hashing de Rust — if let, while let, let-else, or-pattern, let-chains Cierra los 5 pendientes documentados en `alpha.rs`. El hash α-equivalente ahora es estable bajo renombre de TODOS los binders diff --git a/crates/modules/semantic_dht/minga-core/src/alpha/common.rs b/crates/modules/semantic_dht/minga-core/src/alpha/common.rs new file mode 100644 index 0000000..ebb7026 --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/src/alpha/common.rs @@ -0,0 +1,105 @@ +//! Primitives compartidos entre todos los profiles α-hashing. +//! +//! Cada profile per-language (rust, python, ecmascript, go) tiene su +//! propia lógica de "qué nodos introducen binders" y "cómo distinguir +//! binders de constructors". Pero el formato del wire del hash +//! (TAG_LEAF, TAG_BINDER, índice de Bruijn) es universal: lo emitimos +//! desde acá para garantizar que dos lenguajes con la misma +//! estructura semántica produzcan hashes comparables a nivel de bits. + +use crate::ast::SemanticNode; +use blake3::Hasher; + +pub const TAG_NO_LEAF: u8 = 0; +pub const TAG_LEAF: u8 = 1; +pub const TAG_BINDER: u8 = 2; +pub const TAG_REF_BOUND: u8 = 3; +pub const TAG_REF_FREE: u8 = 4; + +/// Emite el kind del nodo + presencia/ausencia de field_name. 
+pub fn write_kind_and_field(h: &mut Hasher, node: &SemanticNode) { + write_str(h, &node.kind); + match &node.field_name { + Some(f) => { + h.update(&[1]); + write_str(h, f); + } + None => { + h.update(&[0]); + } + } +} + +pub fn write_str(h: &mut Hasher, s: &str) { + h.update(&(s.len() as u64).to_le_bytes()); + h.update(s.as_bytes()); +} + +/// Emite el marker de leaf: TAG_LEAF + bytes del leaf si lo hay, +/// TAG_NO_LEAF si no. +pub fn emit_leaf_marker(h: &mut Hasher, node: &SemanticNode) { + match &node.leaf_text { + Some(t) => { + h.update(&[TAG_LEAF]); + h.update(&(t.len() as u64).to_le_bytes()); + h.update(t); + } + None => { + h.update(&[TAG_NO_LEAF]); + } + } +} + +/// Emite un binder anónimo: el contenido textual NO afecta el hash. +/// Esta es la primitiva de α-equivalencia: dos términos que sólo +/// difieren en nombres de variables ligadas hashean idénticos. +pub fn emit_binder_body(h: &mut Hasher) { + h.update(&[TAG_NO_LEAF]); + h.update(&[TAG_BINDER]); + h.update(&[0u8; 8]); +} + +/// Emite el kind del nodo + binder body. Atajo para nodos cuyo único +/// rol es ser binder (e.g. un identifier en posición de pattern). +pub fn emit_binder_node(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + emit_binder_body(h); +} + +/// Emite un identifier referencia: si está en scope, índice de +/// Bruijn (offset desde la cima); si no, nombre literal (variable +/// libre). 
+pub fn emit_identifier_ref(h: &mut Hasher, node: &SemanticNode, scope: &[String]) { + h.update(&[TAG_NO_LEAF]); + if let Some(t) = &node.leaf_text { + if let Ok(name) = std::str::from_utf8(t) { + if let Some(i) = scope.iter().rposition(|n| n == name) { + let de_bruijn = (scope.len() - 1 - i) as u64; + h.update(&[TAG_REF_BOUND]); + h.update(&de_bruijn.to_le_bytes()); + } else { + h.update(&[TAG_REF_FREE]); + h.update(&(t.len() as u64).to_le_bytes()); + h.update(t); + } + } else { + h.update(&[TAG_REF_FREE]); + h.update(&(t.len() as u64).to_le_bytes()); + h.update(t); + } + } else { + h.update(&[TAG_REF_FREE]); + h.update(&[0u8; 8]); + } + h.update(&[0u8; 8]); +} + +/// Push el nombre del identifier al vector de binders, si tiene +/// leaf_text válido. Helper común para todos los `collect_binders`. +pub fn push_identifier_name(node: &SemanticNode, out: &mut Vec) { + if let Some(t) = &node.leaf_text { + if let Ok(s) = std::str::from_utf8(t) { + out.push(s.to_string()); + } + } +} diff --git a/crates/modules/semantic_dht/minga-core/src/alpha/ecmascript.rs b/crates/modules/semantic_dht/minga-core/src/alpha/ecmascript.rs new file mode 100644 index 0000000..7b89864 --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/src/alpha/ecmascript.rs @@ -0,0 +1,365 @@ +//! α-hashing per-language para JavaScript / TypeScript. +//! +//! Las dos gramáticas comparten la mayoría de los kinds (TypeScript +//! es JS + type annotations), así que un solo profile las cubre. El +//! caller (`hash_alpha_with`) despacha tanto `Dialect::JavaScript` +//! como `Dialect::TypeScript` acá. +//! +//! Cobertura: +//! - **`function_declaration`**, **`function_expression`**, +//! **`method_definition`**, **`generator_function_declaration`**: +//! parameters introducen binders al body. +//! - **`arrow_function`**: parameters (formal_parameters O identifier +//! directo si es shorthand `x => ...`) introducen binder(es) al body. +//! 
- **`statement_block`**: cualquier `lexical_declaration` (let/const) +//! o `variable_declaration` (var) dentro del block introduce binders +//! al resto del block. +//! - **`for_in_statement`** (cubre tanto `for (x in obj)` como +//! `for (x of arr)` en tree-sitter-javascript): el `left` es +//! binder al `body`. +//! - **`for_statement`**: el `initializer` (lexical_declaration) +//! introduce binder(es) al `condition`, `increment` y `body`. +//! - **`catch_clause`**: el `parameter` introduce binder al `body`. +//! +//! TypeScript-specific: `type` annotations (`x: number`) viajan como +//! children con field=type que se feedean por el path normal — el +//! tipo afecta el hash (cambiar de `number` a `string` rompe +//! α-equivalencia, intencionalmente). +//! +//! Pendientes (scope acotado): +//! - Destructuring (`const {a, b} = obj`, `const [x, y] = arr`). +//! - Class fields y constructor con `this.x = ...`. +//! - Hoisting de `var` a function scope (hoy se trata como block-scoped). + +use crate::alpha::common::{ + emit_binder_body, emit_identifier_ref, emit_leaf_marker, push_identifier_name, + write_kind_and_field, TAG_NO_LEAF, +}; +use crate::ast::SemanticNode; +use crate::cas::ContentHash; +use blake3::Hasher; + +pub fn hash_node_alpha_ecmascript(node: &SemanticNode) -> ContentHash { + let mut h = Hasher::new(); + let mut scope: Vec = Vec::new(); + feed(&mut h, node, &mut scope); + ContentHash(*h.finalize().as_bytes()) +} + +fn feed(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + match node.kind.as_str() { + "function_declaration" + | "function_expression" + | "generator_function_declaration" + | "generator_function" + | "method_definition" => feed_callable(h, node, scope), + "arrow_function" => feed_arrow(h, node, scope), + "statement_block" => feed_block(h, node, scope), + "for_in_statement" => feed_for_in(h, node, scope), + "for_statement" => feed_for(h, node, scope), + "catch_clause" => feed_catch(h, node, scope), 
+ // Lexical declarations dispatcheadas también desde feed + // general, no sólo desde feed_block. Necesario para + // for_statement (initializer) y otros contextos donde una + // declaration aparece sin ser hijo directo de un block. + "lexical_declaration" | "variable_declaration" => feed_var_decl(h, node, scope), + "identifier" => emit_identifier_ref(h, node, scope), + _ => feed_default(h, node, scope), + } +} + +fn feed_default(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + emit_leaf_marker(h, node); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + feed(h, c, scope); + } +} + +/// Callable estándar: parameters → body. +fn feed_callable(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("parameters") { + collect_formal_param_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameters") => feed_formal_params(h, c, scope), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +/// Arrow function: dos formas. `x => body` (single identifier) o +/// `(x, y) => body` (formal_parameters). Detectamos cuál. +fn feed_arrow(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameter") => { + // `x => ...` — el identifier solo. 
+ if c.kind == "identifier" { + push_identifier_name(c, &mut binders); + } + } + Some("parameters") => { + collect_formal_param_binders(c, &mut binders); + } + _ => {} + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameter") => emit_arrow_single_binder(h, c), + Some("parameters") => feed_formal_params(h, c, scope), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +fn emit_arrow_single_binder(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + if node.kind == "identifier" { + emit_binder_body(h); + } else { + // Otra forma (rare); fallback al feed normal sin binder. + emit_leaf_marker(h, node); + h.update(&(node.children.len() as u64).to_le_bytes()); + } +} + +/// Statement block: `let`/`const`/`var` declarations introducen +/// binders al resto del block (lexical scope). +fn feed_block(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let scope_before = scope.len(); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.kind.as_str() { + "lexical_declaration" | "variable_declaration" => { + feed_var_decl(h, c, scope); + collect_var_decl_binders(c, scope); + } + _ => feed(h, c, scope), + } + } + scope.truncate(scope_before); +} + +/// Procesa una let/const/var declaration: el `value` se evalúa en el +/// scope previo (los binders aún no existen para sí mismos); el +/// `name` se emite como binder anónimo. 
+fn feed_var_decl(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "variable_declarator" { + feed_declarator(h, c, scope); + } else { + feed(h, c, scope); + } + } +} + +fn feed_declarator(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("name") if c.kind == "identifier" => emit_named_binder(h, c), + _ => feed(h, c, scope), + } + } +} + +fn collect_var_decl_binders(node: &SemanticNode, out: &mut Vec) { + for c in &node.children { + if c.kind == "variable_declarator" { + for cc in &c.children { + if cc.field_name.as_deref() == Some("name") && cc.kind == "identifier" { + push_identifier_name(cc, out); + } + } + } + } +} + +/// `for (x of arr)` o `for (x in obj)`. left = identifier (con +/// posible kind=const/let prefix para lexical decl), right = expr, +/// body = block. +fn feed_for_in(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("left") && c.kind == "identifier" { + push_identifier_name(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("left") if c.kind == "identifier" => emit_named_binder(h, c), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +/// `for (let i = 0; i < n; i++) { body }`. El initializer (lexical +/// decl) introduce binders que viven en condition + increment + body. 
+fn feed_for(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("initializer") + && (c.kind == "lexical_declaration" || c.kind == "variable_declaration") + { + collect_var_decl_binders(c, &mut binders); + } + } + + let scope_before = scope.len(); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("initializer") => { + feed(h, c, scope); + // Tras procesar el initializer extendemos scope para + // que condition/increment/body lo vean. + scope.extend(binders.iter().cloned()); + } + _ => feed(h, c, scope), + } + } + scope.truncate(scope_before); +} + +/// `catch (e) { body }`. parameter es identifier → binder al body. +fn feed_catch(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("parameter") && c.kind == "identifier" { + push_identifier_name(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameter") if c.kind == "identifier" => emit_named_binder(h, c), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +/// formal_parameters de function declarations. Soporta: +/// - `identifier` (param simple). +/// - `required_parameter` (TypeScript: `x: number`). +/// - `optional_parameter` (TypeScript: `x?: number`). +/// - `rest_pattern` / `rest_parameter` (`...rest`). 
+fn feed_formal_params(h: &mut Hasher, params: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, params); + h.update(&[TAG_NO_LEAF]); + h.update(&(params.children.len() as u64).to_le_bytes()); + for c in ¶ms.children { + match c.kind.as_str() { + "identifier" => emit_named_binder(h, c), + "required_parameter" | "optional_parameter" => { + feed_typed_param(h, c, scope); + } + "rest_pattern" | "rest_parameter" => { + feed_rest_param(h, c, scope); + } + _ => feed(h, c, scope), + } + } +} + +fn feed_typed_param(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + let mut named_binder = false; + for c in &node.children { + if !named_binder && c.kind == "identifier" { + emit_named_binder(h, c); + named_binder = true; + } else { + feed(h, c, scope); + } + } +} + +fn feed_rest_param(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "identifier" { + emit_named_binder(h, c); + } else { + feed(h, c, scope); + } + } +} + +fn collect_formal_param_binders(params: &SemanticNode, out: &mut Vec) { + for c in ¶ms.children { + match c.kind.as_str() { + "identifier" => push_identifier_name(c, out), + "required_parameter" | "optional_parameter" | "rest_pattern" | "rest_parameter" => { + if let Some(ident) = c.children.iter().find(|cc| cc.kind == "identifier") { + push_identifier_name(ident, out); + } + } + _ => {} + } + } +} + +fn emit_named_binder(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + emit_binder_body(h); +} diff --git a/crates/modules/semantic_dht/minga-core/src/alpha/go.rs b/crates/modules/semantic_dht/minga-core/src/alpha/go.rs new file mode 100644 index 0000000..396a61c --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/src/alpha/go.rs @@ -0,0 
+1,283 @@ +//! α-hashing per-language para Go. +//! +//! Cobertura: +//! - **`function_declaration`**, **`method_declaration`**, +//! **`func_literal`** (closure): `parameter_list` introduce +//! binder(es) al `body`. +//! - **`parameter_declaration`**: puede agrupar varios names con un +//! tipo (`a, b int`). Cada `name` es binder; `type` viaja como +//! referencia. +//! - **`block`**: `short_var_declaration` (`x := ...`) introduce +//! binders al resto del block. +//! - **`for_statement`** con **`range_clause`** (`for k, v := range m`): +//! los identifiers del `left` son binders al `body`. +//! - **`for_statement`** con **`for_clause`** (C-style `for i := 0; i < n; i++`): +//! el `initializer` (short_var_declaration) introduce binders al +//! condition + update + body. +//! - **`if_statement`** con **`initializer`**: binders del +//! short_var_declaration viven en condition + consequence + alternative. +//! +//! Pendientes (scope acotado): +//! - `var_declaration` (`var x = ...`) tratado como literal por +//! ahora; introduce binder al scope envolvente igual que +//! short_var_declaration pero distinto kind. +//! - `type_switch_statement` con assertion binding. +//! - `select` statements con send/receive binding. 
+ +use crate::alpha::common::{ + emit_binder_body, emit_identifier_ref, emit_leaf_marker, push_identifier_name, + write_kind_and_field, TAG_NO_LEAF, +}; +use crate::ast::SemanticNode; +use crate::cas::ContentHash; +use blake3::Hasher; + +pub fn hash_node_alpha_go(node: &SemanticNode) -> ContentHash { + let mut h = Hasher::new(); + let mut scope: Vec = Vec::new(); + feed(&mut h, node, &mut scope); + ContentHash(*h.finalize().as_bytes()) +} + +fn feed(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + match node.kind.as_str() { + "function_declaration" | "method_declaration" | "func_literal" => { + feed_callable(h, node, scope) + } + "block" => feed_block(h, node, scope), + "for_statement" => feed_for_statement(h, node, scope), + "if_statement" => feed_if_statement(h, node, scope), + // Dispatcheados también fuera de block/for/if para que sus + // identifiers se emitan como binders cuando aparecen en + // contextos como range_clause o initializer de if/for. + "short_var_declaration" => feed_short_var_decl(h, node, scope), + "range_clause" => feed_range_clause(h, node, scope), + "identifier" => emit_identifier_ref(h, node, scope), + _ => feed_default(h, node, scope), + } +} + +/// `for k, v := range m` — el `left` (expression_list) tiene +/// identifiers que son binders. El `right` se evalúa como referencia +/// normal (es la fuente de iteración). 
+fn feed_range_clause(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.field_name.as_deref() == Some("left") { + feed_short_var_left(h, c); + } else { + feed(h, c, scope); + } + } +} + +fn feed_default(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + emit_leaf_marker(h, node); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + feed(h, c, scope); + } +} + +fn feed_callable(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("parameters") { + collect_parameter_list_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameters") => feed_parameter_list(h, c, scope), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +fn feed_parameter_list(h: &mut Hasher, params: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, params); + h.update(&[TAG_NO_LEAF]); + h.update(&(params.children.len() as u64).to_le_bytes()); + for c in ¶ms.children { + if c.kind == "parameter_declaration" { + feed_parameter_declaration(h, c, scope); + } else { + feed(h, c, scope); + } + } +} + +/// `a, b int` — todos los `name=identifier` son binders; `type` +/// viaja como referencia normal (puede mencionar tipos importados). 
+fn feed_parameter_declaration(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.field_name.as_deref() == Some("name") && c.kind == "identifier" { + emit_named_binder(h, c); + } else { + feed(h, c, scope); + } + } +} + +fn collect_parameter_list_binders(params: &SemanticNode, out: &mut Vec) { + for c in ¶ms.children { + if c.kind == "parameter_declaration" { + for cc in &c.children { + if cc.field_name.as_deref() == Some("name") && cc.kind == "identifier" { + push_identifier_name(cc, out); + } + } + } + } +} + +/// Block: `short_var_declaration` introduce binders al resto. +fn feed_block(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let scope_before = scope.len(); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "short_var_declaration" { + feed_short_var_decl(h, c, scope); + collect_short_var_binders(c, scope); + } else { + feed(h, c, scope); + } + } + scope.truncate(scope_before); +} + +fn feed_short_var_decl(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.field_name.as_deref() == Some("left") { + feed_short_var_left(h, c); + } else { + feed(h, c, scope); + } + } +} + +fn feed_short_var_left(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "identifier" { + emit_named_binder(h, c); + } else { + // separadores ',' y otros tokens — emit literal. 
+            emit_leaf_marker(h, c);
+            h.update(&(c.children.len() as u64).to_le_bytes());
+        }
+    }
+}
+
+/// Appends to `out`, in source order, the text of every `identifier`
+/// found directly under the `left` child of `node`.
+fn collect_short_var_binders(node: &SemanticNode, out: &mut Vec<String>) {
+    for c in &node.children {
+        if c.field_name.as_deref() == Some("left") {
+            for cc in &c.children {
+                if cc.kind == "identifier" {
+                    push_identifier_name(cc, out);
+                }
+            }
+        }
+    }
+}
+
+/// `for k, v := range m { body }` or `for i := 0; i < n; i++ { body }`.
+///
+/// Binders come from the `left` of a `range_clause`, or from the
+/// `short_var_declaration` initializer of a `for_clause`. They are in
+/// scope for the `body`; for a `for_clause` they are also in scope for
+/// the clause children that follow the initializer (condition and
+/// update), so that renaming the loop variable leaves the hash
+/// unchanged.
+fn feed_for_statement(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec<String>) {
+    h.update(&[TAG_NO_LEAF]);
+
+    let mut binders: Vec<String> = Vec::new();
+    for c in &node.children {
+        match c.kind.as_str() {
+            "range_clause" => {
+                for cc in &c.children {
+                    if cc.field_name.as_deref() == Some("left") {
+                        for ccc in &cc.children {
+                            if ccc.kind == "identifier" {
+                                push_identifier_name(ccc, &mut binders);
+                            }
+                        }
+                    }
+                }
+            }
+            "for_clause" => {
+                for cc in &c.children {
+                    if cc.field_name.as_deref() == Some("initializer")
+                        && cc.kind == "short_var_declaration"
+                    {
+                        collect_short_var_binders(cc, &mut binders);
+                    }
+                }
+            }
+            _ => {}
+        }
+    }
+
+    h.update(&(node.children.len() as u64).to_le_bytes());
+    for c in &node.children {
+        if c.field_name.as_deref() == Some("body") {
+            let scope_before = scope.len();
+            scope.extend(binders.iter().cloned());
+            feed(h, c, scope);
+            scope.truncate(scope_before);
+        } else if c.kind == "for_clause" {
+            feed_for_clause(h, c, scope, &binders);
+        } else {
+            feed(h, c, scope);
+        }
+    }
+}
+
+/// Hashes a C-style `for_clause`. Emits the same bytes the generic
+/// path (`feed` + `feed_default`) would, except that `binders` are
+/// pushed onto `scope` right after the `initializer` child, so
+/// references to the loop variable in the following children resolve
+/// as bound instead of as free names.
+fn feed_for_clause(
+    h: &mut Hasher,
+    node: &SemanticNode,
+    scope: &mut Vec<String>,
+    binders: &[String],
+) {
+    write_kind_and_field(h, node);
+    emit_leaf_marker(h, node);
+    h.update(&(node.children.len() as u64).to_le_bytes());
+
+    let scope_before = scope.len();
+    for c in &node.children {
+        feed(h, c, scope);
+        if c.field_name.as_deref() == Some("initializer") {
+            // The initializer itself is hashed with the outer scope;
+            // only what comes after it sees the new names.
+            scope.extend(binders.iter().cloned());
+        }
+    }
+    scope.truncate(scope_before);
+}
+
+/// `if x := init(); cond { ... } else { ... }`. The initializer
+/// introduces binders that live in condition + consequence +
+/// alternative.
+fn feed_if_statement(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("initializer") + && c.kind == "short_var_declaration" + { + collect_short_var_binders(c, &mut binders); + } + } + + let scope_before = scope.len(); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("initializer") => { + feed(h, c, scope); + scope.extend(binders.iter().cloned()); + } + _ => feed(h, c, scope), + } + } + scope.truncate(scope_before); +} + +fn emit_named_binder(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + emit_binder_body(h); +} diff --git a/crates/modules/semantic_dht/minga-core/src/alpha/mod.rs b/crates/modules/semantic_dht/minga-core/src/alpha/mod.rs new file mode 100644 index 0000000..3df7128 --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/src/alpha/mod.rs @@ -0,0 +1,43 @@ +//! Hash α-equivalente per-language. +//! +//! Cada dialecto soportado por [`crate::parse`] tiene su propio +//! profile en este módulo. Todos comparten primitives de wire en +//! [`common`] para garantizar comparabilidad bit-a-bit del hash +//! entre lenguajes con la misma estructura semántica. +//! +//! ## API +//! +//! - [`hash_node_alpha`] — alias histórico. Asume Rust. Mantenido +//! por compat con callers viejos (`alpha::hash_node_alpha` sigue +//! apuntando a Rust). +//! - [`hash_alpha_with`] — toma [`crate::parse::Dialect`] y delega +//! al profile correspondiente. + +pub mod common; +pub mod ecmascript; +pub mod go; +pub mod python; +pub mod rust; + +pub use rust::hash_node_alpha; + +use crate::ast::SemanticNode; +use crate::cas::ContentHash; +use crate::parse::Dialect; + +/// Calcula el hash α-equivalente de `node` usando el profile del +/// `dialect`. 
Cada profile entiende los binders propios de su +/// lenguaje (def/lambda/comprehensions en Python, function/arrow en +/// JS/TS, func/range en Go, etc.). +/// +/// Para callers que ya saben que están en Rust, [`hash_node_alpha`] +/// es atajo equivalente. +pub fn hash_alpha_with(dialect: Dialect, node: &SemanticNode) -> ContentHash { + match dialect { + Dialect::Rust => rust::hash_node_alpha(node), + Dialect::Python => python::hash_node_alpha_python(node), + Dialect::TypeScript => ecmascript::hash_node_alpha_ecmascript(node), + Dialect::JavaScript => ecmascript::hash_node_alpha_ecmascript(node), + Dialect::Go => go::hash_node_alpha_go(node), + } +} diff --git a/crates/modules/semantic_dht/minga-core/src/alpha/python.rs b/crates/modules/semantic_dht/minga-core/src/alpha/python.rs new file mode 100644 index 0000000..7e5f713 --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/src/alpha/python.rs @@ -0,0 +1,387 @@ +//! α-hashing per-language para Python. +//! +//! Cobertura: +//! - **`function_definition`** y **`lambda`**: parámetros introducen +//! binders al body. Soporta defaults (`def f(x=1)`) y type hints +//! (`def f(x: int)`) — el binder es el identifier; el default y el +//! type viajan como expresiones referenciables al scope previo. +//! - **`for_statement`**: el `left` (identifier o tuple_pattern) +//! introduce binder(es) al `body`. +//! - **Comprehensions**: `list_comprehension`, `set_comprehension`, +//! `dictionary_comprehension`, `generator_expression`. Cada +//! `for_in_clause` introduce binder(es) que viven en el `body` + +//! `if_clause`s + `for_in_clause`s siguientes (semántica de scope +//! incremental de Python). +//! - **`with_statement`**: `with X() as y:` introduce `y` al body. +//! +//! Python NO distingue binders por capitalización (a diferencia de +//! Rust con `Some` vs `x`). En posición de parámetro/for-target, +//! todo identifier es binder. +//! +//! Pendientes (no cubiertos hoy, scope acotado): +//! 
- `class_definition` y métodos (`self` no es binder explícito en +//! la firma; el primer parámetro recibe nombre arbitrario). +//! - `assignment` como introductor de scope (Python no tiene `let` +//! explícito; un `x = 1` agrega x al scope global o local del +//! bloque envolvente — manejarlo bien requiere análisis de scope +//! que va más allá del α-hashing tradicional). +//! - Nested defaults, walrus operator (`:=`), starred patterns. + +use crate::alpha::common::{ + emit_binder_body, emit_identifier_ref, emit_leaf_marker, push_identifier_name, + write_kind_and_field, TAG_NO_LEAF, +}; +use crate::ast::SemanticNode; +use crate::cas::ContentHash; +use blake3::Hasher; + +pub fn hash_node_alpha_python(node: &SemanticNode) -> ContentHash { + let mut h = Hasher::new(); + let mut scope: Vec = Vec::new(); + feed(&mut h, node, &mut scope); + ContentHash(*h.finalize().as_bytes()) +} + +fn feed(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + match node.kind.as_str() { + "function_definition" => feed_function_definition(h, node, scope), + "lambda" => feed_lambda(h, node, scope), + "for_statement" => feed_for_statement(h, node, scope), + "list_comprehension" + | "set_comprehension" + | "dictionary_comprehension" + | "generator_expression" => feed_comprehension(h, node, scope), + "with_statement" => feed_with_statement(h, node, scope), + // Cuando un as_pattern_target aparece (típicamente dentro de + // un with_clause), sus identifiers son binders. El scope ya + // se extendió en feed_with_statement antes de llegar al body; + // pero el target mismo necesita emitir binders anónimos para + // que el hash no varíe con el nombre. 
+ "as_pattern_target" => feed_target_as_binders(h, node), + "identifier" => emit_identifier_ref(h, node, scope), + _ => feed_default(h, node, scope), + } +} + +fn feed_default(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + emit_leaf_marker(h, node); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + feed(h, c, scope); + } +} + +/// `def f(x, y=1, z: int): body` → params son binders al body. +/// El `name` (identifier de la función) se trata como literal — no +/// es un binder local (es publicado al scope envolvente, no manejado +/// acá). +fn feed_function_definition(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("parameters") { + collect_param_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameters") => feed_params(h, c, scope), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + Some("name") => { + // Nombre de la función: viaja como literal (afecta el + // hash, no es α-anónimo). Mismo tratamiento que en + // Rust con `function_item.name`. + feed_as_literal(h, c); + } + _ => feed(h, c, scope), + } + } +} + +/// `lambda x, y: body` — params binders al body. 
+fn feed_lambda(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("parameters") { + collect_param_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("parameters") => feed_params(h, c, scope), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +/// `for x in iterable: body` — x es binder al body. +fn feed_for_statement(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.field_name.as_deref() == Some("left") { + collect_target_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("left") => feed_target_as_binders(h, c), + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +/// `[expr for x in xs if cond]` — los `for_in_clause` y `if_clause` +/// se procesan en orden: cada `for_in_clause` añade binders que +/// viven en lo siguiente. El `body` (la expresión final) ve TODOS +/// los binders acumulados. +fn feed_comprehension(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + // Recolectamos TODOS los binders de TODAS las for_in_clauses. + // Python evalúa la comprehension de izquierda a derecha pero el + // body ve todo; α-hashing colapsa eso a "todos visibles en body". 
+ let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.kind == "for_in_clause" { + for cc in &c.children { + if cc.field_name.as_deref() == Some("left") { + collect_target_binders(cc, &mut binders); + } + } + } + } + + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "for_in_clause" { + feed_for_in_clause(h, c, scope); + } else { + feed(h, c, scope); + } + } + + scope.truncate(scope_before); +} + +/// `for x in xs` dentro de una comprehension. El `left` es binder +/// (anónimo); el `right` se evalúa en el scope previo (sin x). +/// Pero como `feed_comprehension` ya extendió el scope antes de +/// llamarnos, x sí está en scope para el right de un `for X in expr` +/// posterior — semántica correcta de comprehensions de Python. +fn feed_for_in_clause(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.field_name.as_deref() == Some("left") { + feed_target_as_binders(h, c); + } else { + feed(h, c, scope); + } + } +} + +/// `with X() as y, Z() as w: body` — los `as` introducen binders al body. 
+fn feed_with_statement(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + h.update(&[TAG_NO_LEAF]); + + let mut binders: Vec = Vec::new(); + for c in &node.children { + if c.kind == "with_clause" { + collect_with_clause_binders(c, &mut binders); + } + } + + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + match c.field_name.as_deref() { + Some("body") => { + let scope_before = scope.len(); + scope.extend(binders.iter().cloned()); + feed(h, c, scope); + scope.truncate(scope_before); + } + _ => feed(h, c, scope), + } + } +} + +fn collect_with_clause_binders(node: &SemanticNode, out: &mut Vec) { + // En tree-sitter-python, with_item.value puede ser un as_pattern + // que tiene su propio alias. Recursamos para encontrar cualquier + // as_pattern_target en el subárbol. + for c in &node.children { + if c.kind == "with_item" { + collect_as_pattern_targets(c, out); + } + } +} + +fn collect_as_pattern_targets(node: &SemanticNode, out: &mut Vec) { + if node.kind == "as_pattern_target" { + collect_target_binders(node, out); + return; + } + for c in &node.children { + collect_as_pattern_targets(c, out); + } +} + +/// Los parameters de def/lambda se procesan emitiendo cada +/// identifier como binder anónimo. Defaults / type hints / *args / +/// **kwargs se preservan literalmente (afectan el hash). +fn feed_params(h: &mut Hasher, params: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, params); + h.update(&[TAG_NO_LEAF]); + h.update(&(params.children.len() as u64).to_le_bytes()); + for c in ¶ms.children { + match c.kind.as_str() { + "identifier" => emit_param_binder(h, c), + "typed_parameter" | "default_parameter" | "typed_default_parameter" => { + feed_complex_param(h, c, scope); + } + "list_splat_pattern" | "dictionary_splat_pattern" => { + // *args, **kwargs: el binder es el identifier interno. 
+ feed_splat_param(h, c); + } + _ => feed(h, c, scope), + } + } +} + +fn emit_param_binder(h: &mut Hasher, ident: &SemanticNode) { + write_kind_and_field(h, ident); + emit_binder_body(h); +} + +/// `x: int`, `x = 1`, `x: int = 1` — el primer identifier es binder; +/// el resto (type, default) son referenciables. +fn feed_complex_param(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + let mut named_binder = false; + for c in &node.children { + if !named_binder && c.kind == "identifier" { + emit_param_binder(h, c); + named_binder = true; + } else { + feed(h, c, scope); + } + } +} + +fn feed_splat_param(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + h.update(&[TAG_NO_LEAF]); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + if c.kind == "identifier" { + emit_param_binder(h, c); + } else { + feed_as_literal(h, c); + } + } +} + +fn collect_param_binders(params: &SemanticNode, out: &mut Vec) { + for c in ¶ms.children { + match c.kind.as_str() { + "identifier" => push_identifier_name(c, out), + "typed_parameter" | "default_parameter" | "typed_default_parameter" => { + if let Some(ident) = c.children.iter().find(|cc| cc.kind == "identifier") { + push_identifier_name(ident, out); + } + } + "list_splat_pattern" | "dictionary_splat_pattern" => { + if let Some(ident) = c.children.iter().find(|cc| cc.kind == "identifier") { + push_identifier_name(ident, out); + } + } + _ => {} + } + } +} + +/// El `left` de `for x in xs:` o de `with X as y:` puede ser un +/// identifier solo o una tupla destructurada (`for k, v in ...`). 
+fn collect_target_binders(target: &SemanticNode, out: &mut Vec) { + match target.kind.as_str() { + "identifier" => push_identifier_name(target, out), + "tuple_pattern" | "pattern_list" | "list_pattern" => { + for c in &target.children { + collect_target_binders(c, out); + } + } + _ => { + // Recursamos por si hay subnodos relevantes (e.g. parens). + for c in &target.children { + collect_target_binders(c, out); + } + } + } +} + +/// Emit del target como binders anónimos. Mismo recorrido que collect. +fn feed_target_as_binders(h: &mut Hasher, target: &SemanticNode) { + write_kind_and_field(h, target); + match target.kind.as_str() { + "identifier" => emit_binder_body(h), + "tuple_pattern" | "pattern_list" | "list_pattern" => { + h.update(&[TAG_NO_LEAF]); + h.update(&(target.children.len() as u64).to_le_bytes()); + for c in &target.children { + feed_target_as_binders(h, c); + } + } + _ => { + // Fallback: literal (preserva la estructura textual). + emit_leaf_marker(h, target); + h.update(&(target.children.len() as u64).to_le_bytes()); + for c in &target.children { + feed_target_as_binders(h, c); + } + } + } +} + +fn feed_as_literal(h: &mut Hasher, node: &SemanticNode) { + write_kind_and_field(h, node); + emit_leaf_marker(h, node); + h.update(&(node.children.len() as u64).to_le_bytes()); + for c in &node.children { + feed_as_literal(h, c); + } +} diff --git a/crates/modules/semantic_dht/minga-core/src/alpha.rs b/crates/modules/semantic_dht/minga-core/src/alpha/rust.rs similarity index 89% rename from crates/modules/semantic_dht/minga-core/src/alpha.rs rename to crates/modules/semantic_dht/minga-core/src/alpha/rust.rs index 3acf763..5e39f18 100644 --- a/crates/modules/semantic_dht/minga-core/src/alpha.rs +++ b/crates/modules/semantic_dht/minga-core/src/alpha/rust.rs @@ -42,16 +42,14 @@ //! enforcement); recolectamos sólo del primer alternativo para //! evitar duplicados, emitimos feed_pattern para cada uno. 
+use crate::alpha::common::{ + emit_binder_body, emit_binder_node, emit_identifier_ref, emit_leaf_marker, + push_identifier_name, write_kind_and_field, TAG_NO_LEAF, +}; use crate::ast::SemanticNode; use crate::cas::ContentHash; use blake3::Hasher; -const TAG_NO_LEAF: u8 = 0; -const TAG_LEAF: u8 = 1; -const TAG_BINDER: u8 = 2; -const TAG_REF_BOUND: u8 = 3; -const TAG_REF_FREE: u8 = 4; - pub fn hash_node_alpha(node: &SemanticNode) -> ContentHash { let mut h = Hasher::new(); let mut scope: Vec = Vec::new(); @@ -171,55 +169,6 @@ fn feed_default(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { } } -fn emit_identifier_ref(h: &mut Hasher, node: &SemanticNode, scope: &Vec) { - h.update(&[TAG_NO_LEAF]); - if let Some(t) = &node.leaf_text { - if let Ok(name) = std::str::from_utf8(t) { - if let Some(i) = scope.iter().rposition(|n| n == name) { - let de_bruijn = (scope.len() - 1 - i) as u64; - h.update(&[TAG_REF_BOUND]); - h.update(&de_bruijn.to_le_bytes()); - } else { - h.update(&[TAG_REF_FREE]); - h.update(&(t.len() as u64).to_le_bytes()); - h.update(t); - } - } else { - h.update(&[TAG_REF_FREE]); - h.update(&(t.len() as u64).to_le_bytes()); - h.update(t); - } - } else { - h.update(&[TAG_REF_FREE]); - h.update(&[0u8; 8]); - } - h.update(&[0u8; 8]); -} - -fn emit_binder_body(h: &mut Hasher) { - h.update(&[TAG_NO_LEAF]); - h.update(&[TAG_BINDER]); - h.update(&[0u8; 8]); -} - -fn emit_binder_node(h: &mut Hasher, node: &SemanticNode) { - write_kind_and_field(h, node); - emit_binder_body(h); -} - -fn emit_leaf_marker(h: &mut Hasher, node: &SemanticNode) { - match &node.leaf_text { - Some(t) => { - h.update(&[TAG_LEAF]); - h.update(&(t.len() as u64).to_le_bytes()); - h.update(t); - } - None => { - h.update(&[TAG_NO_LEAF]); - } - } -} - fn feed_callable(h: &mut Hasher, node: &SemanticNode, scope: &mut Vec) { h.update(&[TAG_NO_LEAF]); @@ -585,16 +534,8 @@ fn collect_field_pattern_binders(fp: &SemanticNode, out: &mut Vec) { } } -fn push_identifier_name(node: &SemanticNode, 
out: &mut Vec) { - if let Some(t) = &node.leaf_text { - if let Ok(s) = std::str::from_utf8(t) { - out.push(s.to_string()); - } - } -} - /// Determina si un `identifier` en posición de patrón se interpreta como -/// binder. Reglas: +/// binder. Reglas (específicas de Rust): /// - Si tiene `field_name == "pattern"` (parámetros, lets), siempre es binder. /// - Si su nombre comienza con minúscula, es binder. /// - Si comienza con `_` seguido de letra/dígito, es binder (convención @@ -619,21 +560,3 @@ fn is_binder_name(s: &str) -> bool { None => false, } } - -fn write_kind_and_field(h: &mut Hasher, node: &SemanticNode) { - write_str(h, &node.kind); - match &node.field_name { - Some(f) => { - h.update(&[1]); - write_str(h, f); - } - None => { - h.update(&[0]); - } - } -} - -fn write_str(h: &mut Hasher, s: &str) { - h.update(&(s.len() as u64).to_le_bytes()); - h.update(s.as_bytes()); -} diff --git a/crates/modules/semantic_dht/minga-core/tests/alpha_polyglot.rs b/crates/modules/semantic_dht/minga-core/tests/alpha_polyglot.rs new file mode 100644 index 0000000..c39d6cc --- /dev/null +++ b/crates/modules/semantic_dht/minga-core/tests/alpha_polyglot.rs @@ -0,0 +1,307 @@ +//! α-equivalencia para Python, TypeScript, JavaScript, Go. +//! +//! Mismas propiedades que `alpha_invariants.rs` para Rust: +//! - Renombre de variables ligadas → mismo hash. +//! - Cambio de estructura / nombres libres → hash distinto. 
+ +use minga_core::alpha::hash_alpha_with; +use minga_core::parse::Dialect; + +fn h(d: Dialect, src: &str) -> minga_core::cas::ContentHash { + let n = d.parse(src).expect("parse OK"); + hash_alpha_with(d, &n) +} + +// ============================================================================ +// Python +// ============================================================================ + +#[test] +fn python_def_param_rename_invariant() { + let a = h(Dialect::Python, "def f(x):\n return x + 1\n"); + let b = h(Dialect::Python, "def f(y):\n return y + 1\n"); + assert_eq!(a, b); +} + +#[test] +fn python_def_function_name_matters() { + let a = h(Dialect::Python, "def f(x):\n return x\n"); + let b = h(Dialect::Python, "def g(x):\n return x\n"); + assert_ne!(a, b, "el nombre de la función NO es α-anónimo"); +} + +#[test] +fn python_lambda_rename_invariant() { + let a = h(Dialect::Python, "f = lambda x: x + 1\n"); + let b = h(Dialect::Python, "f = lambda y: y + 1\n"); + assert_eq!(a, b); +} + +#[test] +fn python_for_loop_rename_invariant() { + let a = h( + Dialect::Python, + "for x in xs:\n print(x)\n", + ); + let b = h( + Dialect::Python, + "for y in xs:\n print(y)\n", + ); + assert_eq!(a, b); +} + +#[test] +fn python_for_iterable_name_matters() { + let a = h( + Dialect::Python, + "for x in xs:\n print(x)\n", + ); + let b = h( + Dialect::Python, + "for x in ys:\n print(x)\n", + ); + assert_ne!(a, b, "el iterable es variable libre, su nombre importa"); +} + +#[test] +fn python_list_comprehension_rename_invariant() { + let a = h(Dialect::Python, "result = [x*2 for x in xs]\n"); + let b = h(Dialect::Python, "result = [y*2 for y in xs]\n"); + assert_eq!(a, b); +} + +#[test] +fn python_nested_comprehension_rename_invariant() { + // Doble for_in_clause: x e y son binders. 
+ let a = h( + Dialect::Python, + "result = [(x, y) for x in xs for y in ys]\n", + ); + let b = h( + Dialect::Python, + "result = [(a, b) for a in xs for b in ys]\n", + ); + assert_eq!(a, b); +} + +#[test] +fn python_with_statement_rename_invariant() { + let a = h( + Dialect::Python, + "with open(p) as f:\n f.read()\n", + ); + let b = h( + Dialect::Python, + "with open(p) as g:\n g.read()\n", + ); + assert_eq!(a, b); +} + +#[test] +fn python_lambda_does_not_collide_with_unrelated() { + let plus = h(Dialect::Python, "f = lambda x: x + 1\n"); + let minus = h(Dialect::Python, "f = lambda x: x - 1\n"); + assert_ne!(plus, minus, "operación distinta debe dar hash distinto"); +} + +// ============================================================================ +// JavaScript / TypeScript (mismo profile) +// ============================================================================ + +#[test] +fn js_function_rename_invariant() { + let a = h(Dialect::JavaScript, "function f(x) { return x + 1; }"); + let b = h(Dialect::JavaScript, "function f(y) { return y + 1; }"); + assert_eq!(a, b); +} + +#[test] +fn js_function_name_matters() { + let a = h(Dialect::JavaScript, "function f(x) { return x; }"); + let b = h(Dialect::JavaScript, "function g(x) { return x; }"); + assert_ne!(a, b); +} + +#[test] +fn js_arrow_function_rename_invariant() { + let a = h(Dialect::JavaScript, "const f = (x) => x + 1;"); + let b = h(Dialect::JavaScript, "const f = (y) => y + 1;"); + assert_eq!(a, b); +} + +#[test] +fn js_arrow_shorthand_rename_invariant() { + // `x => ...` (sin paréntesis) — single identifier. 
+ let a = h(Dialect::JavaScript, "const f = x => x + 1;"); + let b = h(Dialect::JavaScript, "const f = y => y + 1;"); + assert_eq!(a, b); +} + +#[test] +fn js_let_const_rename_invariant() { + let a = h(Dialect::JavaScript, "function f() { const x = 1; return x + 2; }"); + let b = h(Dialect::JavaScript, "function f() { const y = 1; return y + 2; }"); + assert_eq!(a, b); +} + +#[test] +fn js_for_of_rename_invariant() { + let a = h( + Dialect::JavaScript, + "function f() { for (const x of xs) { use(x); } }", + ); + let b = h( + Dialect::JavaScript, + "function f() { for (const y of xs) { use(y); } }", + ); + assert_eq!(a, b); +} + +#[test] +fn js_for_classic_rename_invariant() { + let a = h( + Dialect::JavaScript, + "function f() { for (let i = 0; i < n; i++) { use(i); } }", + ); + let b = h( + Dialect::JavaScript, + "function f() { for (let j = 0; j < n; j++) { use(j); } }", + ); + assert_eq!(a, b); +} + +#[test] +fn js_catch_rename_invariant() { + let a = h( + Dialect::JavaScript, + "function f() { try { x(); } catch (e) { log(e); } }", + ); + let b = h( + Dialect::JavaScript, + "function f() { try { x(); } catch (err) { log(err); } }", + ); + assert_eq!(a, b); +} + +#[test] +fn ts_typed_param_rename_invariant() { + // El TIPO afecta el hash, pero el nombre del parámetro no. 
+ let a = h( + Dialect::TypeScript, + "function f(x: number): number { return x + 1; }", + ); + let b = h( + Dialect::TypeScript, + "function f(y: number): number { return y + 1; }", + ); + assert_eq!(a, b); +} + +#[test] +fn ts_typed_param_type_matters() { + let int_v = h( + Dialect::TypeScript, + "function f(x: number): number { return x; }", + ); + let str_v = h( + Dialect::TypeScript, + "function f(x: string): string { return x; }", + ); + assert_ne!(int_v, str_v, "el tipo afecta semántica"); +} + +// ============================================================================ +// Go +// ============================================================================ + +#[test] +fn go_function_rename_invariant() { + let a = h( + Dialect::Go, + "package main\nfunc add(a, b int) int { return a + b }\n", + ); + let b = h( + Dialect::Go, + "package main\nfunc add(x, y int) int { return x + y }\n", + ); + assert_eq!(a, b); +} + +#[test] +fn go_function_name_matters() { + let a = h( + Dialect::Go, + "package main\nfunc add(a, b int) int { return a + b }\n", + ); + let b = h( + Dialect::Go, + "package main\nfunc sub(a, b int) int { return a + b }\n", + ); + assert_ne!(a, b); +} + +#[test] +fn go_short_var_decl_rename_invariant() { + let a = h( + Dialect::Go, + "package main\nfunc main() { x := compute(); use(x) }\n", + ); + let b = h( + Dialect::Go, + "package main\nfunc main() { y := compute(); use(y) }\n", + ); + assert_eq!(a, b); +} + +#[test] +fn go_range_clause_rename_invariant() { + let a = h( + Dialect::Go, + "package main\nfunc main() { for k, v := range m { use(k, v) } }\n", + ); + let b = h( + Dialect::Go, + "package main\nfunc main() { for x, y := range m { use(x, y) } }\n", + ); + assert_eq!(a, b); +} + +#[test] +fn go_if_init_rename_invariant() { + let a = h( + Dialect::Go, + "package main\nfunc main() { if x := lookup(); x > 0 { use(x) } }\n", + ); + let b = h( + Dialect::Go, + "package main\nfunc main() { if y := lookup(); y > 0 { use(y) } }\n", + ); + 
assert_eq!(a, b); +} + +#[test] +fn go_func_literal_closure_rename_invariant() { + let a = h( + Dialect::Go, + "package main\nvar f = func(x int) int { return x + 1 }\n", + ); + let b = h( + Dialect::Go, + "package main\nvar f = func(y int) int { return y + 1 }\n", + ); + assert_eq!(a, b); +} + +// ============================================================================ +// Cross-language sanity +// ============================================================================ + +#[test] +fn structurally_similar_programs_in_different_languages_have_distinct_hashes() { + // `def f(x): return x+1` en Python vs `function f(x){return x+1}` en JS. + // Mismo "shape" en idea pero distintas gramáticas → distintos kinds → + // distintos hashes. Importante para evitar colisiones cross-language. + let py = h(Dialect::Python, "def f(x):\n return x + 1\n"); + let js = h(Dialect::JavaScript, "function f(x) { return x + 1; }"); + assert_ne!(py, js); +}