feat(sandokan-lifecycle): A4 — primitivas de lifecycle agnósticas

Nuevo crate runtime/sandokan-lifecycle: lógica pura reutilizable por cualquier supervisor de procesos (shuma, matilda Ghost, charka-shadow, mirada). Sin syscalls, sin proceso, sin UI. Módulos: - backoff — Backoff exponencial con tope - ttl — Ttl anclado a Instant - quota — ResourceQuota + check_quota + Breach + QuotaAction - restart — RestartPolicy + RestartTracker (conteo + backoff) - state — LifecycleState (Pending/Running/Exited/Failed/Killed) 15 tests verdes. cargo check --workspace verde. Variante segura de A4: se crea la library limpia sin tocar shuma-core (módulo maduro). La migración de WorkspaceManager a consumir estas primitivas queda registrada como A4.2 (refactor diferido, no urgente). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 00:32:52 +00:00
parent 67c0fcad11
commit 545dd59c72
10 changed files with 487 additions and 0 deletions
@@ -10127,6 +10127,13 @@ dependencies = [
 "winapi-util",
 ]

+[[package]]
+name = "sandokan-lifecycle"
+version = "0.1.0"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "saphyr-parser"
 version = "0.0.6"
@@ -34,6 +34,7 @@ members = [
    "crates/runtime/arje-brain-audit",
    "crates/runtime/arje-brain",
    "crates/runtime/arje-echo",
+    "crates/runtime/sandokan-lifecycle",

    # ============================================================
    # compat/ — Shims D-Bus para correr software systemd-aware
@@ -16,6 +16,22 @@ rule engine + audit log, y un ente de smoke test.
 | `arje-brain-audit`     | lib  | Audit chain con hashes anclados al CAS             |
 | `arje-brain`           | lib  | Integración: introspect + autopromote + metrics    |
 | `arje-echo`            | bin  | Ente prueba — provee `Capability::Endpoint(echo)`  |
+| `sandokan-lifecycle`   | lib  | Primitivas de lifecycle agnósticas (ver abajo)     |
+
+## sandokan-lifecycle
+
+Library de primitivas de ciclo de vida agnósticas (sin syscalls, sin
+proceso, sin UI — solo cálculo). Consumible por cualquier supervisor:
+`shuma`, `matilda` Ghost, `charka-shadow`, `mirada`.
+
+- `Backoff` — backoff exponencial con tope.
+- `Ttl` — time-to-live anclado a `Instant`.
+- `ResourceQuota` + `check_quota` — cuotas de recursos + breaches.
+- `RestartPolicy` + `RestartTracker` — restart con conteo + backoff.
+- `LifecycleState` — máquina de estados (Pending/Running/Exited/Failed/Killed).
+
+15 tests verdes. **Pendiente (A4.2)**: migrar `shuma-core::WorkspaceManager`
+para que consuma estas primitivas en lugar de su implementación inline.

 ## Dependencias

@@ -0,0 +1,11 @@
+[package]
+name = "sandokan-lifecycle"
+description = "Primitivas de ciclo de vida agnósticas: backoff exponencial, TTL, cuotas de recursos, política de restart, estado. Sin deps de proceso ni de UI."
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+authors.workspace = true
+publish.workspace = true
+
+[dependencies]
+serde = { workspace = true }
@@ -0,0 +1,68 @@
+//! Backoff exponencial con tope.
+
+use std::time::Duration;
+
+/// Exponential backoff calculator. Each `next_delay()` call returns the
+/// current delay and then doubles it, saturating at `max`.
+#[derive(Debug, Clone)]
+pub struct Backoff {
+    // Delay restored by `reset()`; `new` clamps it so it never exceeds `max`.
+    base: Duration,
+    // Upper bound applied to the delay after each doubling.
+    max: Duration,
+    // Delay that the next `next_delay()` call will return.
+    current: Duration,
+}
+
+impl Backoff {
+    /// Creates a backoff that starts at `base` and saturates at `max`.
+    /// If `base > max`, `base` is clamped down to `max`.
+    pub fn new(base: Duration, max: Duration) -> Self {
+        let base = base.min(max);
+        Self { base, max, current: base }
+    }
+
+    /// Returns the current delay and scales the next one (×2, capped at `max`).
+    ///
+    /// The doubling saturates instead of overflowing, so even a very large
+    /// `max` (up to `Duration::MAX`) cannot make this call panic.
+    pub fn next_delay(&mut self) -> Duration {
+        let delay = self.current;
+        // `Duration * u32` panics on overflow; `saturating_mul` clamps to
+        // `Duration::MAX` and `min` then applies the configured cap.
+        self.current = self.current.saturating_mul(2).min(self.max);
+        delay
+    }
+
+    /// Goes back to the base delay (meant to be called after a success).
+    pub fn reset(&mut self) {
+        self.current = self.base;
+    }
+
+    /// Delay the next `next_delay()` call would return, without consuming it.
+    pub fn peek(&self) -> Duration {
+        self.current
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Shorthand for the millisecond durations used in the assertions below.
+    fn ms(n: u64) -> Duration {
+        Duration::from_millis(n)
+    }
+
+    /// Delays double on every call until they reach the configured cap.
+    #[test]
+    fn escalates_then_caps() {
+        let mut backoff = Backoff::new(ms(100), ms(800));
+        // The trailing 800 checks that the delay stays capped.
+        for &expected in &[100, 200, 400, 800, 800] {
+            assert_eq!(backoff.next_delay(), ms(expected));
+        }
+    }
+
+    /// After `reset()` the sequence starts again from the base delay.
+    #[test]
+    fn reset_returns_to_base() {
+        let mut backoff = Backoff::new(ms(100), Duration::from_secs(30));
+        for _ in 0..2 {
+            backoff.next_delay();
+        }
+        backoff.reset();
+        assert_eq!(backoff.next_delay(), ms(100));
+    }
+
+    /// A base larger than the cap is clamped down to the cap.
+    #[test]
+    fn base_clamped_to_max() {
+        let cap = Duration::from_secs(1);
+        let mut backoff = Backoff::new(Duration::from_secs(10), cap);
+        assert_eq!(backoff.next_delay(), cap);
+    }
+}
@@ -0,0 +1,23 @@
+//! sandokan-lifecycle — primitivas de ciclo de vida agnósticas.
+//!
+//! Lógica pura reutilizable por cualquier supervisor de procesos
+//! (shuma, matilda Ghost, charka-shadow, mirada). Sin dependencias de
+//! syscalls, proceso, ni UI: solo cálculo.
+//!
+//! - [`backoff`] — backoff exponencial con tope.
+//! - [`ttl`]     — time-to-live anclado a un `Instant`.
+//! - [`quota`]   — cuotas de recursos + chequeo de breaches.
+//! - [`restart`] — política de restart con conteo + backoff.
+//! - [`state`]   — máquina de estados del ciclo de vida.
+
+pub mod backoff;
+pub mod ttl;
+pub mod quota;
+pub mod restart;
+pub mod state;
+
+pub use backoff::Backoff;
+pub use ttl::Ttl;
+pub use quota::{Breach, QuotaAction, QuotaReport, ResourceQuota, ResourceUsage, check_quota};
+pub use restart::{RestartPolicy, RestartTracker};
+pub use state::LifecycleState;
@@ -0,0 +1,124 @@
+//! Cuotas de recursos + chequeo de breaches.
+
+use serde::{Deserialize, Serialize};
+
+/// Action to take when a quota is exceeded.
+///
+/// Nothing in this module acts on the value; `check_quota` only reports
+/// breaches, so applying the action is left to the caller.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
+pub enum QuotaAction {
+    /// Do nothing (pure accounting).
+    #[default]
+    None,
+    /// Only log the breach.
+    Log,
+    /// Terminate the supervised entity.
+    Kill,
+}
+
+/// Declarative resource limits. `None` = no limit for that resource.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct ResourceQuota {
+    /// Memory limit in bytes.
+    pub mem_bytes: Option<u64>,
+    /// Limit on the number of processes.
+    pub nproc: Option<u32>,
+    /// CPU percentage (100.0 = one fully saturated core).
+    pub cpu_pct: Option<f64>,
+}
+
+/// Resource usage measured at one point in time.
+///
+/// Fields mirror those of `ResourceQuota` and are compared against them by
+/// `check_quota`.
+#[derive(Debug, Clone, Default)]
+pub struct ResourceUsage {
+    /// Memory in use, in bytes.
+    pub mem_bytes: u64,
+    /// Number of processes.
+    pub nproc: u32,
+    /// CPU usage as a percentage, on the same scale as `ResourceQuota::cpu_pct`.
+    pub cpu_pct: f64,
+}
+
+/// A resource that exceeded its limit.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Breach {
+    /// Name of the offending resource; `check_quota` emits `"mem_bytes"`,
+    /// `"nproc"` or `"cpu_pct"`.
+    pub resource: &'static str,
+    /// Measured usage. Integer measurements are cast to `f64` by
+    /// `check_quota`, so very large values may lose precision.
+    pub used: f64,
+    /// The limit that was exceeded, converted to `f64` the same way.
+    pub limit: f64,
+}
+
+/// Result of checking a `ResourceUsage` against a `ResourceQuota`.
+#[derive(Debug, Clone, Default)]
+pub struct QuotaReport {
+    /// One entry per exceeded resource; empty when everything is in bounds.
+    pub breaches: Vec<Breach>,
+}
+
+impl QuotaReport {
+    /// Returns `true` when there is no breach at all.
+    pub fn ok(&self) -> bool {
+        self.breaches.is_empty()
+    }
+}
+
+/// Checks measured usage against a quota and collects every exceeded limit.
+///
+/// Resources whose limit is `None` are skipped. A resource is reported only
+/// when its usage is strictly greater than its limit. Breaches are listed in
+/// the fixed order `mem_bytes`, `nproc`, `cpu_pct`.
+pub fn check_quota(usage: &ResourceUsage, quota: &ResourceQuota) -> QuotaReport {
+    // Each check keeps the limit only when it was exceeded, then turns it
+    // into a `Breach`; an unset or respected limit yields `None`.
+    let memory = quota
+        .mem_bytes
+        .filter(|&cap| usage.mem_bytes > cap)
+        .map(|cap| Breach {
+            resource: "mem_bytes",
+            used: usage.mem_bytes as f64,
+            limit: cap as f64,
+        });
+    let processes = quota
+        .nproc
+        .filter(|&cap| usage.nproc > cap)
+        .map(|cap| Breach {
+            resource: "nproc",
+            used: usage.nproc as f64,
+            limit: cap as f64,
+        });
+    let cpu = quota
+        .cpu_pct
+        .filter(|&cap| usage.cpu_pct > cap)
+        .map(|cap| Breach {
+            resource: "cpu_pct",
+            used: usage.cpu_pct,
+            limit: cap,
+        });
+
+    // `Option` is iterable, so `extend` appends zero or one breach each time.
+    let mut breaches = Vec::new();
+    breaches.extend(memory);
+    breaches.extend(processes);
+    breaches.extend(cpu);
+    QuotaReport { breaches }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Builds a usage sample from its three measurements.
+    fn usage(mem_bytes: u64, nproc: u32, cpu_pct: f64) -> ResourceUsage {
+        ResourceUsage { mem_bytes, nproc, cpu_pct }
+    }
+
+    /// Usage below every configured limit produces a clean report.
+    #[test]
+    fn within_limits_is_ok() {
+        let limits = ResourceQuota {
+            mem_bytes: Some(200),
+            nproc: Some(4),
+            cpu_pct: Some(90.0),
+        };
+        let report = check_quota(&usage(100, 2, 50.0), &limits);
+        assert!(report.ok());
+    }
+
+    /// Exceeding only the memory limit reports `mem_bytes` first.
+    #[test]
+    fn detects_mem_breach() {
+        let limits = ResourceQuota { mem_bytes: Some(200), ..Default::default() };
+        let report = check_quota(&usage(300, 1, 0.0), &limits);
+        assert!(!report.ok());
+        assert_eq!(report.breaches[0].resource, "mem_bytes");
+    }
+
+    /// With no limits configured, even extreme usage is not a breach.
+    #[test]
+    fn no_limit_means_no_breach() {
+        let unlimited = ResourceQuota::default();
+        let report = check_quota(&usage(u64::MAX, 9999, 999.0), &unlimited);
+        assert!(report.ok());
+    }
+
+    /// Every exceeded resource gets its own entry.
+    #[test]
+    fn multiple_breaches_reported() {
+        let limits = ResourceQuota {
+            mem_bytes: Some(100),
+            nproc: Some(2),
+            cpu_pct: Some(100.0),
+        };
+        let report = check_quota(&usage(300, 10, 200.0), &limits);
+        assert_eq!(report.breaches.len(), 3);
+    }
+}
@@ -0,0 +1,115 @@
+//! Política de restart con conteo + backoff exponencial.
+
+use crate::backoff::Backoff;
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+/// Declarative restart policy.
+///
+/// The derived `Default` is `on_failure: false, max_restarts: 0`, i.e.
+/// restarts are disabled (the same value the previous hand-written impl
+/// produced).
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct RestartPolicy {
+    /// Whether to retry after an exit that counts as a failure.
+    pub on_failure: bool,
+    /// Maximum number of restarts. `0` = unlimited.
+    pub max_restarts: u32,
+}
+
+/// Mutable restart state of one supervised entity. Combines the policy
+/// with a `Backoff` and the count of attempts already consumed.
+#[derive(Debug, Clone)]
+pub struct RestartTracker {
+    // Decides whether a failure may be retried and how many times.
+    policy: RestartPolicy,
+    // Supplies the delay handed out on each granted restart.
+    backoff: Backoff,
+    // Restarts granted since creation or since the last `on_success()`.
+    count: u32,
+}
+
+impl RestartTracker {
+    /// Creates a tracker with zero consumed restarts.
+    pub fn new(policy: RestartPolicy, backoff: Backoff) -> Self {
+        Self { policy, backoff, count: 0 }
+    }
+
+    /// Records a failure. Returns `Some(delay)` with the backoff to wait
+    /// before the next attempt, or `None` when no retry should happen
+    /// (policy disabled or `max_restarts` exhausted).
+    pub fn on_failure(&mut self) -> Option<Duration> {
+        if !self.policy.on_failure {
+            return None;
+        }
+        // `max_restarts == 0` means unlimited, so the cap applies only when
+        // it is non-zero.
+        let capped = self.policy.max_restarts != 0;
+        if capped && self.count >= self.policy.max_restarts {
+            return None;
+        }
+        // Under the unlimited policy the counter has no upper bound; saturate
+        // rather than overflow (panic in debug builds, wrap-around in release).
+        self.count = self.count.saturating_add(1);
+        Some(self.backoff.next_delay())
+    }
+
+    /// Records a success: resets both the count and the backoff.
+    pub fn on_success(&mut self) {
+        self.count = 0;
+        self.backoff.reset();
+    }
+
+    /// Number of restarts consumed so far (saturates at `u32::MAX`).
+    pub fn count(&self) -> u32 {
+        self.count
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Tracker with the given policy and a 100 ms → 30 s backoff.
+    fn tracker_with(on_failure: bool, max_restarts: u32) -> RestartTracker {
+        let policy = RestartPolicy { on_failure, max_restarts };
+        let backoff = Backoff::new(Duration::from_millis(100), Duration::from_secs(30));
+        RestartTracker::new(policy, backoff)
+    }
+
+    /// With `on_failure` off, a failure never yields a restart delay.
+    #[test]
+    fn disabled_policy_never_restarts() {
+        let mut tracker = tracker_with(false, 0);
+        assert!(tracker.on_failure().is_none());
+    }
+
+    /// Exactly `max_restarts` restarts are granted, then none.
+    #[test]
+    fn respects_max_restarts() {
+        let mut tracker = tracker_with(true, 3);
+        for _ in 0..3 {
+            assert!(tracker.on_failure().is_some());
+        }
+        // The fourth failure finds the budget exhausted.
+        assert!(tracker.on_failure().is_none());
+        assert_eq!(tracker.count(), 3);
+    }
+
+    /// `max_restarts == 0` means restarts are never refused.
+    #[test]
+    fn infinite_when_max_zero() {
+        let mut tracker = tracker_with(true, 0);
+        let mut attempts = 0;
+        while attempts < 100 {
+            assert!(tracker.on_failure().is_some());
+            attempts += 1;
+        }
+    }
+
+    /// Delays grow across failures and go back to the base after a success.
+    #[test]
+    fn backoff_escalates_then_success_resets() {
+        let base = Duration::from_millis(100);
+        let mut tracker = tracker_with(true, 0);
+        assert_eq!(tracker.on_failure(), Some(base));
+        assert_eq!(tracker.on_failure(), Some(Duration::from_millis(200)));
+        tracker.on_success();
+        assert_eq!(tracker.count(), 0);
+        assert_eq!(tracker.on_failure(), Some(base));
+    }
+}
@@ -0,0 +1,65 @@
+//! Máquina de estados del ciclo de vida de una entidad supervisada.
+
+use serde::{Deserialize, Serialize};
+
+/// State of a supervised entity (process, workspace, sandbox, ...).
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum LifecycleState {
+    /// Created but not started yet.
+    Pending,
+    /// Currently running.
+    Running,
+    /// Exited on its own with an exit code.
+    Exited { code: i32 },
+    /// Failed (never got to run, or crashed in a way that could not be captured).
+    Failed { reason: String },
+    /// Terminated by the supervisor (SIGKILL / quota / drain).
+    Killed,
+}
+
+impl LifecycleState {
+    /// Returns `true` if the state is terminal (`Exited`, `Failed` or
+    /// `Killed`), i.e. no further transition is expected without an explicit
+    /// restart.
+    pub fn is_terminal(&self) -> bool {
+        matches!(
+            self,
+            Self::Exited { .. } | Self::Failed { .. } | Self::Killed
+        )
+    }
+
+    /// Returns `true` if the state counts as a failure, which callers can
+    /// use to decide whether the restart policy applies.
+    /// `Exited { code: 0 }` is NOT a failure.
+    pub fn is_failure(&self) -> bool {
+        // Every variant is listed on purpose: adding a new one must force an
+        // explicit decision here instead of silently falling into a wildcard.
+        match self {
+            Self::Exited { code } => *code != 0,
+            Self::Failed { .. } => true,
+            // A kill is a deliberate supervisor action, not a failure, and
+            // non-terminal states have not failed (yet).
+            Self::Killed | Self::Pending | Self::Running => false,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A `Failed` state with a placeholder reason.
+    fn failed() -> LifecycleState {
+        LifecycleState::Failed { reason: "x".into() }
+    }
+
+    /// Only `Exited`, `Killed` and `Failed` are terminal.
+    #[test]
+    fn terminal_detection() {
+        for live in &[LifecycleState::Pending, LifecycleState::Running] {
+            assert!(!live.is_terminal());
+        }
+        let finished = [
+            LifecycleState::Exited { code: 0 },
+            LifecycleState::Killed,
+            failed(),
+        ];
+        for state in &finished {
+            assert!(state.is_terminal());
+        }
+    }
+
+    /// Non-zero exits and `Failed` are failures; clean exits, kills and
+    /// running entities are not.
+    #[test]
+    fn failure_semantics() {
+        let clean_exit = LifecycleState::Exited { code: 0 };
+        let bad_exit = LifecycleState::Exited { code: 1 };
+        assert!(!clean_exit.is_failure());
+        assert!(bad_exit.is_failure());
+        assert!(failed().is_failure());
+        assert!(!LifecycleState::Killed.is_failure());
+        assert!(!LifecycleState::Running.is_failure());
+    }
+}
@@ -0,0 +1,57 @@
+//! Time-to-live anclado a un `Instant`.
+
+use std::time::{Duration, Instant};
+
+/// Time-to-live. Marks a deadline instant after which a supervised entity
+/// is considered expired. Runtime-only (not serializable: `Instant` has
+/// no stable representation).
+#[derive(Debug, Clone, Copy)]
+pub struct Ttl {
+    // Absolute instant at which the TTL expires.
+    deadline: Instant,
+}
+
+impl Ttl {
+    /// Creates a TTL that expires `lifetime` after the current instant.
+    ///
+    /// If `now + lifetime` is not representable as an `Instant` (for example
+    /// `Duration::MAX` used to mean "never expires"), the deadline is clamped
+    /// to roughly 30 years from now instead of panicking.
+    pub fn new(lifetime: Duration) -> Self {
+        let now = Instant::now();
+        let deadline = now.checked_add(lifetime).unwrap_or_else(|| {
+            // `Instant` exposes no maximum value, so fall back to a horizon
+            // that is far away yet still expected to be representable.
+            now + Duration::from_secs(86_400 * 365 * 30)
+        });
+        Self { deadline }
+    }
+
+    /// Creates a TTL from an explicit absolute deadline.
+    pub fn from_deadline(deadline: Instant) -> Self {
+        Self { deadline }
+    }
+
+    /// Returns `true` once the deadline has been reached or passed.
+    pub fn expired(&self) -> bool {
+        Instant::now() >= self.deadline
+    }
+
+    /// Time left until the deadline; `Duration::ZERO` if it already passed.
+    pub fn remaining(&self) -> Duration {
+        self.deadline.saturating_duration_since(Instant::now())
+    }
+
+    /// The deadline instant.
+    pub fn deadline(&self) -> Instant {
+        self.deadline
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A freshly created 60 s TTL is live and has most of its time left.
+    #[test]
+    fn fresh_ttl_not_expired() {
+        let ttl = Ttl::new(Duration::from_secs(60));
+        assert!(!ttl.expired());
+        let left = ttl.remaining();
+        assert!(left > Duration::from_secs(58));
+    }
+
+    /// A deadline one second in the past is expired with nothing remaining.
+    #[test]
+    fn past_deadline_is_expired() {
+        let one_second_ago = Instant::now() - Duration::from_secs(1);
+        let ttl = Ttl::from_deadline(one_second_ago);
+        assert!(ttl.expired());
+        assert_eq!(ttl.remaining(), Duration::ZERO);
+    }
+}