diff --git a/Cargo.lock b/Cargo.lock index a64c1e8..97513ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10127,6 +10127,13 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "sandokan-lifecycle" +version = "0.1.0" +dependencies = [ + "serde", +] + [[package]] name = "saphyr-parser" version = "0.0.6" diff --git a/Cargo.toml b/Cargo.toml index 4eca129..da692e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ members = [ "crates/runtime/arje-brain-audit", "crates/runtime/arje-brain", "crates/runtime/arje-echo", + "crates/runtime/sandokan-lifecycle", # ============================================================ # compat/ — Shims D-Bus para correr software systemd-aware diff --git a/crates/runtime/SDD.md b/crates/runtime/SDD.md index 06a2c7c..018d55f 100644 --- a/crates/runtime/SDD.md +++ b/crates/runtime/SDD.md @@ -16,6 +16,22 @@ rule engine + audit log, y un ente de smoke test. | `arje-brain-audit` | lib | Audit chain con hashes anclados al CAS | | `arje-brain` | lib | Integración: introspect + autopromote + metrics | | `arje-echo` | bin | Ente prueba — provee `Capability::Endpoint(echo)` | +| `sandokan-lifecycle` | lib | Primitivas de lifecycle agnósticas (ver abajo) | + +## sandokan-lifecycle + +Library de primitivas de ciclo de vida agnósticas (sin syscalls, sin +proceso, sin UI — solo cálculo). Consumible por cualquier supervisor: +`shuma`, `matilda` Ghost, `charka-shadow`, `mirada`. + +- `Backoff` — backoff exponencial con tope. +- `Ttl` — time-to-live anclado a `Instant`. +- `ResourceQuota` + `check_quota` — cuotas de recursos + breaches. +- `RestartPolicy` + `RestartTracker` — restart con conteo + backoff. +- `LifecycleState` — máquina de estados (Pending/Running/Exited/Failed/Killed). + +15 tests verdes. **Pendiente (A4.2)**: migrar `shuma-core::WorkspaceManager` +para que consuma estas primitivas en lugar de su implementación inline. 
/// Exponential backoff calculator with an upper bound.
///
/// Every call to [`Backoff::next_delay`] hands out the current delay and then
/// doubles it for the following call, saturating at `max`.
#[derive(Debug, Clone)]
pub struct Backoff {
    /// Delay restored by [`Backoff::reset`]; never greater than `max`.
    base: Duration,
    /// Upper bound for every delay handed out.
    max: Duration,
    /// Delay the next [`Backoff::next_delay`] call will return.
    current: Duration,
}

impl Backoff {
    /// Creates a backoff that starts at `base` and saturates at `max`.
    ///
    /// If `base > max`, `base` is clamped down to `max`. A zero `base` never
    /// escalates (doubling zero stays zero), so every delay is zero.
    pub fn new(base: Duration, max: Duration) -> Self {
        let base = base.min(max);
        Self { base, max, current: base }
    }

    /// Returns the current delay and escalates the next one (x2, capped at `max`).
    pub fn next_delay(&mut self) -> Duration {
        let delay = self.current;
        // `Duration * u32` panics on overflow; with a very large `max`
        // (e.g. `Duration::MAX`) the doubling must saturate instead.
        self.current = self.current.saturating_mul(2).min(self.max);
        delay
    }

    /// Goes back to the base delay (call after a success).
    pub fn reset(&mut self) {
        self.current = self.base;
    }

    /// Delay the next `next_delay()` would return, without consuming it.
    pub fn peek(&self) -> Duration {
        self.current
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn escalates_then_caps() {
        let mut b = Backoff::new(Duration::from_millis(100), Duration::from_millis(800));
        assert_eq!(b.next_delay(), Duration::from_millis(100));
        assert_eq!(b.next_delay(), Duration::from_millis(200));
        assert_eq!(b.next_delay(), Duration::from_millis(400));
        assert_eq!(b.next_delay(), Duration::from_millis(800));
        assert_eq!(b.next_delay(), Duration::from_millis(800)); // capped
    }

    #[test]
    fn reset_returns_to_base() {
        let mut b = Backoff::new(Duration::from_millis(100), Duration::from_secs(30));
        b.next_delay();
        b.next_delay();
        b.reset();
        assert_eq!(b.next_delay(), Duration::from_millis(100));
    }

    #[test]
    fn base_clamped_to_max() {
        let mut b = Backoff::new(Duration::from_secs(10), Duration::from_secs(1));
        assert_eq!(b.next_delay(), Duration::from_secs(1));
    }

    #[test]
    fn doubling_saturates_near_duration_max() {
        let mut b = Backoff::new(Duration::from_secs(u64::MAX / 2 + 1), Duration::MAX);
        b.next_delay();
        assert_eq!(b.peek(), Duration::MAX);
    }
}
+ +pub mod backoff; +pub mod ttl; +pub mod quota; +pub mod restart; +pub mod state; + +pub use backoff::Backoff; +pub use ttl::Ttl; +pub use quota::{Breach, QuotaAction, QuotaReport, ResourceQuota, ResourceUsage, check_quota}; +pub use restart::{RestartPolicy, RestartTracker}; +pub use state::LifecycleState; diff --git a/crates/runtime/sandokan-lifecycle/src/quota.rs b/crates/runtime/sandokan-lifecycle/src/quota.rs new file mode 100644 index 0000000..966eca4 --- /dev/null +++ b/crates/runtime/sandokan-lifecycle/src/quota.rs @@ -0,0 +1,124 @@ +//! Cuotas de recursos + chequeo de breaches. + +use serde::{Deserialize, Serialize}; + +/// Acción a tomar cuando una cuota se excede. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +pub enum QuotaAction { + /// No hacer nada (accounting puro). + #[default] + None, + /// Sólo loggear el breach. + Log, + /// Terminar la entidad supervisada. + Kill, +} + +/// Límites declarativos de recursos. `None` = sin límite para ese recurso. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ResourceQuota { + pub mem_bytes: Option, + pub nproc: Option, + /// Porcentaje de CPU (100.0 = 1 core saturado). + pub cpu_pct: Option, +} + +/// Uso de recursos medido en un instante. +#[derive(Debug, Clone, Default)] +pub struct ResourceUsage { + pub mem_bytes: u64, + pub nproc: u32, + pub cpu_pct: f64, +} + +/// Un recurso que excedió su límite. +#[derive(Debug, Clone, PartialEq)] +pub struct Breach { + pub resource: &'static str, + pub used: f64, + pub limit: f64, +} + +/// Resultado de chequear `ResourceUsage` contra `ResourceQuota`. +#[derive(Debug, Clone, Default)] +pub struct QuotaReport { + pub breaches: Vec, +} + +impl QuotaReport { + /// `true` si no hay ningún breach. + pub fn ok(&self) -> bool { + self.breaches.is_empty() + } +} + +/// Compara uso contra cuota y reporta cada recurso excedido. 
+pub fn check_quota(usage: &ResourceUsage, quota: &ResourceQuota) -> QuotaReport { + let mut breaches = Vec::new(); + if let Some(limit) = quota.mem_bytes { + if usage.mem_bytes > limit { + breaches.push(Breach { + resource: "mem_bytes", + used: usage.mem_bytes as f64, + limit: limit as f64, + }); + } + } + if let Some(limit) = quota.nproc { + if usage.nproc > limit { + breaches.push(Breach { + resource: "nproc", + used: usage.nproc as f64, + limit: limit as f64, + }); + } + } + if let Some(limit) = quota.cpu_pct { + if usage.cpu_pct > limit { + breaches.push(Breach { + resource: "cpu_pct", + used: usage.cpu_pct, + limit, + }); + } + } + QuotaReport { breaches } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn within_limits_is_ok() { + let usage = ResourceUsage { mem_bytes: 100, nproc: 2, cpu_pct: 50.0 }; + let quota = ResourceQuota { + mem_bytes: Some(200), nproc: Some(4), cpu_pct: Some(90.0), + }; + assert!(check_quota(&usage, "a).ok()); + } + + #[test] + fn detects_mem_breach() { + let usage = ResourceUsage { mem_bytes: 300, nproc: 1, cpu_pct: 0.0 }; + let quota = ResourceQuota { mem_bytes: Some(200), ..Default::default() }; + let report = check_quota(&usage, "a); + assert!(!report.ok()); + assert_eq!(report.breaches[0].resource, "mem_bytes"); + } + + #[test] + fn no_limit_means_no_breach() { + let usage = ResourceUsage { mem_bytes: u64::MAX, nproc: 9999, cpu_pct: 999.0 }; + assert!(check_quota(&usage, &ResourceQuota::default()).ok()); + } + + #[test] + fn multiple_breaches_reported() { + let usage = ResourceUsage { mem_bytes: 300, nproc: 10, cpu_pct: 200.0 }; + let quota = ResourceQuota { + mem_bytes: Some(100), nproc: Some(2), cpu_pct: Some(100.0), + }; + assert_eq!(check_quota(&usage, "a).breaches.len(), 3); + } +} diff --git a/crates/runtime/sandokan-lifecycle/src/restart.rs b/crates/runtime/sandokan-lifecycle/src/restart.rs new file mode 100644 index 0000000..6235775 --- /dev/null +++ b/crates/runtime/sandokan-lifecycle/src/restart.rs @@ 
-0,0 +1,115 @@ +//! Política de restart con conteo + backoff exponencial. + +use crate::backoff::Backoff; +use serde::{Deserialize, Serialize}; +use std::time::Duration; + +/// Política declarativa de restart. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RestartPolicy { + /// Si reintentar tras una salida con fallo. + pub on_failure: bool, + /// Máximo de restarts. `0` = infinito. + pub max_restarts: u32, +} + +impl Default for RestartPolicy { + fn default() -> Self { + Self { on_failure: false, max_restarts: 0 } + } +} + +/// Estado mutable de restart de una entidad supervisada. Combina la +/// política con un `Backoff` y el conteo de intentos consumidos. +#[derive(Debug, Clone)] +pub struct RestartTracker { + policy: RestartPolicy, + backoff: Backoff, + count: u32, +} + +impl RestartTracker { + pub fn new(policy: RestartPolicy, backoff: Backoff) -> Self { + Self { policy, backoff, count: 0 } + } + + /// Registra un fallo. Devuelve `Some(delay)` con el backoff a esperar + /// antes del próximo intento, o `None` si no se debe reintentar + /// (política desactivada o `max_restarts` agotado). + pub fn on_failure(&mut self) -> Option { + if !self.policy.on_failure { + return None; + } + if self.policy.max_restarts != 0 && self.count >= self.policy.max_restarts { + return None; + } + self.count += 1; + Some(self.backoff.next_delay()) + } + + /// Registra un éxito: resetea conteo y backoff. + pub fn on_success(&mut self) { + self.count = 0; + self.backoff.reset(); + } + + /// Cantidad de restarts consumidos. 
+ pub fn count(&self) -> u32 { + self.count + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn backoff() -> Backoff { + Backoff::new(Duration::from_millis(100), Duration::from_secs(30)) + } + + #[test] + fn disabled_policy_never_restarts() { + let mut t = RestartTracker::new( + RestartPolicy { on_failure: false, max_restarts: 0 }, + backoff(), + ); + assert!(t.on_failure().is_none()); + } + + #[test] + fn respects_max_restarts() { + let mut t = RestartTracker::new( + RestartPolicy { on_failure: true, max_restarts: 3 }, + backoff(), + ); + assert!(t.on_failure().is_some()); + assert!(t.on_failure().is_some()); + assert!(t.on_failure().is_some()); + assert!(t.on_failure().is_none()); // 4º agota la cuota + assert_eq!(t.count(), 3); + } + + #[test] + fn infinite_when_max_zero() { + let mut t = RestartTracker::new( + RestartPolicy { on_failure: true, max_restarts: 0 }, + backoff(), + ); + for _ in 0..100 { + assert!(t.on_failure().is_some()); + } + } + + #[test] + fn backoff_escalates_then_success_resets() { + let mut t = RestartTracker::new( + RestartPolicy { on_failure: true, max_restarts: 0 }, + backoff(), + ); + assert_eq!(t.on_failure(), Some(Duration::from_millis(100))); + assert_eq!(t.on_failure(), Some(Duration::from_millis(200))); + t.on_success(); + assert_eq!(t.count(), 0); + assert_eq!(t.on_failure(), Some(Duration::from_millis(100))); + } +} diff --git a/crates/runtime/sandokan-lifecycle/src/state.rs b/crates/runtime/sandokan-lifecycle/src/state.rs new file mode 100644 index 0000000..227237a --- /dev/null +++ b/crates/runtime/sandokan-lifecycle/src/state.rs @@ -0,0 +1,65 @@ +//! Máquina de estados del ciclo de vida de una entidad supervisada. + +use serde::{Deserialize, Serialize}; + +/// Estado de una entidad supervisada (proceso, workspace, sandbox, ...). +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum LifecycleState { + /// Creada, aún no arrancó. + Pending, + /// En ejecución. 
+ Running, + /// Salió por sí misma con un código de salida. + Exited { code: i32 }, + /// Falló (no llegó a correr, o crasheó de forma no capturable). + Failed { reason: String }, + /// Terminada por el supervisor (SIGKILL / quota / drain). + Killed, +} + +impl LifecycleState { + /// `true` si el estado es terminal (no habrá más transiciones sin + /// un restart explícito). + pub fn is_terminal(&self) -> bool { + matches!( + self, + LifecycleState::Exited { .. } + | LifecycleState::Failed { .. } + | LifecycleState::Killed + ) + } + + /// `true` si el estado terminal cuenta como fallo (dispara restart + /// si la política lo permite). `Exited { code: 0 }` NO es fallo. + pub fn is_failure(&self) -> bool { + match self { + LifecycleState::Exited { code } => *code != 0, + LifecycleState::Failed { .. } => true, + LifecycleState::Killed => false, // kill deliberado, no fallo + _ => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn terminal_detection() { + assert!(!LifecycleState::Pending.is_terminal()); + assert!(!LifecycleState::Running.is_terminal()); + assert!(LifecycleState::Exited { code: 0 }.is_terminal()); + assert!(LifecycleState::Killed.is_terminal()); + assert!(LifecycleState::Failed { reason: "x".into() }.is_terminal()); + } + + #[test] + fn failure_semantics() { + assert!(!LifecycleState::Exited { code: 0 }.is_failure()); + assert!(LifecycleState::Exited { code: 1 }.is_failure()); + assert!(LifecycleState::Failed { reason: "x".into() }.is_failure()); + assert!(!LifecycleState::Killed.is_failure()); + assert!(!LifecycleState::Running.is_failure()); + } +} diff --git a/crates/runtime/sandokan-lifecycle/src/ttl.rs b/crates/runtime/sandokan-lifecycle/src/ttl.rs new file mode 100644 index 0000000..3c2b7ca --- /dev/null +++ b/crates/runtime/sandokan-lifecycle/src/ttl.rs @@ -0,0 +1,57 @@ +//! Time-to-live anclado a un `Instant`. + +use std::time::{Duration, Instant}; + +/// Time-to-live. 
/// Time-to-live: a deadline after which a supervised entity counts as expired.
///
/// Runtime-only (not serializable): it wraps an [`Instant`], which has no
/// stable external representation.
#[derive(Debug, Clone, Copy)]
pub struct Ttl {
    deadline: Instant,
}

impl Ttl {
    /// Creates a TTL that expires `lifetime` after now.
    ///
    /// `Instant + Duration` panics when the sum cannot be represented, so a
    /// huge `lifetime` (for example `Duration::MAX` used as "never") is
    /// clamped to a far-future deadline instead of panicking.
    pub fn new(lifetime: Duration) -> Self {
        let now = Instant::now();
        let mut span = lifetime;
        loop {
            if let Some(deadline) = now.checked_add(span) {
                return Self { deadline };
            }
            // Halve until representable; this terminates because adding a
            // zero duration always succeeds.
            span /= 2;
        }
    }

    /// Creates a TTL from an explicit absolute deadline.
    pub fn from_deadline(deadline: Instant) -> Self {
        Self { deadline }
    }

    /// `true` once the deadline has been reached or passed.
    pub fn expired(&self) -> bool {
        Instant::now() >= self.deadline
    }

    /// Time left until the deadline; `Duration::ZERO` once it has passed.
    pub fn remaining(&self) -> Duration {
        self.deadline.saturating_duration_since(Instant::now())
    }

    /// The deadline instant.
    pub fn deadline(&self) -> Instant {
        self.deadline
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fresh_ttl_not_expired() {
        let t = Ttl::new(Duration::from_secs(60));
        assert!(!t.expired());
        assert!(t.remaining() > Duration::from_secs(58));
    }

    #[test]
    fn past_deadline_is_expired() {
        // `Instant - Duration` can panic when the clock's origin is too recent;
        // fall back to "now", which is expired as well.
        let past = Instant::now()
            .checked_sub(Duration::from_secs(1))
            .unwrap_or_else(Instant::now);
        let t = Ttl::from_deadline(past);
        assert!(t.expired());
        assert_eq!(t.remaining(), Duration::ZERO);
    }

    #[test]
    fn huge_lifetime_is_clamped() {
        let t = Ttl::new(Duration::MAX);
        assert!(!t.expired());
    }
}