feat(shipote): multi-core CPU% + quota report + restart-on-failure (fase K)

- WorkspaceStats.cpu_cores via sysconf cacheado. CLI muestra
  `cpu_pct: 98.7 % (24.7% total / 4 cores)`.
- workspace_quota compara SomaSpec.rlimits contra accounting actual.
  Reporta breaches humanos. NO enforcement automático en v1.
- run_with_options(.., restart_on_failure): si el comando termina con
  exit != 0, el reaper lo relanza con backoff exponencial (200 ms → tope
  de 30 s). Inner.restart_specs persiste el spec entre intentos.

81 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 22, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 01:32:39 +00:00
parent d8727a3038
commit 324a0c2d5d
5 changed files with 331 additions and 20 deletions
+207 -15
View File
@@ -94,6 +94,24 @@ struct Inner {
/// el reaper los borra (futuro). v1: viven hasta `stop_pipeline_flows`
/// explícito o hasta shutdown.
pipeline_flows: HashMap<Ulid, Vec<crate::flow_channel::FlowChannel>>,
/// Specs of `run()` commands launched with `restart_on_failure=true`,
/// indexed by command_id. When `reap_dead` sees one of them dead with
/// exit!=0 it removes the entry and relaunches with the same
/// exec/argv/envp; the relaunch gets a new pid and command_id and
/// registers a fresh entry under that new command_id.
restart_specs: HashMap<Ulid, RestartSpec>,
}
/// Launch parameters remembered for a command started with
/// `restart_on_failure = true`, so the reaper can relaunch it after a
/// non-zero exit.
#[derive(Debug, Clone)]
struct RestartSpec {
/// Workspace the command is (re)launched in.
workspace: WorkspaceId,
/// Executable handed back to `run_with_options` on relaunch.
exec: String,
/// Argument vector handed back to `run_with_options` on relaunch.
argv: Vec<String>,
/// Environment pairs handed back to `run_with_options` on relaunch.
envp: Vec<(String, String)>,
/// Delay (ms) to wait before the next relaunch. Doubled after every
/// restart, capped at `max_backoff_ms`.
backoff_ms: u64,
/// Upper bound (ms) for `backoff_ms`.
max_backoff_ms: u64,
/// Number of restarts already performed (tracking only).
restart_count: u32,
}
#[derive(Debug, Clone)]
@@ -179,6 +197,7 @@ impl WorkspaceManager {
workspaces: HashMap::new(),
saved_pipelines: HashMap::new(),
pipeline_flows: HashMap::new(),
restart_specs: HashMap::new(),
})),
incarnator: Arc::new(Incarnator::new(cfg)),
}
@@ -353,6 +372,39 @@ impl WorkspaceManager {
.map(|w| w.spec.label.clone())
}
/// Checks the workspace's current accounting (RSS, live command count)
/// against the rlimits declared in its `SomaSpec` and reports every
/// exceeded limit as a human-readable message.
///
/// Returns `None` when `workspace_stats` yields nothing for `id` or when
/// the workspace is not registered. This only reports; nothing is
/// enforced here.
pub async fn workspace_quota(&self, id: WorkspaceId) -> Option<stats::QuotaReport> {
    // The stats snapshot is taken before the lock is acquired, same order
    // as before. NOTE(review): presumably `workspace_stats` takes
    // `self.inner` itself — confirm before reordering.
    let snapshot = self.workspace_stats(id).await?;
    let guard = self.inner.lock().await;
    let limits = &guard.workspaces.get(&id)?.spec.soma.rlimits;

    let mut breaches = Vec::new();

    // Memory is only comparable when both a limit and an RSS reading exist.
    match (limits.mem_bytes, snapshot.rss_bytes) {
        (Some(limit), Some(used)) if used > limit => breaches.push(format!(
            "memory: {:.2} MiB > {:.2} MiB limit",
            used as f64 / 1024.0 / 1024.0,
            limit as f64 / 1024.0 / 1024.0,
        )),
        _ => {}
    }

    // Process count: number of live commands versus the declared nproc.
    match limits.nproc {
        Some(limit) if snapshot.commands_alive > limit => breaches.push(format!(
            "nproc: {} alive > {} limit",
            snapshot.commands_alive, limit
        )),
        _ => {}
    }

    Some(stats::QuotaReport {
        mem_limit: limits.mem_bytes,
        nproc_limit: limits.nproc,
        breaches,
    })
}
/// Estadísticas de recursos del workspace: RSS + CPU agregado de sus
/// comandos vivos. Lee `/proc/<pid>/` directamente; si el spec declara
/// `soma.cgroup.path`, también intenta el cgroup (más preciso, incluye
@@ -531,6 +583,20 @@ impl WorkspaceManager {
exec: String,
argv: Vec<String>,
envp: Vec<(String, String)>,
) -> Result<CommandSummary, CoreError> {
self.run_with_options(id, exec, argv, envp, false).await
}
/// Variante con `restart_on_failure`: si el comando muere con
/// exit_status != 0, el reaper lo relauncha con backoff exponencial
/// (200ms → 400 → 800 → … cap 30s).
pub async fn run_with_options(
&self,
id: WorkspaceId,
exec: String,
argv: Vec<String>,
envp: Vec<(String, String)>,
restart_on_failure: bool,
) -> Result<CommandSummary, CoreError> {
let workspace_label = {
let g = self.inner.lock().await;
@@ -593,6 +659,23 @@ impl WorkspaceManager {
},
);
}
if restart_on_failure {
// Reextract exec/argv/envp del payload del CommandRef.
if let Payload::Native { exec, argv, envp } = &cmd_ref.payload {
g.restart_specs.insert(
cmd_id,
RestartSpec {
workspace: id,
exec: exec.clone(),
argv: argv.clone(),
envp: envp.clone(),
backoff_ms: 200,
max_backoff_ms: 30_000,
restart_count: 0,
},
);
}
}
for d in &out.degradations {
warn!(?d, %id, "command incarnation degradation");
}
@@ -693,25 +776,99 @@ impl WorkspaceManager {
/// Cosecha hijos terminados (no-bloqueante). Llamar periódicamente desde
/// el daemon o ante SIGCHLD. Marca `alive=false` y guarda exit_status.
pub async fn reap_dead(&self) {
let mut g = self.inner.lock().await;
for ws in g.workspaces.values_mut() {
for cmd in ws.commands.values_mut() {
if !cmd.alive {
continue;
}
match waitpid(cmd.pid, Some(WaitPidFlag::WNOHANG)) {
Ok(WaitStatus::Exited(_, code)) => {
cmd.alive = false;
cmd.exit_status = Some(code);
pub async fn reap_dead(self: &Arc<Self>) {
let mut to_restart: Vec<RestartSpec> = Vec::new();
{
let mut g = self.inner.lock().await;
for ws in g.workspaces.values_mut() {
for cmd in ws.commands.values_mut() {
if !cmd.alive {
continue;
}
Ok(WaitStatus::Signaled(_, sig, _)) => {
cmd.alive = false;
cmd.exit_status = Some(128 + (sig as i32));
match waitpid(cmd.pid, Some(WaitPidFlag::WNOHANG)) {
Ok(WaitStatus::Exited(_, code)) => {
cmd.alive = false;
cmd.exit_status = Some(code);
}
Ok(WaitStatus::Signaled(_, sig, _)) => {
cmd.alive = false;
cmd.exit_status = Some(128 + (sig as i32));
}
_ => {}
}
_ => {}
}
}
// Detectar restart_specs cuyo command_id ya está dead con exit!=0.
let mut to_remove: Vec<Ulid> = Vec::new();
for (cmd_id, spec) in g.restart_specs.iter() {
let mut should_restart = false;
let mut should_drop = false;
'outer: for ws in g.workspaces.values() {
if let Some(cmd) = ws.commands.get(cmd_id) {
if !cmd.alive {
match cmd.exit_status {
Some(0) => should_drop = true,
Some(_) => should_restart = true,
None => {}
}
break 'outer;
}
}
}
if should_drop {
to_remove.push(*cmd_id);
} else if should_restart {
to_restart.push(spec.clone());
to_remove.push(*cmd_id);
}
}
for id in to_remove {
g.restart_specs.remove(&id);
}
}
// Schedule restart fuera del lock.
for mut spec in to_restart {
let mgr = self.clone();
let backoff = std::time::Duration::from_millis(spec.backoff_ms);
// Subir el backoff para la PRÓXIMA falla, no esta.
spec.backoff_ms = (spec.backoff_ms * 2).min(spec.max_backoff_ms);
spec.restart_count += 1;
let restart_n = spec.restart_count;
tokio::spawn(async move {
tokio::time::sleep(backoff).await;
info!(
backoff_ms = backoff.as_millis() as u64,
restart = restart_n,
"restarting failed command"
);
let workspace = spec.workspace;
if let Err(e) = mgr
.run_with_options(workspace, spec.exec.clone(), spec.argv.clone(), spec.envp.clone(), true)
.await
{
warn!(?e, "restart failed to launch");
return;
}
// Preservar backoff acumulado: localizar el nuevo command_id
// (el más reciente vivo en el workspace) y sobreescribir.
let new_cmd_id = {
let g = mgr.inner.lock().await;
g.workspaces.get(&workspace).and_then(|ws| {
ws.commands
.values()
.filter(|c| c.alive)
.max_by_key(|c| c.id)
.map(|c| c.id)
})
};
if let Some(new_id) = new_cmd_id {
let mut g = mgr.inner.lock().await;
if let Some(existing) = g.restart_specs.get_mut(&new_id) {
existing.backoff_ms = spec.backoff_ms;
existing.restart_count = spec.restart_count;
}
}
});
}
}
}
@@ -850,6 +1007,41 @@ mod tests {
panic!("logs never captured on both streams");
}
/// A command that exits non-zero and was started with
/// `restart_on_failure = true` must show up again under a new command id.
#[tokio::test]
async fn restart_on_failure_relaunches_failing_command() {
    let mgr = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
    let (id, _) = mgr
        .create(WorkspaceSpec {
            label: "restart".into(),
            soma: Default::default(),
            permissions: Default::default(),
            ttl: None,
            flow_dirs: vec![],
            on_exit: shipote_card::ExitPolicy::Reap,
        })
        .await
        .unwrap();

    // /bin/false exits with status 1, so it should be relaunched at least
    // once after the initial 200 ms backoff.
    let first_run = mgr
        .run_with_options(id, "/bin/false".into(), vec![], vec![], true)
        .await
        .unwrap();
    let original_id = first_run.id;

    // Poll up to 30 times, 50 ms apart (1.5 s total), reaping on every
    // round so the exit is noticed and the restart gets scheduled.
    for _ in 0..30 {
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        mgr.reap_dead().await;
        let guard = mgr.inner.lock().await;
        let relaunched = guard
            .workspaces
            .get(&id)
            .map_or(false, |ws| ws.commands.keys().any(|k| *k != original_id));
        if relaunched {
            // A second command id exists, so the restart happened.
            return;
        }
    }
    panic!("restart never launched a new command");
}
#[tokio::test]
async fn run_true_in_workspace() {
let mgr = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));