feat(shipote): multi-core CPU% + quota report + restart-on-failure (fase K)
- WorkspaceStats.cpu_cores via sysconf cacheado. CLI muestra `cpu_pct: 98.7 % (24.7% total / 4 cores)`. - workspace_quota compara SomaSpec.rlimits contra accounting actual. Reporta breaches humanos. NO enforcement automático en v1. - run_with_options(.., restart_on_failure): si exit != 0, reaper relaunch con backoff exponencial 200ms → 30s cap. Inner.restart_specs persiste el spec entre intentos. 81 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 22, shipote-discern 5, yahweh-provider-fs 3). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -94,6 +94,24 @@ struct Inner {
|
||||
/// el reaper los borra (futuro). v1: viven hasta `stop_pipeline_flows`
|
||||
/// explícito o hasta shutdown.
|
||||
pipeline_flows: HashMap<Ulid, Vec<crate::flow_channel::FlowChannel>>,
|
||||
/// Specs de comandos `run()` con `restart_on_failure=true`. Indexed
|
||||
/// por command_id. Cuando `reap_dead` detecta exit!=0, se relauncha
|
||||
/// con la misma spec (nuevo pid y nuevo command_id se asigna por
|
||||
/// el nuevo state pero el restart_spec sigue ligado al original).
|
||||
restart_specs: HashMap<Ulid, RestartSpec>,
|
||||
}
|
||||
|
||||
/// Launch parameters kept for a command that was started with
/// `restart_on_failure`, so it can be launched again with the same
/// executable, arguments and environment, plus its backoff bookkeeping.
#[derive(Debug, Clone)]
|
||||
struct RestartSpec {
|
||||
// Workspace the command is relaunched in.
workspace: WorkspaceId,
|
||||
// Executable, argument vector and environment pairs handed to the relaunch.
exec: String,
|
||||
argv: Vec<String>,
|
||||
envp: Vec<(String, String)>,
|
||||
/// Initial backoff (ms). Grows exponentially up to `max_backoff_ms`.
|
||||
backoff_ms: u64,
|
||||
// Upper bound (ms) that `backoff_ms` is clamped to as it grows.
max_backoff_ms: u64,
|
||||
/// Number of restarts already performed (for tracking).
|
||||
restart_count: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -179,6 +197,7 @@ impl WorkspaceManager {
|
||||
workspaces: HashMap::new(),
|
||||
saved_pipelines: HashMap::new(),
|
||||
pipeline_flows: HashMap::new(),
|
||||
restart_specs: HashMap::new(),
|
||||
})),
|
||||
incarnator: Arc::new(Incarnator::new(cfg)),
|
||||
}
|
||||
@@ -353,6 +372,39 @@ impl WorkspaceManager {
|
||||
.map(|w| w.spec.label.clone())
|
||||
}
|
||||
|
||||
/// Compara accounting real (RSS, commands_alive) contra los rlimits
|
||||
/// declarados en `SomaSpec`. Devuelve violaciones humanizadas. NO
|
||||
/// hace enforcement automático.
|
||||
pub async fn workspace_quota(&self, id: WorkspaceId) -> Option<stats::QuotaReport> {
|
||||
let stats_now = self.workspace_stats(id).await?;
|
||||
let g = self.inner.lock().await;
|
||||
let ws = g.workspaces.get(&id)?;
|
||||
let rl = &ws.spec.soma.rlimits;
|
||||
let mut report = stats::QuotaReport {
|
||||
mem_limit: rl.mem_bytes,
|
||||
nproc_limit: rl.nproc,
|
||||
breaches: Vec::new(),
|
||||
};
|
||||
if let (Some(limit), Some(used)) = (rl.mem_bytes, stats_now.rss_bytes) {
|
||||
if used > limit {
|
||||
report.breaches.push(format!(
|
||||
"memory: {:.2} MiB > {:.2} MiB limit",
|
||||
used as f64 / 1024.0 / 1024.0,
|
||||
limit as f64 / 1024.0 / 1024.0,
|
||||
));
|
||||
}
|
||||
}
|
||||
if let Some(limit) = rl.nproc {
|
||||
if stats_now.commands_alive > limit {
|
||||
report.breaches.push(format!(
|
||||
"nproc: {} alive > {} limit",
|
||||
stats_now.commands_alive, limit
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(report)
|
||||
}
|
||||
|
||||
/// Estadísticas de recursos del workspace: RSS + CPU agregado de sus
|
||||
/// comandos vivos. Lee `/proc/<pid>/` directamente; si el spec declara
|
||||
/// `soma.cgroup.path`, también intenta el cgroup (más preciso, incluye
|
||||
@@ -531,6 +583,20 @@ impl WorkspaceManager {
|
||||
exec: String,
|
||||
argv: Vec<String>,
|
||||
envp: Vec<(String, String)>,
|
||||
) -> Result<CommandSummary, CoreError> {
|
||||
self.run_with_options(id, exec, argv, envp, false).await
|
||||
}
|
||||
|
||||
/// Variante con `restart_on_failure`: si el comando muere con
|
||||
/// exit_status != 0, el reaper lo relauncha con backoff exponencial
|
||||
/// (200ms → 400 → 800 → … cap 30s).
|
||||
pub async fn run_with_options(
|
||||
&self,
|
||||
id: WorkspaceId,
|
||||
exec: String,
|
||||
argv: Vec<String>,
|
||||
envp: Vec<(String, String)>,
|
||||
restart_on_failure: bool,
|
||||
) -> Result<CommandSummary, CoreError> {
|
||||
let workspace_label = {
|
||||
let g = self.inner.lock().await;
|
||||
@@ -593,6 +659,23 @@ impl WorkspaceManager {
|
||||
},
|
||||
);
|
||||
}
|
||||
if restart_on_failure {
|
||||
// Reextract exec/argv/envp del payload del CommandRef.
|
||||
if let Payload::Native { exec, argv, envp } = &cmd_ref.payload {
|
||||
g.restart_specs.insert(
|
||||
cmd_id,
|
||||
RestartSpec {
|
||||
workspace: id,
|
||||
exec: exec.clone(),
|
||||
argv: argv.clone(),
|
||||
envp: envp.clone(),
|
||||
backoff_ms: 200,
|
||||
max_backoff_ms: 30_000,
|
||||
restart_count: 0,
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
for d in &out.degradations {
|
||||
warn!(?d, %id, "command incarnation degradation");
|
||||
}
|
||||
@@ -693,25 +776,99 @@ impl WorkspaceManager {
|
||||
|
||||
/// Cosecha hijos terminados (no-bloqueante). Llamar periódicamente desde
|
||||
/// el daemon o ante SIGCHLD. Marca `alive=false` y guarda exit_status.
|
||||
pub async fn reap_dead(&self) {
|
||||
let mut g = self.inner.lock().await;
|
||||
for ws in g.workspaces.values_mut() {
|
||||
for cmd in ws.commands.values_mut() {
|
||||
if !cmd.alive {
|
||||
continue;
|
||||
}
|
||||
match waitpid(cmd.pid, Some(WaitPidFlag::WNOHANG)) {
|
||||
Ok(WaitStatus::Exited(_, code)) => {
|
||||
cmd.alive = false;
|
||||
cmd.exit_status = Some(code);
|
||||
pub async fn reap_dead(self: &Arc<Self>) {
|
||||
let mut to_restart: Vec<RestartSpec> = Vec::new();
|
||||
{
|
||||
let mut g = self.inner.lock().await;
|
||||
for ws in g.workspaces.values_mut() {
|
||||
for cmd in ws.commands.values_mut() {
|
||||
if !cmd.alive {
|
||||
continue;
|
||||
}
|
||||
Ok(WaitStatus::Signaled(_, sig, _)) => {
|
||||
cmd.alive = false;
|
||||
cmd.exit_status = Some(128 + (sig as i32));
|
||||
match waitpid(cmd.pid, Some(WaitPidFlag::WNOHANG)) {
|
||||
Ok(WaitStatus::Exited(_, code)) => {
|
||||
cmd.alive = false;
|
||||
cmd.exit_status = Some(code);
|
||||
}
|
||||
Ok(WaitStatus::Signaled(_, sig, _)) => {
|
||||
cmd.alive = false;
|
||||
cmd.exit_status = Some(128 + (sig as i32));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
// Detectar restart_specs cuyo command_id ya está dead con exit!=0.
|
||||
let mut to_remove: Vec<Ulid> = Vec::new();
|
||||
for (cmd_id, spec) in g.restart_specs.iter() {
|
||||
let mut should_restart = false;
|
||||
let mut should_drop = false;
|
||||
'outer: for ws in g.workspaces.values() {
|
||||
if let Some(cmd) = ws.commands.get(cmd_id) {
|
||||
if !cmd.alive {
|
||||
match cmd.exit_status {
|
||||
Some(0) => should_drop = true,
|
||||
Some(_) => should_restart = true,
|
||||
None => {}
|
||||
}
|
||||
break 'outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
if should_drop {
|
||||
to_remove.push(*cmd_id);
|
||||
} else if should_restart {
|
||||
to_restart.push(spec.clone());
|
||||
to_remove.push(*cmd_id);
|
||||
}
|
||||
}
|
||||
for id in to_remove {
|
||||
g.restart_specs.remove(&id);
|
||||
}
|
||||
}
|
||||
// Schedule restart fuera del lock.
|
||||
for mut spec in to_restart {
|
||||
let mgr = self.clone();
|
||||
let backoff = std::time::Duration::from_millis(spec.backoff_ms);
|
||||
// Subir el backoff para la PRÓXIMA falla, no esta.
|
||||
spec.backoff_ms = (spec.backoff_ms * 2).min(spec.max_backoff_ms);
|
||||
spec.restart_count += 1;
|
||||
let restart_n = spec.restart_count;
|
||||
tokio::spawn(async move {
|
||||
tokio::time::sleep(backoff).await;
|
||||
info!(
|
||||
backoff_ms = backoff.as_millis() as u64,
|
||||
restart = restart_n,
|
||||
"restarting failed command"
|
||||
);
|
||||
let workspace = spec.workspace;
|
||||
if let Err(e) = mgr
|
||||
.run_with_options(workspace, spec.exec.clone(), spec.argv.clone(), spec.envp.clone(), true)
|
||||
.await
|
||||
{
|
||||
warn!(?e, "restart failed to launch");
|
||||
return;
|
||||
}
|
||||
// Preservar backoff acumulado: localizar el nuevo command_id
|
||||
// (el más reciente vivo en el workspace) y sobreescribir.
|
||||
let new_cmd_id = {
|
||||
let g = mgr.inner.lock().await;
|
||||
g.workspaces.get(&workspace).and_then(|ws| {
|
||||
ws.commands
|
||||
.values()
|
||||
.filter(|c| c.alive)
|
||||
.max_by_key(|c| c.id)
|
||||
.map(|c| c.id)
|
||||
})
|
||||
};
|
||||
if let Some(new_id) = new_cmd_id {
|
||||
let mut g = mgr.inner.lock().await;
|
||||
if let Some(existing) = g.restart_specs.get_mut(&new_id) {
|
||||
existing.backoff_ms = spec.backoff_ms;
|
||||
existing.restart_count = spec.restart_count;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -850,6 +1007,41 @@ mod tests {
|
||||
panic!("logs never captured on both streams");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn restart_on_failure_relaunches_failing_command() {
|
||||
let mgr = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
|
||||
let spec = WorkspaceSpec {
|
||||
label: "restart".into(),
|
||||
soma: Default::default(),
|
||||
permissions: Default::default(),
|
||||
ttl: None,
|
||||
flow_dirs: vec![],
|
||||
on_exit: shipote_card::ExitPolicy::Reap,
|
||||
};
|
||||
let (id, _) = mgr.create(spec).await.unwrap();
|
||||
// /bin/false sale con exit=1. Con restart_on_failure=true debería
|
||||
// relanzarse al menos 1 vez (tras el backoff inicial de 200ms).
|
||||
let summary = mgr
|
||||
.run_with_options(id, "/bin/false".into(), vec![], vec![], true)
|
||||
.await
|
||||
.unwrap();
|
||||
let original_id = summary.id;
|
||||
// Esperamos ~500ms para que termine + reap + restart corra.
|
||||
for _ in 0..30 {
|
||||
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||
mgr.reap_dead().await;
|
||||
let g = mgr.inner.lock().await;
|
||||
if let Some(ws) = g.workspaces.get(&id) {
|
||||
let new_cmds: Vec<_> = ws.commands.keys().filter(|k| **k != original_id).collect();
|
||||
if !new_cmds.is_empty() {
|
||||
// Hay un nuevo command_id → restart funcionó.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!("restart never launched a new command");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn run_true_in_workspace() {
|
||||
let mgr = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
|
||||
|
||||
Reference in New Issue
Block a user