feat(shipote): drain shutdown + persist live pipelines + batched query (fase N)

- Daemon SIGTERM/SIGINT: snapshot ANTES, stop_with_grace(1s) de todos
  los workspaces DESPUÉS. Grace permite app-level cleanup.
- Snapshot v3 con live_pipelines: pipeline_supervisors se persisten;
  daemon relanza al restore con sus recursos (Incarnator+DiscernPipeline).
  RestoreOutcome separado para que core no necesite incarnator.
  Forward-compat v1/v2 via #[serde(default)].
- WorkspaceFullSummary: stats+quota+commands+flow_sockets en 1 roundtrip.
  Shell reduce N×4 requests/probe a N×1 + 4 globales.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 10:48:11 +00:00
parent c3f9c9e36a
commit a823c40fe1
4 changed files with 185 additions and 47 deletions
+103 -9
View File
@@ -64,14 +64,43 @@ async fn main() -> anyhow::Result<()> {
}));
// Restaurar snapshot previo si existe. Workspaces se recrean; los
// pids de comandos viejos NO se recuperan (kernel los mató).
// pids de comandos viejos NO se recuperan (kernel los mató). Los
// pipelines vivos (con supervisor) se relanzan desde cero.
let snapshot_path = shipote_core::persist::default_snapshot_path();
if let Err(e) = mgr.restore_snapshot(&snapshot_path).await {
warn!(?e, "restore_snapshot falló — start fresh");
let restore = match mgr.restore_snapshot(&snapshot_path).await {
Ok(r) => r,
Err(e) => {
warn!(?e, "restore_snapshot falló — start fresh");
shipote_core::persist::RestoreOutcome::default()
}
};
// Relauncher de live_pipelines: como necesita inc+disc del daemon,
// lo hacemos acá tras el restore. Cada uno sigue el mismo flujo que un
// run normal — register_pipeline_commands + register_pipeline_supervisor.
for entry in restore.live_pipelines {
let inc = mgr.incarnator_handle();
let disc = Arc::new(DiscernPipeline::default_pipeline());
let workspace = entry.workspace;
let ws_label = mgr.workspace_label(workspace).await.unwrap_or_default();
let tap = entry.tap;
let spec = entry.spec;
match shipote_core::pipeline::run_pipeline(
&spec, &ws_label, tap, disc, inc, Some(mgr.clone()),
)
.await
{
Ok(launch) => {
mgr.register_pipeline_commands(workspace, launch.pipeline, launch.command_pids.clone()).await;
mgr.register_pipeline_supervisor(launch.pipeline, workspace, spec, tap).await;
info!(label = %launch.pipeline, "live pipeline relaunched from snapshot");
}
Err(e) => warn!(?e, "live pipeline relaunch failed"),
}
}
// Save-on-shutdown via SIGTERM/SIGINT handler. tokio::signal soporta
// ambos en Linux.
// Shutdown handler: SIGTERM/SIGINT → snapshot → drain (stop_with_grace
// de todos los workspaces) → exit. El drain usa grace=1s para dar
// chance a los comandos a terminar limpio antes del SIGKILL.
{
let mgr = mgr.clone();
let path = snapshot_path.clone();
@@ -80,13 +109,32 @@ async fn main() -> anyhow::Result<()> {
.expect("SIGTERM handler");
let mut int = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::interrupt())
.expect("SIGINT handler");
tokio::select! {
_ = term.recv() => info!("SIGTERM — saving snapshot"),
_ = int.recv() => info!("SIGINT — saving snapshot"),
}
let sig_name = tokio::select! {
_ = term.recv() => "SIGTERM",
_ = int.recv() => "SIGINT",
};
info!(signal = sig_name, "shipote-daemon shutdown: draining workspaces");
// 1) Snapshot ANTES del drain — preserva intención declarada
// (los workspace specs siguen vivos en el snapshot aunque
// matemos los procesos hijos).
if let Err(e) = mgr.save_snapshot(&path).await {
warn!(?e, "save_snapshot falló");
}
// 2) Drain: stop_with_grace de todos los workspaces vivos.
// Grace 1s da chance a apps Type=notify de hacer cleanup.
let workspaces = mgr.list().await;
let n = workspaces.len();
for ws in workspaces {
if let Err(e) = mgr
.stop_with_grace(ws.id, std::time::Duration::from_millis(1000))
.await
{
warn!(?e, %ws.id, "stop_with_grace falló en drain");
}
}
info!(drained = n, "drain complete");
std::process::exit(0);
});
}
@@ -417,6 +465,52 @@ async fn dispatch(
},
},
Request::WorkspaceFullSummary { workspace } => {
let stats = match mgr.workspace_stats(workspace).await {
Some(s) => WorkspaceStatsInfo {
commands_alive: s.commands_alive,
commands_total: s.commands_total,
rss_bytes: s.rss_bytes,
rss_peak_bytes: s.rss_peak_bytes,
cpu_usec: s.cpu_usec,
cpu_percent: s.cpu_percent,
cpu_cores: s.cpu_cores,
source: s.source,
uptime_ms: s.uptime_ms,
},
None => return Response::Error { message: format!("workspace {workspace} not found") },
};
let quota = match mgr.workspace_quota(workspace).await {
Some(q) => QuotaReportInfo {
mem_limit: q.mem_limit,
nproc_limit: q.nproc_limit,
breaches: q.breaches,
},
None => QuotaReportInfo { mem_limit: None, nproc_limit: None, breaches: Vec::new() },
};
let commands = mgr
.list_commands(workspace)
.await
.into_iter()
.map(|c| ProtoCommandInfo {
id: c.id,
label: c.label,
pid: c.pid,
alive: c.alive,
exit_status: c.exit_status,
log_bytes: c.log_bytes,
})
.collect();
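// Flow sockets: OJO — acá NO se filtra por workspace; se aplanan los
// sockets de TODOS los pipelines que devuelve list_flow_pipelines().
// TODO(review): filtrar por `workspace` para devolver sólo los de este.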
let flow_sockets = mgr
.list_flow_pipelines()
.await
.into_iter()
.flat_map(|(_, sockets)| sockets)
.collect();
Response::WorkspaceFullSummary { stats, quota, commands, flow_sockets }
}
Request::WorkspaceQuota { workspace } => match mgr.workspace_quota(workspace).await {
Some(q) => Response::WorkspaceQuota {
info: QuotaReportInfo {
+10 -27
View File
@@ -174,42 +174,25 @@ fn probe_blocking(path: &std::path::Path) -> Result<Snapshot, String> {
other => return Err(format!("unexpected list resp: {other:?}")),
};
// Commands por workspace.
// Batched: stats+quota+commands+flow_sockets en 1 roundtrip por ws.
let mut commands_map = std::collections::BTreeMap::new();
let mut fresh_stats = std::collections::BTreeMap::new();
let mut quotas = std::collections::BTreeMap::new();
for w in &workspaces {
write_frame(&mut stream, &Request::CommandList { workspace: w.id })
write_frame(&mut stream, &Request::WorkspaceFullSummary { workspace: w.id })
.await
.map_err(|e| format!("write commands: {e}"))?;
.map_err(|e| format!("write summary: {e}"))?;
let resp: Response = read_frame(&mut stream)
.await
.map_err(|e| format!("read commands: {e}"))?;
if let Response::CommandList { items } = resp {
if !items.is_empty() {
commands_map.insert(w.id.to_string(), items);
.map_err(|e| format!("read summary: {e}"))?;
if let Response::WorkspaceFullSummary { stats, quota, commands, .. } = resp {
let key = w.id.to_string();
fresh_stats.insert(key.clone(), stats);
quotas.insert(key.clone(), quota);
if !commands.is_empty() {
commands_map.insert(key, commands);
}
}
// Stats por workspace.
write_frame(&mut stream, &Request::WorkspaceStats { workspace: w.id })
.await
.map_err(|e| format!("write stats: {e}"))?;
let resp: Response = read_frame(&mut stream)
.await
.map_err(|e| format!("read stats: {e}"))?;
if let Response::WorkspaceStats { info } = resp {
fresh_stats.insert(w.id.to_string(), info);
}
// Quota por workspace.
write_frame(&mut stream, &Request::WorkspaceQuota { workspace: w.id })
.await
.map_err(|e| format!("write quota: {e}"))?;
let resp: Response = read_frame(&mut stream)
.await
.map_err(|e| format!("read quota: {e}"))?;
if let Response::WorkspaceQuota { info } = resp {
quotas.insert(w.id.to_string(), info);
}
}
// Saved pipelines.