feat(shipote): drain shutdown + persist live pipelines + batched query (fase N)
- Daemon SIGTERM/SIGINT: snapshot ANTES, stop_with_grace(1s) de todos los workspaces DESPUÉS. Grace permite app-level cleanup.
- Snapshot v3 con live_pipelines: pipeline_supervisors se persisten; daemon relanza al restore con sus recursos (Incarnator+DiscernPipeline). RestoreOutcome separado para que core no necesite incarnator. Forward-compat v1/v2 via #[serde(default)].
- WorkspaceFullSummary: stats+quota+commands+flow_sockets en 1 roundtrip. Shell reduce N×4 requests/probe a N×1 + 4 globales.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -64,14 +64,43 @@ async fn main() -> anyhow::Result<()> {
|
||||
}));
|
||||
|
||||
// Restaurar snapshot previo si existe. Workspaces se recrean; los
|
||||
// pids de comandos viejos NO se recuperan (kernel los mató).
|
||||
// pids de comandos viejos NO se recuperan (kernel los mató). Los
|
||||
// pipelines vivos (con supervisor) se relanzan desde cero.
|
||||
let snapshot_path = shipote_core::persist::default_snapshot_path();
|
||||
if let Err(e) = mgr.restore_snapshot(&snapshot_path).await {
|
||||
warn!(?e, "restore_snapshot falló — start fresh");
|
||||
let restore = match mgr.restore_snapshot(&snapshot_path).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
warn!(?e, "restore_snapshot falló — start fresh");
|
||||
shipote_core::persist::RestoreOutcome::default()
|
||||
}
|
||||
};
|
||||
// Relauncher de live_pipelines: como necesita inc+disc del daemon,
|
||||
// lo hacemos acá tras el restore. Cada uno sigue el mismo flujo que un run
|
||||
// normal — register_pipeline_commands + register_pipeline_supervisor.
|
||||
for entry in restore.live_pipelines {
|
||||
let inc = mgr.incarnator_handle();
|
||||
let disc = Arc::new(DiscernPipeline::default_pipeline());
|
||||
let workspace = entry.workspace;
|
||||
let ws_label = mgr.workspace_label(workspace).await.unwrap_or_default();
|
||||
let tap = entry.tap;
|
||||
let spec = entry.spec;
|
||||
match shipote_core::pipeline::run_pipeline(
|
||||
&spec, &ws_label, tap, disc, inc, Some(mgr.clone()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(launch) => {
|
||||
mgr.register_pipeline_commands(workspace, launch.pipeline, launch.command_pids.clone()).await;
|
||||
mgr.register_pipeline_supervisor(launch.pipeline, workspace, spec, tap).await;
|
||||
info!(label = %launch.pipeline, "live pipeline relaunched from snapshot");
|
||||
}
|
||||
Err(e) => warn!(?e, "live pipeline relaunch failed"),
|
||||
}
|
||||
}
|
||||
|
||||
// Save-on-shutdown via SIGTERM/SIGINT handler. tokio::signal soporta
|
||||
// ambos en Linux.
|
||||
// Shutdown handler: SIGTERM/SIGINT → snapshot → drain (stop_with_grace
|
||||
// de todos los workspaces) → exit. El drain usa grace=1s para dar
|
||||
// chance a los comandos a terminar limpio antes del SIGKILL.
|
||||
{
|
||||
let mgr = mgr.clone();
|
||||
let path = snapshot_path.clone();
|
||||
@@ -80,13 +109,32 @@ async fn main() -> anyhow::Result<()> {
|
||||
.expect("SIGTERM handler");
|
||||
let mut int = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::interrupt())
|
||||
.expect("SIGINT handler");
|
||||
tokio::select! {
|
||||
_ = term.recv() => info!("SIGTERM — saving snapshot"),
|
||||
_ = int.recv() => info!("SIGINT — saving snapshot"),
|
||||
}
|
||||
let sig_name = tokio::select! {
|
||||
_ = term.recv() => "SIGTERM",
|
||||
_ = int.recv() => "SIGINT",
|
||||
};
|
||||
info!(signal = sig_name, "shipote-daemon shutdown: draining workspaces");
|
||||
|
||||
// 1) Snapshot ANTES del drain — preserva intención declarada
|
||||
// (los workspace specs siguen vivos en el snapshot aunque
|
||||
// matemos los procesos hijos).
|
||||
if let Err(e) = mgr.save_snapshot(&path).await {
|
||||
warn!(?e, "save_snapshot falló");
|
||||
}
|
||||
|
||||
// 2) Drain: stop_with_grace de todos los workspaces vivos.
|
||||
// Grace 1s da chance a apps Type=notify de hacer cleanup.
|
||||
let workspaces = mgr.list().await;
|
||||
let n = workspaces.len();
|
||||
for ws in workspaces {
|
||||
if let Err(e) = mgr
|
||||
.stop_with_grace(ws.id, std::time::Duration::from_millis(1000))
|
||||
.await
|
||||
{
|
||||
warn!(?e, %ws.id, "stop_with_grace falló en drain");
|
||||
}
|
||||
}
|
||||
info!(drained = n, "drain complete");
|
||||
std::process::exit(0);
|
||||
});
|
||||
}
|
||||
@@ -417,6 +465,52 @@ async fn dispatch(
|
||||
},
|
||||
},
|
||||
|
||||
Request::WorkspaceFullSummary { workspace } => {
|
||||
let stats = match mgr.workspace_stats(workspace).await {
|
||||
Some(s) => WorkspaceStatsInfo {
|
||||
commands_alive: s.commands_alive,
|
||||
commands_total: s.commands_total,
|
||||
rss_bytes: s.rss_bytes,
|
||||
rss_peak_bytes: s.rss_peak_bytes,
|
||||
cpu_usec: s.cpu_usec,
|
||||
cpu_percent: s.cpu_percent,
|
||||
cpu_cores: s.cpu_cores,
|
||||
source: s.source,
|
||||
uptime_ms: s.uptime_ms,
|
||||
},
|
||||
None => return Response::Error { message: format!("workspace {workspace} not found") },
|
||||
};
|
||||
let quota = match mgr.workspace_quota(workspace).await {
|
||||
Some(q) => QuotaReportInfo {
|
||||
mem_limit: q.mem_limit,
|
||||
nproc_limit: q.nproc_limit,
|
||||
breaches: q.breaches,
|
||||
},
|
||||
None => QuotaReportInfo { mem_limit: None, nproc_limit: None, breaches: Vec::new() },
|
||||
};
|
||||
let commands = mgr
|
||||
.list_commands(workspace)
|
||||
.await
|
||||
.into_iter()
|
||||
.map(|c| ProtoCommandInfo {
|
||||
id: c.id,
|
||||
label: c.label,
|
||||
pid: c.pid,
|
||||
alive: c.alive,
|
||||
exit_status: c.exit_status,
|
||||
log_bytes: c.log_bytes,
|
||||
})
|
||||
.collect();
|
||||
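// Flow sockets de todos los pipelines. NOTE(review): acá no se filtra por workspace (el primer elemento de cada tupla se descarta) — confirmar si debería limitarse a este workspace.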
|
||||
let flow_sockets = mgr
|
||||
.list_flow_pipelines()
|
||||
.await
|
||||
.into_iter()
|
||||
.flat_map(|(_, sockets)| sockets)
|
||||
.collect();
|
||||
Response::WorkspaceFullSummary { stats, quota, commands, flow_sockets }
|
||||
}
|
||||
|
||||
Request::WorkspaceQuota { workspace } => match mgr.workspace_quota(workspace).await {
|
||||
Some(q) => Response::WorkspaceQuota {
|
||||
info: QuotaReportInfo {
|
||||
|
||||
@@ -174,42 +174,25 @@ fn probe_blocking(path: &std::path::Path) -> Result<Snapshot, String> {
|
||||
other => return Err(format!("unexpected list resp: {other:?}")),
|
||||
};
|
||||
|
||||
// Commands por workspace.
|
||||
// Batched: stats+quota+commands+flow_sockets en 1 roundtrip por ws.
|
||||
let mut commands_map = std::collections::BTreeMap::new();
|
||||
let mut fresh_stats = std::collections::BTreeMap::new();
|
||||
let mut quotas = std::collections::BTreeMap::new();
|
||||
for w in &workspaces {
|
||||
write_frame(&mut stream, &Request::CommandList { workspace: w.id })
|
||||
write_frame(&mut stream, &Request::WorkspaceFullSummary { workspace: w.id })
|
||||
.await
|
||||
.map_err(|e| format!("write commands: {e}"))?;
|
||||
.map_err(|e| format!("write summary: {e}"))?;
|
||||
let resp: Response = read_frame(&mut stream)
|
||||
.await
|
||||
.map_err(|e| format!("read commands: {e}"))?;
|
||||
if let Response::CommandList { items } = resp {
|
||||
if !items.is_empty() {
|
||||
commands_map.insert(w.id.to_string(), items);
|
||||
.map_err(|e| format!("read summary: {e}"))?;
|
||||
if let Response::WorkspaceFullSummary { stats, quota, commands, .. } = resp {
|
||||
let key = w.id.to_string();
|
||||
fresh_stats.insert(key.clone(), stats);
|
||||
quotas.insert(key.clone(), quota);
|
||||
if !commands.is_empty() {
|
||||
commands_map.insert(key, commands);
|
||||
}
|
||||
}
|
||||
// Stats por workspace.
|
||||
write_frame(&mut stream, &Request::WorkspaceStats { workspace: w.id })
|
||||
.await
|
||||
.map_err(|e| format!("write stats: {e}"))?;
|
||||
let resp: Response = read_frame(&mut stream)
|
||||
.await
|
||||
.map_err(|e| format!("read stats: {e}"))?;
|
||||
if let Response::WorkspaceStats { info } = resp {
|
||||
fresh_stats.insert(w.id.to_string(), info);
|
||||
}
|
||||
// Quota por workspace.
|
||||
write_frame(&mut stream, &Request::WorkspaceQuota { workspace: w.id })
|
||||
.await
|
||||
.map_err(|e| format!("write quota: {e}"))?;
|
||||
let resp: Response = read_frame(&mut stream)
|
||||
.await
|
||||
.map_err(|e| format!("read quota: {e}"))?;
|
||||
if let Response::WorkspaceQuota { info } = resp {
|
||||
quotas.insert(w.id.to_string(), info);
|
||||
}
|
||||
}
|
||||
|
||||
// Saved pipelines.
|
||||
|
||||
Reference in New Issue
Block a user