feat(shipote): drain shutdown + persist live pipelines + batched query (fase N)

- Daemon SIGTERM/SIGINT: snapshot ANTES, stop_with_grace(1s) de todos
  los workspaces DESPUÉS. Grace permite app-level cleanup.
- Snapshot v3 con live_pipelines: pipeline_supervisors se persisten;
  daemon relanza al restore con sus recursos (Incarnator+DiscernPipeline).
  RestoreOutcome separado para que core no necesite incarnator.
  Forward-compat v1/v2 via #[serde(default)].
- WorkspaceFullSummary: stats+quota+commands+flow_sockets en 1 roundtrip.
  Shell reduce N×4 requests/probe a N×1 + 4 globales.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 10:48:11 +00:00
parent c3f9c9e36a
commit a823c40fe1
4 changed files with 185 additions and 47 deletions
@@ -10,8 +10,10 @@ use shipote_card::{PipelineSpec, WorkspaceId, WorkspaceSpec};
use std::path::{Path, PathBuf};
use tracing::{info, warn};
/// v2 agregó `saved_pipelines`. v1 lee con campo ausente como vacío.
pub const SNAPSHOT_VERSION: u16 = 2;
/// Format version stamped into every snapshot that is written.
///
/// v2 added `saved_pipelines`. v3 adds `live_pipelines` (pipelines that had
/// a live supervisor when the snapshot was taken — the daemon relaunches
/// them on restore). Snapshots written by older versions are read with the
/// missing fields treated as empty.
pub const SNAPSHOT_VERSION: u16 = 3;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShipoteSnapshot {
@@ -20,6 +22,10 @@ pub struct ShipoteSnapshot {
pub workspaces: Vec<WorkspaceEntry>,
#[serde(default)]
pub saved_pipelines: Vec<PipelineEntry>,
/// Pipelines vivos con supervisor (`restart_on_failure=true`) al
/// momento del snapshot. El daemon los relanza al restore.
#[serde(default)]
pub live_pipelines: Vec<LivePipelineEntry>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -34,6 +40,13 @@ pub struct PipelineEntry {
pub spec: PipelineSpec,
}
/// One pipeline that had a live supervisor when the snapshot was taken.
///
/// Entries are built from the manager's `pipeline_supervisors` map. Only the
/// three fields below are persisted; `restore_snapshot` hands the entries
/// back to its caller instead of relaunching them itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LivePipelineEntry {
/// Workspace id copied from the supervisor.
pub workspace: WorkspaceId,
/// Pipeline specification, cloned from the supervisor.
pub spec: PipelineSpec,
/// The supervisor's `tap` flag, stored verbatim.
/// NOTE(review): the exact meaning of `tap` is not visible here — confirm
/// against the supervisor definition.
pub tap: bool,
}
impl ShipoteSnapshot {
pub fn write(&self, path: &Path) -> anyhow::Result<()> {
let bytes = serde_json::to_vec_pretty(self)?;
@@ -102,11 +115,24 @@ impl WorkspaceManager {
spec: spec.clone(),
})
.collect();
// Pipelines vivos con supervisor — preserva la intención. Los
// pids/sockets/discernments son ephemeral y se regeneran al
// restore (relaunch desde cero).
let live_pipelines = g
.pipeline_supervisors
.values()
.map(|sup| LivePipelineEntry {
workspace: sup.workspace,
spec: sup.spec.clone(),
tap: sup.tap,
})
.collect();
ShipoteSnapshot {
version: SNAPSHOT_VERSION,
timestamp_ms: now_ms(),
workspaces,
saved_pipelines,
live_pipelines,
}
}
@@ -118,34 +144,57 @@ impl WorkspaceManager {
Ok(())
}
/// Carga snapshot desde disco y restaura los Workspaces.
/// Carga snapshot desde disco y restaura los Workspaces + saved
/// pipelines. Devuelve los `live_pipelines` para que el caller
/// (daemon) los relance — no podemos relanzarlos desde acá porque
/// `run_pipeline` necesita `Incarnator` + `DiscernPipeline`.
/// Errores no-fatales (workspaces inválidos) se loguean y se saltan.
pub async fn restore_snapshot(self: &std::sync::Arc<Self>, path: &Path) -> anyhow::Result<usize> {
pub async fn restore_snapshot(
self: &std::sync::Arc<Self>,
path: &Path,
) -> anyhow::Result<RestoreOutcome> {
let snap = match ShipoteSnapshot::read(path) {
Ok(s) => s,
Err(e) => {
warn!(?e, path = %path.display(), "no snapshot — start fresh");
return Ok(0);
return Ok(RestoreOutcome::default());
}
};
let mut restored = 0usize;
let mut out = RestoreOutcome::default();
for entry in snap.workspaces {
// v2+: reuse the original id so that clients tracking a
// workspace_id do not break across a restart.
let label = entry.spec.label.clone();
match self.create_with_id(entry.id, entry.spec).await {
Ok(_) => restored += 1,
Ok(_) => out.workspaces_restored += 1,
Err(e) => warn!(?e, %label, "skipped workspace en restore"),
}
}
for entry in snap.saved_pipelines {
self.save_pipeline(entry.name, entry.spec).await;
out.saved_pipelines_restored += 1;
}
info!(restored, "snapshot restored");
Ok(restored)
out.live_pipelines = snap.live_pipelines;
info!(
workspaces = out.workspaces_restored,
saved_pipelines = out.saved_pipelines_restored,
live_pipelines = out.live_pipelines.len(),
"snapshot restored"
);
Ok(out)
}
}
/// What the caller of `restore_snapshot` gets back.
///
/// Relaunching the `live_pipelines` requires an `Incarnator` and a
/// `DiscernPipeline`, so the caller (typically the daemon) processes them.
/// The `Default` value (zero counts, no pipelines) is what is returned when
/// no snapshot could be read.
#[derive(Debug, Default)]
pub struct RestoreOutcome {
/// Number of workspaces recreated successfully; entries that fail are
/// logged and skipped, so they are not counted here.
pub workspaces_restored: usize,
/// Number of saved pipelines re-registered from the snapshot.
pub saved_pipelines_restored: usize,
/// Live-pipeline entries taken from the snapshot, not yet relaunched.
pub live_pipelines: Vec<LivePipelineEntry>,
}
#[cfg(test)]
mod tests {
use super::*;
@@ -177,8 +226,8 @@ mod tests {
mgr1.save_snapshot(&path).await.unwrap();
let mgr2 = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
let n = mgr2.restore_snapshot(&path).await.unwrap();
assert_eq!(n, 2);
let out = mgr2.restore_snapshot(&path).await.unwrap();
assert_eq!(out.workspaces_restored, 2);
let listed = mgr2.list().await;
let restored_ids: std::collections::HashSet<_> = listed.iter().map(|s| s.id).collect();
assert!(restored_ids.contains(&id1));
@@ -110,6 +110,11 @@ pub enum Request {
/// Reporte de quotas (rlimits declarados vs uso actual).
WorkspaceQuota { workspace: shipote_card::WorkspaceId },
/// Resumen completo de un workspace: stats + quota + commands +
/// flow sockets en una sola roundtrip. Reduce N×4 requests del
/// shell a N×1.
WorkspaceFullSummary { workspace: shipote_card::WorkspaceId },
/// Detener selectivamente los comandos de un pipeline (no el workspace
/// entero). `grace_ms`: SIGTERM → wait → SIGKILL.
PipelineStop {
@@ -205,6 +210,13 @@ pub enum Response {
info: QuotaReportInfo,
},
WorkspaceFullSummary {
stats: WorkspaceStatsInfo,
quota: QuotaReportInfo,
commands: Vec<CommandInfo>,
flow_sockets: Vec<PathBuf>,
},
FlowList {
items: Vec<FlowInfo>,
},