feat(shipote): drain shutdown + persist live pipelines + batched query (fase N)
- Daemon SIGTERM/SIGINT: snapshot ANTES, stop_with_grace(1s) de todos los workspaces DESPUÉS. Grace permite app-level cleanup.
- Snapshot v3 con live_pipelines: pipeline_supervisors se persisten; daemon relanza al restore con sus recursos (Incarnator+DiscernPipeline). RestoreOutcome separado para que core no necesite incarnator. Forward-compat v1/v2 via #[serde(default)].
- WorkspaceFullSummary: stats+quota+commands+flow_sockets en 1 roundtrip. Shell reduce N×4 requests/probe a N×1 + 4 globales.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -10,8 +10,10 @@ use shipote_card::{PipelineSpec, WorkspaceId, WorkspaceSpec};
|
||||
use std::path::{Path, PathBuf};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// v2 added `saved_pipelines`. v1 reads the missing field as empty.
|
||||
// NOTE(review): this listing is a flattened diff — the `= 2` constant below
// appears to be the pre-change line and the `= 3` constant further down the
// post-change line; confirm that only one of them exists in the real file.
pub const SNAPSHOT_VERSION: u16 = 2;
|
||||
/// v2 added `saved_pipelines`. v3 adds `live_pipelines` (pipelines
|
||||
/// with a live supervisor at snapshot time — the daemon relaunches them
|
||||
/// on restore). Lower versions read missing fields as empty.
|
||||
pub const SNAPSHOT_VERSION: u16 = 3;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ShipoteSnapshot {
|
||||
@@ -20,6 +22,10 @@ pub struct ShipoteSnapshot {
|
||||
pub workspaces: Vec<WorkspaceEntry>,
|
||||
#[serde(default)]
|
||||
pub saved_pipelines: Vec<PipelineEntry>,
|
||||
/// Pipelines vivos con supervisor (`restart_on_failure=true`) al
|
||||
/// momento del snapshot. El daemon los relanza al restore.
|
||||
#[serde(default)]
|
||||
pub live_pipelines: Vec<LivePipelineEntry>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -34,6 +40,13 @@ pub struct PipelineEntry {
|
||||
pub spec: PipelineSpec,
|
||||
}
|
||||
|
||||
/// Snapshot record of one pipeline that had a supervisor when the snapshot
/// was taken.
///
/// One entry is built per value in `pipeline_supervisors`, copying the
/// supervisor's `workspace`, `spec` and `tap` fields. The entries are stored
/// in `ShipoteSnapshot::live_pipelines`.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LivePipelineEntry {
|
||||
/// Id of the workspace the supervisor was attached to.
pub workspace: WorkspaceId,
|
||||
/// Pipeline specification, cloned from the supervisor.
pub spec: PipelineSpec,
|
||||
/// Copied verbatim from the supervisor's `tap` flag.
// NOTE(review): the meaning of `tap` is not visible in this hunk — confirm
// against the supervisor definition before documenting it further.
pub tap: bool,
|
||||
}
|
||||
|
||||
impl ShipoteSnapshot {
|
||||
pub fn write(&self, path: &Path) -> anyhow::Result<()> {
|
||||
let bytes = serde_json::to_vec_pretty(self)?;
|
||||
@@ -102,11 +115,24 @@ impl WorkspaceManager {
|
||||
spec: spec.clone(),
|
||||
})
|
||||
.collect();
|
||||
// Pipelines vivos con supervisor — preserva la intención. Los
|
||||
// pids/sockets/discernments son ephemeral y se regeneran al
|
||||
// restore (relaunch desde cero).
|
||||
let live_pipelines = g
|
||||
.pipeline_supervisors
|
||||
.values()
|
||||
.map(|sup| LivePipelineEntry {
|
||||
workspace: sup.workspace,
|
||||
spec: sup.spec.clone(),
|
||||
tap: sup.tap,
|
||||
})
|
||||
.collect();
|
||||
ShipoteSnapshot {
|
||||
version: SNAPSHOT_VERSION,
|
||||
timestamp_ms: now_ms(),
|
||||
workspaces,
|
||||
saved_pipelines,
|
||||
live_pipelines,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,34 +144,57 @@ impl WorkspaceManager {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Loads the snapshot from disk and restores the Workspaces.
|
||||
/// Loads the snapshot from disk and restores the Workspaces + saved
|
||||
/// pipelines. Returns the `live_pipelines` so that the caller
|
||||
/// (daemon) relaunches them — they cannot be relaunched from here because
|
||||
/// `run_pipeline` needs `Incarnator` + `DiscernPipeline`.
|
||||
/// Non-fatal errors (invalid workspaces) are logged and skipped.
|
||||
// NOTE(review): this listing is a flattened diff. The single-line signature
// returning `anyhow::Result<usize>` (with `return Ok(0)`, `restored` and
// `Ok(restored)`) appears to be the pre-change code; the multi-line signature
// returning `anyhow::Result<RestoreOutcome>` (with `out`) appears to be the
// post-change code. Confirm which lines survive in the real file.
pub async fn restore_snapshot(self: &std::sync::Arc<Self>, path: &Path) -> anyhow::Result<usize> {
|
||||
pub async fn restore_snapshot(
|
||||
self: &std::sync::Arc<Self>,
|
||||
path: &Path,
|
||||
) -> anyhow::Result<RestoreOutcome> {
|
||||
let snap = match ShipoteSnapshot::read(path) {
|
||||
Ok(s) => s,
|
||||
// A failed read is treated as "no snapshot": warn and return an empty
// result as `Ok`, so the caller starts fresh instead of failing.
Err(e) => {
|
||||
warn!(?e, path = %path.display(), "no snapshot — start fresh");
|
||||
return Ok(0);
|
||||
return Ok(RestoreOutcome::default());
|
||||
}
|
||||
};
|
||||
let mut restored = 0usize;
|
||||
let mut out = RestoreOutcome::default();
|
||||
for entry in snap.workspaces {
|
||||
// v2+: the original id is reused so that clients tracking a
|
||||
// workspace_id do not break across a restart.
|
||||
// Clone the label before `entry.spec` is moved into `create_with_id`,
// so it is still available for the warning below.
let label = entry.spec.label.clone();
|
||||
match self.create_with_id(entry.id, entry.spec).await {
|
||||
Ok(_) => restored += 1,
|
||||
Ok(_) => out.workspaces_restored += 1,
|
||||
// A workspace that fails to be created is logged and skipped; the
// remaining entries are still processed.
Err(e) => warn!(?e, %label, "skipped workspace en restore"),
|
||||
}
|
||||
}
|
||||
// Re-register every saved pipeline under its stored name.
for entry in snap.saved_pipelines {
|
||||
self.save_pipeline(entry.name, entry.spec).await;
|
||||
out.saved_pipelines_restored += 1;
|
||||
}
|
||||
info!(restored, "snapshot restored");
|
||||
Ok(restored)
|
||||
// The live pipelines are handed back as read from the snapshot; nothing
// is relaunched inside this function.
out.live_pipelines = snap.live_pipelines;
|
||||
info!(
|
||||
workspaces = out.workspaces_restored,
|
||||
saved_pipelines = out.saved_pipelines_restored,
|
||||
live_pipelines = out.live_pipelines.len(),
|
||||
"snapshot restored"
|
||||
);
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
|
||||
/// What the caller of the restore gets back. The `live_pipelines` require
|
||||
/// `Incarnator + DiscernPipeline` to be relaunched → the caller
|
||||
/// processes them (typically the daemon).
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RestoreOutcome {
|
||||
/// Number of snapshot workspaces for which `create_with_id` succeeded;
/// entries that failed are logged and not counted.
pub workspaces_restored: usize,
|
||||
/// Number of saved-pipeline entries passed to `save_pipeline`.
pub saved_pipelines_restored: usize,
|
||||
/// The snapshot's `live_pipelines`, moved here unchanged for the caller
/// to relaunch.
pub live_pipelines: Vec<LivePipelineEntry>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -177,8 +226,8 @@ mod tests {
|
||||
mgr1.save_snapshot(&path).await.unwrap();
|
||||
|
||||
let mgr2 = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
|
||||
let n = mgr2.restore_snapshot(&path).await.unwrap();
|
||||
assert_eq!(n, 2);
|
||||
let out = mgr2.restore_snapshot(&path).await.unwrap();
|
||||
assert_eq!(out.workspaces_restored, 2);
|
||||
let listed = mgr2.list().await;
|
||||
let restored_ids: std::collections::HashSet<_> = listed.iter().map(|s| s.id).collect();
|
||||
assert!(restored_ids.contains(&id1));
|
||||
|
||||
Reference in New Issue
Block a user