feat(shipote): pipeline backoff + quota card + logs follow (fase M)

- PipelineSpec.restart_backoff_ms + restart_max_backoff_ms + restart_max:
  backoff exponencial entre relaunches (anti-thrash). take_pending_restarts
  aplica restart_max (0 = infinito); excedido = supervisor descartado con
  warning. Daemon hace tokio::sleep(backoff) antes del relaunch y escala
  current_backoff x2 hasta el cap.
- shipote-shell card "Quota breaches": el probe se extiende con WorkspaceQuota
  por workspace. Color rojo si hay breaches, verde si no.
- shipote logs --follow: poll cada 200ms al daemon, imprime suffix nuevo
  hasta que el comando termine. Sin cambios al protocolo. Best-effort:
  si el ring rota más rápido que el poll, se pierden bytes.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 10:34:27 +00:00
parent 4c9d1b4c1d
commit c3f9c9e36a
7 changed files with 236 additions and 38 deletions
@@ -114,6 +114,8 @@ pub struct PipelineSupervisor {
pub spec: PipelineSpec,
pub tap: bool,
pub restart_count: u32,
/// Backoff actual (ms) — escala exponencialmente con cada restart.
pub current_backoff_ms: u64,
}
#[derive(Debug, Clone)]
@@ -248,6 +250,7 @@ impl WorkspaceManager {
}
tracing::debug!(%pipeline_id, label = %spec.label, "pipeline supervisor registered");
let mut g = self.inner.lock().await;
let initial_backoff = spec.restart_backoff_ms.max(50);
g.pipeline_supervisors.insert(
pipeline_id,
PipelineSupervisor {
@@ -255,18 +258,60 @@ impl WorkspaceManager {
spec,
tap,
restart_count: 0,
current_backoff_ms: initial_backoff,
},
);
}
/// Registers a pipeline supervisor while carrying over existing throttle
/// state, so a re-registration after a restart keeps what was accumulated.
///
/// A fresh registration starts at `restart_count: 0` with a floored initial
/// backoff; here the caller supplies both `restart_count` and
/// `current_backoff_ms`, and they are stored exactly as given (no minimum
/// is applied in this method).
///
/// Does nothing when `spec.restart_on_failure` is false.
pub async fn register_pipeline_supervisor_with_state(
    &self,
    pipeline_id: Ulid,
    workspace: WorkspaceId,
    spec: PipelineSpec,
    tap: bool,
    restart_count: u32,
    current_backoff_ms: u64,
) {
    if spec.restart_on_failure {
        // Assemble the entry first so the lock is only held for the insert.
        let supervisor = PipelineSupervisor {
            workspace,
            spec,
            tap,
            restart_count,
            current_backoff_ms,
        };
        let mut state = self.inner.lock().await;
        state.pipeline_supervisors.insert(pipeline_id, supervisor);
    }
}
/// Drena la cola de pipelines pendientes de restart y retorna las
/// specs a relaunch. El daemon lo llama tras cada `reap_dead`.
///
/// Aplica `restart_max`: si el supervisor ya pasó el límite, no se
/// retorna y el supervisor se elimina (give-up). El backoff
/// preserva el valor actual; el daemon decide cuándo aplicar el
/// sleep antes del relaunch.
pub async fn take_pending_restarts(&self) -> Vec<PipelineSupervisor> {
let mut g = self.inner.lock().await;
let pending = std::mem::take(&mut g.pending_pipeline_restarts);
let mut out = Vec::with_capacity(pending.len());
for old_id in pending {
if let Some(mut sup) = g.pipeline_supervisors.remove(&old_id) {
if sup.spec.restart_max > 0 && sup.restart_count >= sup.spec.restart_max {
tracing::warn!(
label = %sup.spec.label,
restart_count = sup.restart_count,
max = sup.spec.restart_max,
"pipeline restart_max reached — giving up"
);
continue; // no relaunch, supervisor discarded.
}
sup.restart_count += 1;
out.push(sup);
}
@@ -1254,6 +1299,9 @@ mod tests {
edges: vec![],
discern: DiscernPolicy::default(),
restart_on_failure: true,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
let pipeline_id = ulid::Ulid::new();
// Simulamos lo que haría el daemon: registramos un comando como
@@ -210,6 +210,9 @@ mod tests {
edges: vec![],
discern: DiscernPolicy::default(),
restart_on_failure: false,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
mgr1.save_pipeline("daily".into(), spec).await;
mgr1.save_snapshot(&path).await.unwrap();
@@ -595,6 +595,9 @@ mod tests {
}],
discern: DiscernPolicy::default(),
restart_on_failure: false,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
let disc = Arc::new(DiscernPipeline::default_pipeline());
let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -633,6 +636,9 @@ mod tests {
],
discern: DiscernPolicy::default(),
restart_on_failure: false,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
let disc = Arc::new(DiscernPipeline::default_pipeline());
let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -670,6 +676,9 @@ mod tests {
],
discern: DiscernPolicy::default(),
restart_on_failure: false,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
let disc = Arc::new(DiscernPipeline::default_pipeline());
let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -702,6 +711,9 @@ mod tests {
replay_bytes: 0,
},
restart_on_failure: false,
restart_backoff_ms: 200,
restart_max_backoff_ms: 30_000,
restart_max: 0,
};
let disc = Arc::new(DiscernPipeline::default_pipeline());
let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));