feat(shipote): pipeline backoff + quota card + logs follow (fase M)

- PipelineSpec.restart_backoff_ms + restart_max_backoff_ms + restart_max:
  backoff exponencial entre relaunches (anti-thrash). take_pending_restarts
  aplica restart_max (0 = infinito); si se excede, el supervisor se descarta
  con un warning. El daemon hace tokio::time::sleep(backoff) antes del relaunch
  y duplica current_backoff_ms hasta el cap (restart_max_backoff_ms).
- shipote-shell card "Quota breaches": el probe se extiende con un
  WorkspaceQuota por workspace. Color rojo si hay breaches, verde si no.
- shipote logs --follow: hace poll al daemon cada 200ms e imprime el sufijo
  nuevo hasta que el comando termine. Sin cambios al protocolo. Best-effort:
  si el ring rota más rápido que el poll, se pierden bytes.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 10:34:27 +00:00
parent 4c9d1b4c1d
commit c3f9c9e36a
7 changed files with 236 additions and 38 deletions
+24 -18
View File
@@ -104,23 +104,25 @@ async fn main() -> anyhow::Result<()> {
mgr.reap_dead().await;
let pending = mgr.take_pending_restarts().await;
for sup in pending {
let backoff = std::time::Duration::from_millis(sup.current_backoff_ms);
info!(
label = %sup.spec.label,
restart_count = sup.restart_count,
"pipeline restart: relaunching"
backoff_ms = sup.current_backoff_ms,
"pipeline restart: relaunching after backoff"
);
// Backoff antes del relaunch — anti-thrash.
tokio::time::sleep(backoff).await;
let inc = mgr.incarnator_handle();
let disc = std::sync::Arc::new(DiscernPipeline::default_pipeline());
let ws_label = mgr
.workspace_label(sup.spec.workspace)
.await
.unwrap_or_default();
let restart_count = sup.restart_count;
let workspace = sup.spec.workspace;
let ws_label = mgr.workspace_label(workspace).await.unwrap_or_default();
let tap = sup.tap;
let mut new_spec = sup.spec.clone();
// Mantener restart_on_failure para futuras fallas.
new_spec.restart_on_failure = true;
// Escalar el backoff para la PRÓXIMA falla.
let next_backoff = (sup.current_backoff_ms * 2)
.min(new_spec.restart_max_backoff_ms);
match shipote_core::pipeline::run_pipeline(
&new_spec,
&ws_label,
@@ -132,19 +134,23 @@ async fn main() -> anyhow::Result<()> {
.await
{
Ok(launch) => {
mgr.register_pipeline_commands(workspace, launch.pipeline, launch.command_pids.clone())
.await;
// Re-registrar supervisor con el nuevo pipeline_id,
// preservando restart_count.
let mut s = shipote_core::PipelineSupervisor {
mgr.register_pipeline_commands(
workspace,
spec: new_spec,
launch.pipeline,
launch.command_pids.clone(),
)
.await;
// Re-registrar supervisor con backoff escalado +
// restart_count preservado.
mgr.register_pipeline_supervisor_with_state(
launch.pipeline,
workspace,
new_spec,
tap,
restart_count,
};
s.restart_count = restart_count;
mgr.register_pipeline_supervisor(launch.pipeline, workspace, s.spec.clone(), tap)
.await;
sup.restart_count,
next_backoff,
)
.await;
}
Err(e) => {
warn!(?e, "pipeline restart failed");