feat(shipote): pipeline backoff + quota card + logs follow (fase M)

- PipelineSpec.restart_backoff_ms + restart_max_backoff_ms + restart_max: backoff exponencial entre relaunches (anti-thrash). take_pending_restarts aplica restart_max (0 = infinito); si se excede, el supervisor se descarta con un warning. El daemon hace tokio::sleep(backoff) antes del relaunch y escala current_backoff x2 hasta el cap.
- shipote-shell card "Quota breaches": el probe se extiende con WorkspaceQuota por workspace. Color rojo si hay breaches, verde si no.
- shipote logs --follow: hace poll cada 200ms al daemon e imprime el sufijo nuevo hasta que el comando termine. Sin cambios al protocolo. Best-effort: si el ring rota más rápido que el poll, se pierden bytes.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 10:34:27 +00:00
parent 4c9d1b4c1d
commit c3f9c9e36a
7 changed files with 236 additions and 38 deletions
@@ -104,23 +104,25 @@ async fn main() -> anyhow::Result<()> {
                mgr.reap_dead().await;
                let pending = mgr.take_pending_restarts().await;
                for sup in pending {
+                    let backoff = std::time::Duration::from_millis(sup.current_backoff_ms);
                    info!(
                        label = %sup.spec.label,
                        restart_count = sup.restart_count,
-                        "pipeline restart: relaunching"
+                        backoff_ms = sup.current_backoff_ms,
+                        "pipeline restart: relaunching after backoff"
                    );
+                    // Backoff antes del relaunch — anti-thrash.
+                    tokio::time::sleep(backoff).await;
                    let inc = mgr.incarnator_handle();
                    let disc = std::sync::Arc::new(DiscernPipeline::default_pipeline());
-                    let ws_label = mgr
-                        .workspace_label(sup.spec.workspace)
-                        .await
-                        .unwrap_or_default();
-                    let restart_count = sup.restart_count;
                    let workspace = sup.spec.workspace;
+                    let ws_label = mgr.workspace_label(workspace).await.unwrap_or_default();
                    let tap = sup.tap;
                    let mut new_spec = sup.spec.clone();
-                    // Mantener restart_on_failure para futuras fallas.
                    new_spec.restart_on_failure = true;
+                    // Escalar el backoff para la PRÓXIMA falla.
+                    let next_backoff = (sup.current_backoff_ms * 2)
+                        .min(new_spec.restart_max_backoff_ms);
                    match shipote_core::pipeline::run_pipeline(
                        &new_spec,
                        &ws_label,
@@ -132,19 +134,23 @@ async fn main() -> anyhow::Result<()> {
                    .await
                    {
                        Ok(launch) => {
-                            mgr.register_pipeline_commands(workspace, launch.pipeline, launch.command_pids.clone())
-                                .await;
-                            // Re-registrar supervisor con el nuevo pipeline_id,
-                            // preservando restart_count.
-                            let mut s = shipote_core::PipelineSupervisor {
+                            mgr.register_pipeline_commands(
                                workspace,
-                                spec: new_spec,
+                                launch.pipeline,
+                                launch.command_pids.clone(),
+                            )
+                            .await;
+                            // Re-registrar supervisor con backoff escalado +
+                            // restart_count preservado.
+                            mgr.register_pipeline_supervisor_with_state(
+                                launch.pipeline,
+                                workspace,
+                                new_spec,
                                tap,
-                                restart_count,
-                            };
-                            s.restart_count = restart_count;
-                            mgr.register_pipeline_supervisor(launch.pipeline, workspace, s.spec.clone(), tap)
-                                .await;
+                                sup.restart_count,
+                                next_backoff,
+                            )
+                            .await;
                        }
                        Err(e) => {
                            warn!(?e, "pipeline restart failed");