feat(shipote): pipeline backoff + quota card + logs follow (fase M)

- PipelineSpec.restart_backoff_ms + restart_max_backoff_ms + restart_max: backoff exponencial entre relaunches (anti-thrash). take_pending_restarts aplica restart_max (0 = infinito); si se excede, el supervisor se descarta con un warning. El daemon hace tokio::sleep(backoff) antes del relaunch y escala current_backoff_ms x2 hasta el cap.
- shipote-shell card "Quota breaches": el probe se extiende con WorkspaceQuota por workspace. Color rojo si hay breaches, verde si no.
- shipote logs --follow: poll cada 200ms al daemon; imprime el suffix nuevo hasta que el comando termine. Sin cambios al protocolo. Best-effort: si el ring rota más rápido que el poll, se pierden bytes.

83 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 10:34:27 +00:00
parent 4c9d1b4c1d
commit c3f9c9e36a
7 changed files with 236 additions and 38 deletions
@@ -222,6 +222,25 @@ pub struct PipelineSpec {
    /// Útil para pipelines de procesamiento continuo.
    #[serde(default)]
    pub restart_on_failure: bool,
+    /// Backoff inicial entre restarts (ms). Crece exponencialmente
+    /// hasta `restart_max_backoff_ms`. Default 200ms = ~5 restarts/s
+    /// inicial, escalando rápido.
+    #[serde(default = "default_restart_backoff")]
+    pub restart_backoff_ms: u64,
+    /// Backoff máximo (ms). Default 30s. El backoff no crece más allá.
+    #[serde(default = "default_restart_max_backoff")]
+    pub restart_max_backoff_ms: u64,
+    /// Máximo de restarts antes de dar up. `0` = infinito. Default 0.
+    /// Útil para fail-loud cuando un pipeline siempre falla.
+    #[serde(default)]
+    pub restart_max: u32,
+}
+
+fn default_restart_backoff() -> u64 {
+    200
+}
+fn default_restart_max_backoff() -> u64 {
+    30_000
 }

 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -502,6 +521,9 @@ mod subst_tests {
            edges: vec![],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let out = substitute_vars(&spec, &vars).unwrap();
        assert_eq!(out.label, "p-renamed");
@@ -522,6 +544,9 @@ mod subst_tests {
            edges: vec![],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let out = substitute_vars(&spec, &vars).unwrap();
        assert_eq!(out.label, "p-${UNDEFINED}");
@@ -603,6 +628,9 @@ mod tests {
            }],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        assert!(p.validate().is_err());
    }
@@ -114,6 +114,8 @@ pub struct PipelineSupervisor {
    pub spec: PipelineSpec,
    pub tap: bool,
    pub restart_count: u32,
+    /// Backoff actual (ms) — escala exponencialmente con cada restart.
+    pub current_backoff_ms: u64,
 }

 #[derive(Debug, Clone)]
@@ -248,6 +250,7 @@ impl WorkspaceManager {
        }
        tracing::debug!(%pipeline_id, label = %spec.label, "pipeline supervisor registered");
        let mut g = self.inner.lock().await;
+        let initial_backoff = spec.restart_backoff_ms.max(50);
        g.pipeline_supervisors.insert(
            pipeline_id,
            PipelineSupervisor {
@@ -255,18 +258,60 @@ impl WorkspaceManager {
                spec,
                tap,
                restart_count: 0,
+                current_backoff_ms: initial_backoff,
+            },
+        );
+    }
+
+    /// Variante que preserva backoff/count del supervisor anterior (para
+    /// re-registrar tras un restart sin perder el throttle acumulado).
+    pub async fn register_pipeline_supervisor_with_state(
+        &self,
+        pipeline_id: Ulid,
+        workspace: WorkspaceId,
+        spec: PipelineSpec,
+        tap: bool,
+        restart_count: u32,
+        current_backoff_ms: u64,
+    ) {
+        if !spec.restart_on_failure {
+            return;
+        }
+        let mut g = self.inner.lock().await;
+        g.pipeline_supervisors.insert(
+            pipeline_id,
+            PipelineSupervisor {
+                workspace,
+                spec,
+                tap,
+                restart_count,
+                current_backoff_ms,
            },
        );
    }

    /// Drena la cola de pipelines pendientes de restart y retorna las
    /// specs a relaunch. El daemon lo llama tras cada `reap_dead`.
+    ///
+    /// Aplica `restart_max`: si el supervisor ya pasó el límite, no se
+    /// retorna y el supervisor se elimina (give-up). El backoff
+    /// preserva el valor actual; el daemon decide cuándo aplicar el
+    /// sleep antes del relaunch.
    pub async fn take_pending_restarts(&self) -> Vec<PipelineSupervisor> {
        let mut g = self.inner.lock().await;
        let pending = std::mem::take(&mut g.pending_pipeline_restarts);
        let mut out = Vec::with_capacity(pending.len());
        for old_id in pending {
            if let Some(mut sup) = g.pipeline_supervisors.remove(&old_id) {
+                if sup.spec.restart_max > 0 && sup.restart_count >= sup.spec.restart_max {
+                    tracing::warn!(
+                        label = %sup.spec.label,
+                        restart_count = sup.restart_count,
+                        max = sup.spec.restart_max,
+                        "pipeline restart_max reached — giving up"
+                    );
+                    continue; // no relaunch, supervisor discarded.
+                }
                sup.restart_count += 1;
                out.push(sup);
            }
@@ -1254,6 +1299,9 @@ mod tests {
            edges: vec![],
            discern: DiscernPolicy::default(),
            restart_on_failure: true,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let pipeline_id = ulid::Ulid::new();
        // Simulamos lo que haría el daemon: registramos un comando como
@@ -210,6 +210,9 @@ mod tests {
            edges: vec![],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        mgr1.save_pipeline("daily".into(), spec).await;
        mgr1.save_snapshot(&path).await.unwrap();
@@ -595,6 +595,9 @@ mod tests {
            }],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let disc = Arc::new(DiscernPipeline::default_pipeline());
        let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -633,6 +636,9 @@ mod tests {
            ],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let disc = Arc::new(DiscernPipeline::default_pipeline());
        let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -670,6 +676,9 @@ mod tests {
            ],
            discern: DiscernPolicy::default(),
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let disc = Arc::new(DiscernPipeline::default_pipeline());
        let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));
@@ -702,6 +711,9 @@ mod tests {
                replay_bytes: 0,
            },
            restart_on_failure: false,
+            restart_backoff_ms: 200,
+            restart_max_backoff_ms: 30_000,
+            restart_max: 0,
        };
        let disc = Arc::new(DiscernPipeline::default_pipeline());
        let inc = Arc::new(Incarnator::new(IncarnatorConfig::default()));