feat(shipote): throughput card + rate-limit + snapshot incremental (fase Q)

- shipote-shell: la card de Flow channels se extiende con bytes_total + bytes/s por socket. Un helper de lookup resuelve el throughput de cada socket desde un `for` simple, evitando los problemas de borrow que daba el closure de flat_map.
- DiscernPolicy.max_bytes_per_sec: el splitter task hace sleep proporcional al tamaño del chunk tras cada broadcast. Pacing simple v1 (todavía no es un token-bucket real: no acumula créditos ni permite ráfagas).
- WorkspaceManager.dirty: AtomicBool. mark_dirty() en las mutaciones que afectan al snapshot. save_snapshot hace skip si está clean y el path existe. restore_snapshot resetea dirty=false (la hidratación no es una mutación).

85 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 26, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 16:20:50 +00:00
parent 3486949d24
commit 18c0344a52
5 changed files with 134 additions and 25 deletions
@@ -6,8 +6,8 @@

 use gpui::{div, prelude::*, px, Context, IntoElement, Render, SharedString, Window};
 use shipote_protocol::{
-    default_socket_path, read_frame, write_frame, CommandInfo, FlowInfo, QuotaReportInfo, Request,
-    Response, WorkspaceStatsInfo, WorkspaceSummary,
+    default_socket_path, read_frame, write_frame, CommandInfo, FlowInfo, FlowThroughputInfo,
+    QuotaReportInfo, Request, Response, WorkspaceStatsInfo, WorkspaceSummary,
 };
 use std::path::PathBuf;
 use std::time::Duration;
@@ -44,6 +44,8 @@ struct Shell {
    commands: std::collections::BTreeMap<String, Vec<CommandInfo>>,
    saved_pipelines: Vec<String>,
    flows: Vec<FlowInfo>,
+    /// Throughput por flow socket (bytes_total + bytes/s).
+    flow_throughput: Vec<FlowThroughputInfo>,
    /// History de RSS por workspace (últimas N samples).
    stats_history: std::collections::BTreeMap<String, std::collections::VecDeque<WorkspaceStatsInfo>>,
    /// Quota report fresco por workspace.
@@ -81,6 +83,7 @@ impl Shell {
                            me.commands = snap.commands;
                            me.saved_pipelines = snap.saved_pipelines;
                            me.flows = snap.flows;
+                            me.flow_throughput = snap.flow_throughput;
                            me.quotas = snap.quotas;
                            // Hidratar history server-side para workspaces
                            // que no tenían history local (primer probe).
@@ -122,6 +125,7 @@ impl Shell {
                            me.commands.clear();
                            me.saved_pipelines.clear();
                            me.flows.clear();
+                            me.flow_throughput.clear();
                            me.quotas.clear();
                            me.caps = None;
                            me.recent_log = None;
@@ -142,6 +146,7 @@ impl Shell {
            commands: std::collections::BTreeMap::new(),
            saved_pipelines: Vec::new(),
            flows: Vec::new(),
+            flow_throughput: Vec::new(),
            stats_history: std::collections::BTreeMap::new(),
            quotas: std::collections::BTreeMap::new(),
            caps: None,
@@ -157,6 +162,7 @@ struct Snapshot {
    commands: std::collections::BTreeMap<String, Vec<CommandInfo>>,
    saved_pipelines: Vec<String>,
    flows: Vec<FlowInfo>,
+    flow_throughput: Vec<FlowThroughputInfo>,
    /// Stats fresco por workspace (id.toString → stats).
    fresh_stats: std::collections::BTreeMap<String, WorkspaceStatsInfo>,
    /// Quota report fresco por workspace.
@@ -254,6 +260,17 @@ fn probe_blocking(path: &std::path::Path) -> Result<Snapshot, String> {
            Response::FlowList { items } => items,
            _ => Vec::new(),
        };
+        // Throughput per-socket.
+        write_frame(&mut stream, &Request::FlowThroughput)
+            .await
+            .map_err(|e| format!("write throughput: {e}"))?;
+        let resp: Response = read_frame(&mut stream)
+            .await
+            .map_err(|e| format!("read throughput: {e}"))?;
+        let flow_throughput = match resp {
+            Response::FlowThroughput { items } => items,
+            _ => Vec::new(),
+        };

        // Live tail: log del comando más reciente con bytes>0.
        let recent_log = {
@@ -330,6 +347,7 @@ fn probe_blocking(path: &std::path::Path) -> Result<Snapshot, String> {
            commands: commands_map,
            saved_pipelines,
            flows,
+            flow_throughput,
            fresh_stats,
            quotas,
            hydrate_history,
@@ -509,31 +527,38 @@ impl Render for Shell {
            "ws_suffix · recurso · uso > limit".to_string()
        };

-        // Flow channels (data plane).
+        // Flow channels (data plane) con throughput.
        let flow_count: usize = self.flows.iter().map(|f| f.sockets.len()).sum();
-        let flow_items: Vec<String> = self
-            .flows
-            .iter()
-            .flat_map(|f| {
-                let pipe = f.pipeline.to_string();
-                let short = &pipe[pipe.len() - 6..];
-                f.sockets
-                    .iter()
-                    .map(move |s| {
-                        format!(
-                            "{short}  {}",
-                            s.file_name()
-                                .map(|n| n.to_string_lossy().to_string())
-                                .unwrap_or_else(|| s.display().to_string())
-                        )
-                    })
-                    .collect::<Vec<_>>()
-            })
-            .collect();
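+        // Lookup helper: linear scan of `self.flow_throughput` for the entry
+        // whose socket matches, returning (total KiB, KiB/s) or (0.0, 0.0)
+        // when there is none. The closure borrows `self` immutably and is
+        // called from the plain `for` loop below instead of from inside a
+        // `flat_map` closure.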
+        let find_tp = |s: &std::path::PathBuf| -> (f64, f64) {
+            for t in &self.flow_throughput {
+                if t.socket == *s {
+                    return (t.bytes_total as f64 / 1024.0, t.bytes_per_sec / 1024.0);
+                }
+            }
+            (0.0, 0.0)
+        };
+        let mut flow_items: Vec<String> = Vec::new();
+        for f in &self.flows {
+            let pipe = f.pipeline.to_string();
+            let short_pipe = &pipe[pipe.len() - 6..];
+            for s in &f.sockets {
+                let name = s
+                    .file_name()
+                    .map(|n| n.to_string_lossy().to_string())
+                    .unwrap_or_else(|| s.display().to_string());
+                let (total_kib, rate_kib) = find_tp(s);
+                flow_items.push(format!(
+                    "{short_pipe}  {:<48}  {:>7.1} KiB  {:>6.2} KiB/s",
+                    name, total_kib, rate_kib
+                ));
+            }
+        }
        let flow_descr = if flow_count == 0 {
            "pipelines con --tap exponen sockets aquí".to_string()
        } else {
-            "shipote flow tail <socket> para suscribirse".to_string()
+            "pipe6 · socket · total · rate".to_string()
        };

        let body = div()
@@ -274,6 +274,11 @@ pub struct DiscernPolicy {
    /// productores con chunks de tamaño variable.
    #[serde(default)]
    pub replay_bytes: usize,
+    /// Rate-limit del flow channel (bytes/s). `0` = sin límite. Si está
+    /// definido, el splitter sleeps proporcional al tamaño del chunk
+    /// antes de re-broadcastear. Protege subscribers lentos.
+    #[serde(default)]
+    pub max_bytes_per_sec: u64,
 }

 impl Default for DiscernPolicy {
@@ -283,6 +288,7 @@ impl Default for DiscernPolicy {
            enrich_producer: default_true(),
            replay_chunks: default_replay_chunks(),
            replay_bytes: 0,
+            max_bytes_per_sec: 0,
        }
    }
 }
@@ -87,6 +87,10 @@ pub enum LogStream {
 pub struct WorkspaceManager {
    inner: Arc<Mutex<Inner>>,
    incarnator: Arc<Incarnator>,
+    /// `true` when some mutation has happened since the last `save_snapshot`.
+    /// `save_snapshot` skips the write while this is `false` and the snapshot
+    /// file already exists (incremental snapshot: avoids re-serializing when
+    /// nothing changed, e.g. a SIGTERM after an idle period).
+    dirty: std::sync::atomic::AtomicBool,
 }

 struct Inner {
@@ -238,9 +242,23 @@ impl WorkspaceManager {
                pending_pipeline_restarts: Vec::new(),
            })),
            incarnator: Arc::new(Incarnator::new(cfg)),
+            dirty: std::sync::atomic::AtomicBool::new(false),
        }
    }

+    /// Flags the manager as dirty. Every mutation that affects the
+    /// snapshot is expected to call this.
+    #[inline]
+    fn mark_dirty(&self) {
+        use std::sync::atomic::Ordering;
+
+        self.dirty.store(true, Ordering::Relaxed);
+    }
+
+    /// Reports whether anything changed since the last `save_snapshot`.
+    /// Handy for cooperative checks (e.g. monitoring that polls every N).
+    pub fn is_dirty(&self) -> bool {
+        use std::sync::atomic::Ordering;
+
+        let dirty_now = self.dirty.load(Ordering::Relaxed);
+        dirty_now
+    }
+
    /// Registra un supervisor para un pipeline con `restart_on_failure=true`.
    /// El daemon llama esto tras `run_pipeline` para que `reap_dead` agregue
    /// el pipeline a la cola de restart cuando algún command falle.
@@ -267,6 +285,8 @@ impl WorkspaceManager {
                current_backoff_ms: initial_backoff,
            },
        );
+        drop(g);
+        self.mark_dirty();
    }

    /// Variante que preserva backoff/count del supervisor anterior (para
@@ -480,6 +500,7 @@ impl WorkspaceManager {
    /// Stores (or replaces) a `PipelineSpec` under `name`, then flags the
    /// manager as dirty.
    pub async fn save_pipeline(&self, name: String, spec: PipelineSpec) {
        self.inner.lock().await.saved_pipelines.insert(name, spec);
+        // The lock guard above is a temporary, so it is already released
+        // when the dirty flag is set.
+        self.mark_dirty();
    }

    /// Devuelve los nombres de los pipelines guardados.
@@ -497,7 +518,11 @@ impl WorkspaceManager {

    /// Removes a saved pipeline. Returns `true` if an entry named `name`
    /// existed; the manager is flagged dirty only in that case.
    pub async fn drop_saved_pipeline(&self, name: &str) -> bool {
-        self.inner.lock().await.saved_pipelines.remove(name).is_some()
+        let existed = self.inner.lock().await.saved_pipelines.remove(name).is_some();
+        // Removing a name that was not there leaves the snapshot unchanged.
+        if existed {
+            self.mark_dirty();
+        }
+        existed
    }

    /// Label del workspace, si existe.
@@ -648,6 +673,7 @@ impl WorkspaceManager {
            stats_history: std::collections::VecDeque::with_capacity(STATS_HISTORY_CAP),
        };
        self.inner.lock().await.workspaces.insert(id, state);
+        self.mark_dirty();
        info!(%id, ?ttl, "workspace created");

        // Si tiene TTL, programar auto-stop. El task captura un weak ref
@@ -698,6 +724,7 @@ impl WorkspaceManager {
        // También limpiamos flow_channels del workspace si los hubiera —
        // por workspace lo retenemos por pipeline, no por workspace.
        drop(g);
+        self.mark_dirty();

        // 1) SIGTERM (o SIGKILL si grace=0) a todos vivos.
        let initial_signal = if grace.is_zero() { Signal::SIGKILL } else { Signal::SIGTERM };
@@ -181,10 +181,18 @@ impl WorkspaceManager {
        }
    }

-    /// Escribe snapshot a disco.
+    /// Writes the snapshot to disk, skipping the write when nothing changed.
+    ///
+    /// The dirty flag is claimed (atomically reset to `false`) *before* the
+    /// state is captured. A mutation that lands while the snapshot is being
+    /// captured or written therefore re-marks the manager dirty and is
+    /// picked up by the next save, instead of being wiped out by a late
+    /// "clear dirty" store.
+    ///
+    /// If the flag was already clean **and** `path` exists (a previous
+    /// snapshot is on disk), the write is skipped.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from writing the snapshot. In that case, and
+    /// also when the returned future is dropped before the write finishes,
+    /// the manager is flagged dirty again so a later call retries.
    pub async fn save_snapshot(&self, path: &Path) -> anyhow::Result<()> {
+        /// Re-marks the flag on drop unless it has been disarmed.
+        struct Redirty<'a> {
+            flag: &'a std::sync::atomic::AtomicBool,
+            armed: bool,
+        }
+        impl Drop for Redirty<'_> {
+            fn drop(&mut self) {
+                if self.armed {
+                    self.flag
+                        .store(true, std::sync::atomic::Ordering::Release);
+                }
+            }
+        }
+
+        let was_dirty = self
+            .dirty
+            .swap(false, std::sync::atomic::Ordering::AcqRel);
+        if !was_dirty && path.exists() {
+            info!(path = %path.display(), "snapshot SKIPPED (clean)");
+            return Ok(());
+        }
+        // Until the write succeeds the disk may not hold the current state:
+        // any early exit (error or cancellation) must leave the flag set.
+        let mut redirty = Redirty {
+            flag: &self.dirty,
+            armed: true,
+        };
        let snap = self.snapshot().await;
        snap.write(path)?;
+        // Written successfully: keep the flag as it is now (clean, unless a
+        // concurrent mutation re-marked it in the meantime).
+        redirty.armed = false;
        info!(path = %path.display(), workspaces = snap.workspaces.len(), "snapshot saved");
        Ok(())
    }
@@ -245,6 +253,11 @@ impl WorkspaceManager {
            out.saved_pipelines_restored += 1;
        }
        out.live_pipelines = snap.live_pipelines;
+        // Restore no cuenta como mutación — lo que está en disco es lo
+        // que acabamos de cargar. Sin esto, el próximo SIGTERM siempre
+        // re-escribiría aunque no hubiese cambios reales.
+        self.dirty
+            .store(false, std::sync::atomic::Ordering::Relaxed);
        info!(
            workspaces = out.workspaces_restored,
            saved_pipelines = out.saved_pipelines_restored,
@@ -304,6 +317,24 @@ mod tests {
        assert!(restored_ids.contains(&id2));
    }

+    #[tokio::test]
+    async fn save_snapshot_skips_when_clean() {
+        let scratch = tempfile::tempdir().unwrap();
+        let snapshot_path = scratch.path().join("state.json");
+        let manager = Arc::new(WorkspaceManager::new(IncarnatorConfig::default()));
+
+        // Creating a workspace is a mutation, so the flag must be set.
+        let _ = manager.create(sample_ws("dirty-test")).await.unwrap();
+        assert!(manager.is_dirty(), "create debería marcar dirty");
+
+        // The first save writes the file and clears the flag.
+        manager.save_snapshot(&snapshot_path).await.unwrap();
+        assert!(!manager.is_dirty(), "save_snapshot debería limpiar dirty");
+        let first_mtime = std::fs::metadata(&snapshot_path)
+            .unwrap()
+            .modified()
+            .unwrap();
+
+        // Wait a little so a rewrite would produce a different mtime.
+        tokio::time::sleep(std::time::Duration::from_millis(20)).await;
+
+        // Second save with no mutation in between: it must be skipped.
+        manager.save_snapshot(&snapshot_path).await.unwrap();
+        let second_mtime = std::fs::metadata(&snapshot_path)
+            .unwrap()
+            .modified()
+            .unwrap();
+        assert_eq!(first_mtime, second_mtime, "skip cuando clean — mtime no cambia");
+    }
+
    #[tokio::test]
    async fn snapshot_includes_saved_pipelines() {
        use shipote_card::{CommandRef, DiscernPolicy, PipelineSpec};
@@ -132,6 +132,7 @@ pub async fn run_pipeline(
            edges: edge_meta,
            tap,
            sample_bytes: spec.discern.sample_bytes,
+            max_bytes_per_sec: spec.discern.max_bytes_per_sec,
        });
    }

@@ -308,6 +309,9 @@ struct SplitterSpec {
    edges: Vec<EdgeMeta>,
    tap: bool,
    sample_bytes: usize,
+    /// Rate-limit en bytes/s (0 = sin limit). Tras cada chunk de `n`
+    /// bytes, splitter sleeps `n / max_bytes_per_sec` segundos.
+    max_bytes_per_sec: u64,
 }

 struct SplitterHandle {
@@ -430,6 +434,7 @@ fn spawn_splitter(
            }
            broadcast_chunk(&writers, &edge_senders, &buf[..n]).await;
            total += n as u64;
+            rate_limit_sleep(spec.max_bytes_per_sec, n).await;
        }

        let d = if spec.tap {
@@ -448,6 +453,7 @@ fn spawn_splitter(
            if n == 0 { break; }
            broadcast_chunk(&writers, &edge_senders, &buf[..n]).await;
            total += n as u64;
+            rate_limit_sleep(spec.max_bytes_per_sec, n).await;
        }
        debug!(bytes = total, consumers = writers.len(), "splitter finished");

@@ -469,6 +475,19 @@ fn spawn_splitter(
    SplitterHandle { handle }
 }

+/// Paces the splitter after a chunk has been broadcast.
+///
+/// When `max_bps > 0`, sleeps for `chunk_bytes / max_bps` seconds so the
+/// long-run throughput stays at or below `max_bps` bytes per second;
+/// `max_bps == 0` means "no limit" and returns immediately. This is
+/// per-chunk pacing, not a real token bucket: no credit is accumulated
+/// across chunks and bursts are not allowed.
+///
+/// The delay is computed in integer nanoseconds. Truncating to whole
+/// milliseconds would turn every sub-millisecond delay into zero, which
+/// silently disables the limit whenever chunks are small relative to
+/// `max_bps` (e.g. 4 KiB chunks at 10 MiB/s).
+async fn rate_limit_sleep(max_bps: u64, chunk_bytes: usize) {
+    if max_bps == 0 {
+        return;
+    }
+    // usize -> u128 is lossless and the product cannot overflow u128.
+    let nanos = (chunk_bytes as u128 * 1_000_000_000) / u128::from(max_bps);
+    // Saturate instead of wrapping for absurdly large chunk/rate ratios.
+    let nanos = nanos.min(u128::from(u64::MAX)) as u64;
+    if nanos > 0 {
+        // NOTE(review): tokio timers are documented as millisecond-granular,
+        // so very short delays may last longer than requested; that errs on
+        // the side of staying under the cap. Confirm this is acceptable.
+        tokio::time::sleep(std::time::Duration::from_nanos(nanos)).await;
+    }
+}
+
 async fn broadcast_chunk(
    writers: &[AsyncFd<std::os::fd::OwnedFd>],
    edge_senders: &[Option<crate::flow_channel::FlowSender>],
@@ -721,6 +740,7 @@ mod tests {
                enrich_producer: true,
                replay_chunks: 32,
                replay_bytes: 0,
+                max_bytes_per_sec: 0,
            },
            restart_on_failure: false,
            restart_backoff_ms: 200,