feat(shipote): health endpoint + audit log + token-bucket real (fase R)

- Request::Health → Response::Health { version, uptime_ms, alive_*, active_flows, dirty }. CLI: shipote health. - handle_client lee peer_uid una vez al accept. audit_request emite info!(target: "audit", uid, action, detail) por mutación (create/stop/ run/pipeline.*/flow.drop). Reads omitidos. Filtrable con SHIPOTE_LOG= warn,audit=info. - TokenBucket real reemplaza rate_limit_sleep: refill por wall time, capacity = 1s de rate, debt negativo dispara sleep proporcional. Permite burst real, no chunk-by-chunk uniforme. 85 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 26, shipote-discern 5, yahweh-provider-fs 3). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-11 16:58:10 +00:00
parent 18c0344a52
commit a9124449f9
5 changed files with 180 additions and 13 deletions
@@ -24,6 +24,9 @@ enum Cmd {
    /// Health-check del daemon.
    Ping,
    /// Health endpoint estructurado.
    Health,
    /// Capacidades runtime detectadas por el daemon.
    Caps,
@@ -195,6 +198,30 @@ async fn main() -> Result<()> {
            }
        }
        Cmd::Health => {
            let resp = round_trip(&mut stream, Request::Health).await?;
            match resp {
                Response::Health {
                    version,
                    uptime_ms,
                    alive_workspaces,
                    alive_commands,
                    alive_pipelines,
                    active_flows,
                    dirty,
                } => {
                    println!("version:           {version}");
                    println!("uptime:            {} ms", uptime_ms);
                    println!("alive_workspaces:  {alive_workspaces}");
                    println!("alive_commands:    {alive_commands}");
                    println!("alive_pipelines:   {alive_pipelines}");
                    println!("active_flows:      {active_flows}");
                    println!("dirty:             {dirty}");
                }
                other => print_unexpected(&other),
            }
        }
        Cmd::Caps => {
            let resp = round_trip(&mut stream, Request::Capabilities).await?;
            match resp {
@@ -38,6 +38,7 @@ async fn main() -> anyhow::Result<()> {
    }
    let listener = UnixListener::bind(&sock).with_context(|| format!("bind {}", sock.display()))?;
    info!(socket = %sock.display(), "shipote-daemon listening");
    let daemon_started = std::time::Instant::now();
    // Sidecar pool: una sesión global del daemon + N sesiones efímeras
    // por edge enriquecido tras cada pipeline tap.
@@ -241,7 +242,7 @@ async fn main() -> anyhow::Result<()> {
                let disc = discerner.clone();
                let pool = sidecar_pool.clone();
                tokio::spawn(async move {
-                    if let Err(e) = handle_client(stream, mgr, disc, pool).await {
+                    if let Err(e) = handle_client(stream, mgr, disc, pool, daemon_started).await {
                        warn!(?e, "client handler error");
                    }
                });
@@ -280,27 +281,80 @@ async fn handle_client(
    mgr: Arc<WorkspaceManager>,
    disc: Arc<DiscernPipeline>,
    pool: Option<Arc<brahman_sidecar::SidecarPool>>,
    daemon_started: std::time::Instant,
 ) -> anyhow::Result<()> {
    // Audit: peer uid lo leemos una vez aquí (no cambia durante la conexión).
    let peer = peer_uid(&stream).unwrap_or(u32::MAX);
    loop {
        let req: Request = match read_frame(&mut stream).await {
            Ok(r) => r,
            Err(shipote_protocol::ProtocolError::Closed) => return Ok(()),
            Err(e) => return Err(e.into()),
        };
-        let resp = dispatch(&mgr, &disc, &pool, req).await;
+        audit_request(peer, &req);
        let resp = dispatch(&mgr, &disc, &pool, daemon_started, req).await;
        write_frame(&mut stream, &resp).await?;
    }
 }
 /// Loguea cada mutación con target="audit" y el peer uid. Reads (ping,
 /// list, stats) se omiten para no inundar el log.
 fn audit_request(peer_uid: u32, req: &Request) {
    let (action, detail) = match req {
        Request::WorkspaceCreate { spec } => ("workspace.create", format!("label={}", spec.label)),
        Request::WorkspaceStop { id, grace_ms } => ("workspace.stop", format!("id={id} grace_ms={grace_ms}")),
        Request::Run { workspace, exec, restart_on_failure, .. } => (
            "run",
            format!("ws={workspace} exec={exec} restart={restart_on_failure}"),
        ),
        Request::PipelineRun { spec, tap, .. } => ("pipeline.run", format!("label={} tap={tap}", spec.label)),
        Request::PipelineRunSaved { name, tap, .. } => ("pipeline.run-saved", format!("name={name} tap={tap}")),
        Request::PipelineStop { pipeline, grace_ms } => ("pipeline.stop", format!("id={pipeline} grace_ms={grace_ms}")),
        Request::PipelineSave { name, .. } => ("pipeline.save", format!("name={name}")),
        Request::PipelineDrop { name } => ("pipeline.drop", format!("name={name}")),
        Request::FlowDrop { pipeline } => ("flow.drop", format!("pipeline={pipeline}")),
        // Reads (no audit):
        Request::Ping
        | Request::Health
        | Request::WorkspaceList
        | Request::WorkspaceStats { .. }
        | Request::WorkspaceQuota { .. }
        | Request::WorkspaceStatsHistory { .. }
        | Request::WorkspaceFullSummary { .. }
        | Request::CommandList { .. }
        | Request::CommandLogs { .. }
        | Request::PipelineSavedList
        | Request::FlowList
        | Request::FlowThroughput
        | Request::Discern { .. }
        | Request::Capabilities => return,
    };
    info!(target: "audit", uid = peer_uid, action, detail = %detail, "audit");
 }
 async fn dispatch(
    mgr: &Arc<WorkspaceManager>,
    disc: &DiscernPipeline,
    pool: &Option<Arc<brahman_sidecar::SidecarPool>>,
    daemon_started: std::time::Instant,
    req: Request,
 ) -> Response {
    match req {
        Request::Ping => Response::Pong,
        Request::Health => {
            let counts = mgr.health_counts().await;
            Response::Health {
                version: env!("CARGO_PKG_VERSION").to_string(),
                uptime_ms: daemon_started.elapsed().as_millis() as u64,
                alive_workspaces: counts.alive_workspaces,
                alive_commands: counts.alive_commands,
                alive_pipelines: counts.alive_pipelines,
                active_flows: counts.active_flows,
                dirty: mgr.is_dirty(),
            }
        }
        Request::WorkspaceCreate { spec } => match mgr.create(spec).await {
            Ok((id, warnings)) => Response::WorkspaceCreated { id, warnings },
            Err(e) => Response::Error { message: format!("{e}") },
@@ -148,6 +148,14 @@ pub struct CommandSummary {
    pub pid: i32,
 }
 #[derive(Debug, Clone, Default)]
 pub struct HealthCounts {
    pub alive_workspaces: u32,
    pub alive_commands: u32,
    pub alive_pipelines: u32,
    pub active_flows: u32,
 }
 #[derive(Debug, Clone)]
 pub struct CommandInfo {
    pub id: Ulid,
@@ -448,6 +456,25 @@ impl WorkspaceManager {
        self.inner.lock().await.pipeline_flows.insert(pipeline, flows);
    }
    /// Snapshot de counts agregados para health endpoint.
    pub async fn health_counts(&self) -> HealthCounts {
        let g = self.inner.lock().await;
        let alive_workspaces = g.workspaces.len() as u32;
        let alive_commands: u32 = g
            .workspaces
            .values()
            .map(|ws| ws.commands.values().filter(|c| c.alive).count() as u32)
            .sum();
        let alive_pipelines = g.pipeline_supervisors.len() as u32;
        let active_flows: u32 = g.pipeline_flows.values().map(|v| v.len() as u32).sum();
        HealthCounts {
            alive_workspaces,
            alive_commands,
            alive_pipelines,
            active_flows,
        }
    }
    /// Lista pipelines vivos con sus sockets activos.
    pub async fn list_flow_pipelines(&self) -> Vec<(Ulid, Vec<std::path::PathBuf>)> {
        let g = self.inner.lock().await;
@@ -419,6 +419,11 @@ fn spawn_splitter(
        let mut buf = [0u8; 4096];
        let mut total: u64 = 0;
        let mut eof = false;
        let mut bucket = if spec.max_bytes_per_sec > 0 {
            Some(TokenBucket::new(spec.max_bytes_per_sec))
        } else {
            None
        };
        // Fase 1: sampling (sólo si tap=true) + replicación.
        while !eof && (spec.tap && sample.len() < spec.sample_bytes) {
@@ -432,9 +437,16 @@ fn spawn_splitter(
                let take = n.min(spec.sample_bytes - sample.len());
                sample.extend_from_slice(&buf[..take]);
            }
            // Token bucket: reserva ANTES de broadcast — si hay debt,
            // sleep antes de mandar al subscriber.
            if let Some(b) = bucket.as_mut() {
                let wait = b.reserve(n as u64);
                if !wait.is_zero() {
                    tokio::time::sleep(wait).await;
                }
            }
            broadcast_chunk(&writers, &edge_senders, &buf[..n]).await;
            total += n as u64;
            rate_limit_sleep(spec.max_bytes_per_sec, n).await;
        }
        let d = if spec.tap {
@@ -451,9 +463,14 @@ fn spawn_splitter(
                Err(_) => break,
            };
            if n == 0 { break; }
            if let Some(b) = bucket.as_mut() {
                let wait = b.reserve(n as u64);
                if !wait.is_zero() {
                    tokio::time::sleep(wait).await;
                }
            }
            broadcast_chunk(&writers, &edge_senders, &buf[..n]).await;
            total += n as u64;
            rate_limit_sleep(spec.max_bytes_per_sec, n).await;
        }
        debug!(bytes = total, consumers = writers.len(), "splitter finished");
@@ -475,16 +492,45 @@ fn spawn_splitter(
    SplitterHandle { handle }
 }
-/// Token-bucket simple: si `max_bps > 0`, sleep `chunk_size / max_bps`
+/// Token-bucket real con capacidad de burst.
-/// segundos. Implementación crude pero suficiente para v1.
+/// - `rate_bps`: tokens (bytes) por segundo de refill.
-async fn rate_limit_sleep(max_bps: u64, chunk_bytes: usize) {
+/// - `capacity`: máx tokens acumulables. Default = 1 segundo de rate.
-    if max_bps == 0 {
+/// - `tokens`: tokens disponibles (puede negativos para "debt").
-        return;
+/// - `last_refill`: para calcular cuántos refill desde la última call.
 struct TokenBucket {
    rate_bps: u64,
    capacity: u64,
    tokens: f64,
    last_refill: std::time::Instant,
 }
 impl TokenBucket {
    fn new(rate_bps: u64) -> Self {
        Self {
            rate_bps,
            capacity: rate_bps, // 1 second worth of burst.
            tokens: rate_bps as f64,
            last_refill: std::time::Instant::now(),
        }
    }
-    let secs = chunk_bytes as f64 / max_bps as f64;
+
-    let ms = (secs * 1000.0) as u64;
+    /// Refill desde la última call según wall time. Reserva `cost`
-    if ms > 0 {
+    /// tokens; si no alcanza, retorna el sleep necesario.
-        tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
+    fn reserve(&mut self, cost: u64) -> std::time::Duration {
        let now = std::time::Instant::now();
        let elapsed_secs = now.duration_since(self.last_refill).as_secs_f64();
        self.tokens = (self.tokens + elapsed_secs * self.rate_bps as f64)
            .min(self.capacity as f64);
        self.last_refill = now;
        self.tokens -= cost as f64;
        if self.tokens >= 0.0 {
            std::time::Duration::ZERO
        } else {
            // Debt: tiempo para recuperar a 0 tokens.
            let secs_needed = -self.tokens / self.rate_bps as f64;
            std::time::Duration::from_secs_f64(secs_needed)
        }
    }
 }
@@ -30,6 +30,9 @@ pub enum Request {
    /// Health-check.
    Ping,
    /// Health endpoint estructurado: versión + uptime + counts.
    Health,
    /// Crear un workspace nuevo.
    WorkspaceCreate { spec: WorkspaceSpec },
@@ -145,6 +148,16 @@ pub enum Request {
 pub enum Response {
    Pong,
    Health {
        version: String,
        uptime_ms: u64,
        alive_workspaces: u32,
        alive_commands: u32,
        alive_pipelines: u32,
        active_flows: u32,
        dirty: bool,
    },
    WorkspaceCreated {
        id: WorkspaceId,
        warnings: Vec<String>,