feat(shipote): quota enforce + cgroup memory.max + pipeline restart (fase L)

- WorkspaceSpec.quota_enforce: QuotaAction (None|Log|Kill) por recurso
  (mem, nproc). reap_dead aplica policy; Kill usa stop_with_grace(ZERO).
- ente_incarnate::cgroup::apply_rlimits_to_cgroup escribe memory.max y
  pids.max. WorkspaceManager::create_with_id lo invoca si existe
  soma.cgroup.path y hay delegation. El kernel hace OOM kill al exceder
  memory.max; falla silenciosa si no hay delegation.
- PipelineSpec.restart_on_failure: bool. register_pipeline_supervisor
  retiene spec; reap_dead detecta all-dead + any-failed → push a queue;
  daemon reaper drena y relanza pipeline ENTERO (los pipes intermedios
  no permiten restart parcial).

82 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8,
shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
sergio
2026-05-11 10:22:46 +00:00
parent 324a0c2d5d
commit 4c9d1b4c1d
7 changed files with 401 additions and 5 deletions
+53 -1
View File
@@ -93,7 +93,8 @@ async fn main() -> anyhow::Result<()> {
let discerner = Arc::new(DiscernPipeline::default_pipeline());
// Reaper periódico cada 500 ms.
// Reaper periódico cada 500 ms. Además drena pipelines pendientes
// de restart (supervisión a nivel pipeline).
{
let mgr = mgr.clone();
tokio::spawn(async move {
@@ -101,6 +102,55 @@ async fn main() -> anyhow::Result<()> {
loop {
tick.tick().await;
mgr.reap_dead().await;
// Pipeline-level supervision: drain the supervisors queued for restart and
// relaunch each pipeline as a whole.
let pending = mgr.take_pending_restarts().await;
for sup in pending {
    info!(
        label = %sup.spec.label,
        restart_count = sup.restart_count,
        "pipeline restart: relaunching"
    );
    let inc = mgr.incarnator_handle();
    let disc = std::sync::Arc::new(DiscernPipeline::default_pipeline());
    let workspace = sup.spec.workspace;
    // Falls back to the default value when the lookup does not produce a label.
    let ws_label = mgr.workspace_label(workspace).await.unwrap_or_default();
    let tap = sup.tap;
    let mut new_spec = sup.spec.clone();
    // Keep restart_on_failure set so later failures are supervised as well.
    new_spec.restart_on_failure = true;
    match shipote_core::pipeline::run_pipeline(
        &new_spec,
        &ws_label,
        tap,
        disc,
        inc,
        Some(mgr.clone()),
    )
    .await
    {
        Ok(launch) => {
            // The relaunch yields a fresh pipeline id; commands and the
            // supervisor are both registered under it.
            let pipeline_id = launch.pipeline;
            mgr.register_pipeline_commands(workspace, pipeline_id, launch.command_pids)
                .await;
            // NOTE(review): the previous code built a throwaway
            // `PipelineSupervisor` carrying `sup.restart_count` but only
            // passed its spec on, so the count never reached this call.
            // `register_pipeline_supervisor` takes no count argument here;
            // confirm whether it is meant to survive a restart.
            mgr.register_pipeline_supervisor(pipeline_id, workspace, new_spec, tap)
                .await;
        }
        Err(e) => {
            warn!(?e, "pipeline restart failed");
        }
    }
}
}
});
}
@@ -219,6 +269,7 @@ async fn dispatch(
announce_edges_to_broker(pool.as_deref(), &pipeline_id, &launch.edge_discernments);
let cmds = launch.command_pids;
mgr.register_pipeline_commands(spec.workspace, pipeline_id, cmds.clone()).await;
mgr.register_pipeline_supervisor(pipeline_id, spec.workspace, spec.clone(), tap).await;
let edges = launch.edge_discernments.into_iter().map(map_edge_to_info).collect();
Response::PipelineStarted {
pipeline: pipeline_id,
@@ -318,6 +369,7 @@ async fn dispatch(
announce_edges_to_broker(pool.as_deref(), &pipeline_id, &launch.edge_discernments);
let cmds = launch.command_pids;
mgr.register_pipeline_commands(spec.workspace, pipeline_id, cmds.clone()).await;
mgr.register_pipeline_supervisor(pipeline_id, spec.workspace, spec.clone(), tap).await;
let edges = launch.edge_discernments.into_iter().map(map_edge_to_info).collect();
Response::PipelineStarted {
pipeline: pipeline_id,