feat(shipote): quota enforce + cgroup memory.max + pipeline restart (fase L)
- WorkspaceSpec.quota_enforce: QuotaAction (None|Log|Kill) por recurso (mem, nproc). reap_dead aplica la policy; Kill usa stop_with_grace(ZERO).
- ente_incarnate::cgroup::apply_rlimits_to_cgroup escribe memory.max y pids.max. WorkspaceManager::create_with_id lo invoca si hay soma.cgroup.path y delegation. El kernel hace OOM kill al exceder el límite; falla silenciosamente si no hay delegation.
- PipelineSpec.restart_on_failure: bool. register_pipeline_supervisor retiene el spec; reap_dead detecta all-dead + any-failed → push a la queue; el daemon reaper la drena y relanza el pipeline ENTERO (los pipes intermedios no permiten restart parcial).

82 tests pasan (ente-incarnate 16, nouser-core 27, shipote-card 8, shipote-core 24, shipote-discern 5, yahweh-provider-fs 3).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -93,7 +93,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let discerner = Arc::new(DiscernPipeline::default_pipeline());
|
||||
|
||||
// Reaper periódico cada 500 ms.
|
||||
// Reaper periódico cada 500 ms. Además drena pipelines pendientes
|
||||
// de restart (supervisión a nivel pipeline).
|
||||
{
|
||||
let mgr = mgr.clone();
|
||||
tokio::spawn(async move {
|
||||
@@ -101,6 +102,55 @@ async fn main() -> anyhow::Result<()> {
|
||||
loop {
|
||||
tick.tick().await;
|
||||
mgr.reap_dead().await;
|
||||
let pending = mgr.take_pending_restarts().await;
|
||||
for sup in pending {
|
||||
info!(
|
||||
label = %sup.spec.label,
|
||||
restart_count = sup.restart_count,
|
||||
"pipeline restart: relaunching"
|
||||
);
|
||||
let inc = mgr.incarnator_handle();
|
||||
let disc = std::sync::Arc::new(DiscernPipeline::default_pipeline());
|
||||
let ws_label = mgr
|
||||
.workspace_label(sup.spec.workspace)
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
let restart_count = sup.restart_count;
|
||||
let workspace = sup.spec.workspace;
|
||||
let tap = sup.tap;
|
||||
let mut new_spec = sup.spec.clone();
|
||||
// Mantener restart_on_failure para futuras fallas.
|
||||
new_spec.restart_on_failure = true;
|
||||
match shipote_core::pipeline::run_pipeline(
|
||||
&new_spec,
|
||||
&ws_label,
|
||||
tap,
|
||||
disc,
|
||||
inc,
|
||||
Some(mgr.clone()),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(launch) => {
|
||||
mgr.register_pipeline_commands(workspace, launch.pipeline, launch.command_pids.clone())
|
||||
.await;
|
||||
// Re-registrar supervisor con el nuevo pipeline_id,
|
||||
// preservando restart_count.
|
||||
let mut s = shipote_core::PipelineSupervisor {
|
||||
workspace,
|
||||
spec: new_spec,
|
||||
tap,
|
||||
restart_count,
|
||||
};
|
||||
s.restart_count = restart_count;
|
||||
mgr.register_pipeline_supervisor(launch.pipeline, workspace, s.spec.clone(), tap)
|
||||
.await;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(?e, "pipeline restart failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -219,6 +269,7 @@ async fn dispatch(
|
||||
announce_edges_to_broker(pool.as_deref(), &pipeline_id, &launch.edge_discernments);
|
||||
let cmds = launch.command_pids;
|
||||
mgr.register_pipeline_commands(spec.workspace, pipeline_id, cmds.clone()).await;
|
||||
mgr.register_pipeline_supervisor(pipeline_id, spec.workspace, spec.clone(), tap).await;
|
||||
let edges = launch.edge_discernments.into_iter().map(map_edge_to_info).collect();
|
||||
Response::PipelineStarted {
|
||||
pipeline: pipeline_id,
|
||||
@@ -318,6 +369,7 @@ async fn dispatch(
|
||||
announce_edges_to_broker(pool.as_deref(), &pipeline_id, &launch.edge_discernments);
|
||||
let cmds = launch.command_pids;
|
||||
mgr.register_pipeline_commands(spec.workspace, pipeline_id, cmds.clone()).await;
|
||||
mgr.register_pipeline_supervisor(pipeline_id, spec.workspace, spec.clone(), tap).await;
|
||||
let edges = launch.edge_discernments.into_iter().map(map_edge_to_info).collect();
|
||||
Response::PipelineStarted {
|
||||
pipeline: pipeline_id,
|
||||
|
||||
Reference in New Issue
Block a user