package run import ( "context" "errors" "log/slog" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" ) // Durable-recovery plumbing for the executor. The Checkpointer port (set via // Ports.Checkpointer, a CheckpointerFactory) persists a run's resumable progress // during the loop; on boot a host re-dispatches an interrupted run through the // executor with a ResumeState (the saved transcript / completed phases) so it // CONTINUES rather than restarting, reusing the SAME durable record via an // existing Checkpointer. Both are carried into Run via the context (mirrors // mort's agentexec.WithResumeState / WithExistingCheckpointer). // ResumeState carries a recovered run's prior progress into Run so the run // continues instead of restarting. The host's recovery path sets it via // WithResumeState; the executor reads it: // - single-loop: History seeds the saved transcript (the run continues). // - multi-phase: CompletedPhases are skipped; the interrupted phase re-runs // from its start (boundary-granular — there is no mid-phase transcript // resume, so History is unused for multi-phase runs). type ResumeState struct { History []llm.Message // single-loop transcript (unused for multi-phase) CompletedPhases []PhaseOutput // multi-phase: outputs of finished phases, in order } type resumeStateKey struct{} // WithResumeState carries a recovered run's prior progress into Run. func WithResumeState(ctx context.Context, rs *ResumeState) context.Context { return context.WithValue(ctx, resumeStateKey{}, rs) } func resumeStateFromContext(ctx context.Context) *ResumeState { rs, _ := ctx.Value(resumeStateKey{}).(*ResumeState) return rs } type existingCheckpointerKey struct{} // WithExistingCheckpointer carries a pre-existing Checkpointer into Run so a // recovery re-run reuses the SAME durable record (the executor uses it instead of // calling Ports.Checkpointer.Begin). func WithExistingCheckpointer(ctx context.Context, cp Checkpointer) context.Context { return context.WithValue(ctx, existingCheckpointerKey{}, cp) } func existingCheckpointerFromContext(ctx context.Context) Checkpointer { cp, _ := ctx.Value(existingCheckpointerKey{}).(Checkpointer) return cp } // checkpointOutcome is the finalize decision for a durable run. type checkpointOutcome int const ( checkpointComplete checkpointOutcome = iota checkpointLeaveRunning checkpointFail ) // classifyCheckpointOutcome maps (run error, cancellation cause) to the durable // finalize action: success clears the checkpoint (Complete); a shutdown-caused // cancellation leaves the record so boot recovery picks it up (neither // Complete nor Fail); anything else (model error, tool loop, the run's own // deadline, a critic kill, a caller cancel) is terminal (Fail). Mirrors mort's // agentexec.classifyCheckpointOutcome. func classifyCheckpointOutcome(runErr, cause error) checkpointOutcome { switch { case runErr == nil: return checkpointComplete case errors.Is(cause, ErrShutdown): return checkpointLeaveRunning default: return checkpointFail } } // finalizeCheckpoint applies the outcome to the per-run checkpointer (nil-safe). // Runs on a detached context so a cancelled run still records its terminal state. // Complete/Fail errors are best-effort but logged (a stale record would only // cause a wasteful boot-recovery retry, not data loss). func finalizeCheckpoint(ctx context.Context, cp Checkpointer, runErr error, cause error) { if cp == nil { return } switch classifyCheckpointOutcome(runErr, cause) { case checkpointComplete: if err := cp.Complete(detach(ctx)); err != nil { slog.Warn("run: checkpoint Complete failed", "error", err) } case checkpointFail: if err := cp.Fail(detach(ctx), runErr); err != nil { slog.Warn("run: checkpoint Fail failed", "error", err) } case checkpointLeaveRunning: // Interrupted by shutdown: leave the record for boot recovery. } }