// executus/run/checkpoint.go

package run

import (
	"context"
	"errors"
	"log/slog"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// Durable-recovery plumbing for the executor. The Checkpointer port (set via
// Ports.Checkpointer, a CheckpointerFactory) persists a run's resumable progress
// during the loop; on boot a host re-dispatches an interrupted run through the
// executor with a ResumeState (the saved transcript / completed phases) so it
// CONTINUES rather than restarting, reusing the SAME durable record via an
// existing Checkpointer. Both are carried into Run via the context (mirrors
// mort's agentexec.WithResumeState / WithExistingCheckpointer).

// ResumeState carries a recovered run's prior progress into Run so the run
// continues instead of restarting. The host's recovery path attaches a pointer
// to it with WithResumeState; the executor reads it back from the context
// (resumeStateFromContext yields nil when none was attached):
//   - single-loop: History seeds the saved transcript (the run continues).
//   - multi-phase: CompletedPhases are skipped; the interrupted phase re-runs
//     from its start (boundary-granular — there is no mid-phase transcript
//     resume, so History is unused for multi-phase runs).
type ResumeState struct {
	History         []llm.Message // single-loop transcript (unused for multi-phase)
	CompletedPhases []PhaseOutput // multi-phase: outputs of finished phases, in order
}

// resumeStateKey is the unexported context-key type under which
// WithResumeState stores the *ResumeState. Because the type is private to this
// package, no other package can construct a colliding key.
type resumeStateKey struct{}

// WithResumeState returns a context derived from ctx that holds rs under a
// package-private key, where resumeStateFromContext can retrieve it. The host's
// recovery path uses it to hand a recovered run's prior progress to Run.
func WithResumeState(ctx context.Context, rs *ResumeState) context.Context {
	var key resumeStateKey
	return context.WithValue(ctx, key, rs)
}

// resumeStateFromContext returns the *ResumeState stored in ctx by
// WithResumeState, or nil when ctx holds no value of that type under the key.
func resumeStateFromContext(ctx context.Context) *ResumeState {
	if rs, ok := ctx.Value(resumeStateKey{}).(*ResumeState); ok {
		return rs
	}
	return nil
}

// existingCheckpointerKey is the unexported context-key type under which
// WithExistingCheckpointer stores the Checkpointer. Because the type is private
// to this package, no other package can construct a colliding key.
type existingCheckpointerKey struct{}

// WithExistingCheckpointer returns a context derived from ctx that holds cp
// under a package-private key, where existingCheckpointerFromContext can
// retrieve it. A recovery re-run uses it to reuse the SAME durable record: the
// executor takes this Checkpointer instead of calling Ports.Checkpointer.Begin.
func WithExistingCheckpointer(ctx context.Context, cp Checkpointer) context.Context {
	var key existingCheckpointerKey
	return context.WithValue(ctx, key, cp)
}

// existingCheckpointerFromContext returns the Checkpointer stored in ctx by
// WithExistingCheckpointer, or nil when ctx holds no Checkpointer under the key.
func existingCheckpointerFromContext(ctx context.Context) Checkpointer {
	if cp, ok := ctx.Value(existingCheckpointerKey{}).(Checkpointer); ok {
		return cp
	}
	return nil
}

// checkpointOutcome enumerates what finalizeCheckpoint does with a durable
// run's record once the run has ended.
type checkpointOutcome int

const (
	// checkpointComplete: the run returned no error; the checkpointer's
	// Complete is called.
	checkpointComplete = checkpointOutcome(iota)
	// checkpointLeaveRunning: the run was interrupted by shutdown; neither
	// Complete nor Fail is called, so the record stays for boot recovery.
	checkpointLeaveRunning
	// checkpointFail: the run ended with any other error; the checkpointer's
	// Fail is called with that error.
	checkpointFail
)

// classifyCheckpointOutcome decides how a durable run's record is finalized
// from the run's error and the cancellation cause:
//   - runErr == nil: checkpointComplete (success clears the checkpoint),
//     whatever the cause is.
//   - cause matches ErrShutdown (per errors.Is): checkpointLeaveRunning — the
//     record is left so boot recovery picks it up (neither Complete nor Fail).
//   - anything else (model error, tool loop, the run's own deadline, a critic
//     kill, a caller cancel): checkpointFail, which is terminal.
//
// Mirrors mort's agentexec.classifyCheckpointOutcome.
func classifyCheckpointOutcome(runErr, cause error) checkpointOutcome {
	if runErr == nil {
		return checkpointComplete
	}
	// Only the cancellation cause is inspected for shutdown, not runErr itself.
	if errors.Is(cause, ErrShutdown) {
		return checkpointLeaveRunning
	}
	return checkpointFail
}

// finalizeCheckpoint records a run's terminal state on its per-run
// checkpointer. A nil cp is a no-op. The outcome comes from
// classifyCheckpointOutcome(runErr, cause):
//   - checkpointLeaveRunning: nothing is called; the record stays for boot
//     recovery.
//   - checkpointComplete: cp.Complete is called.
//   - checkpointFail: cp.Fail is called with runErr.
//
// Calls to cp receive detach(ctx) rather than ctx, so a cancelled run still
// records its terminal state. Errors from Complete/Fail are best-effort: they
// are logged and dropped (a stale record would only cause a wasteful
// boot-recovery retry, not data loss).
func finalizeCheckpoint(ctx context.Context, cp Checkpointer, runErr error, cause error) {
	if cp == nil {
		return
	}

	outcome := classifyCheckpointOutcome(runErr, cause)
	if outcome == checkpointLeaveRunning {
		// Interrupted by shutdown: leave the record for boot recovery.
		return
	}

	if outcome == checkpointComplete {
		err := cp.Complete(detach(ctx))
		if err != nil {
			slog.Warn("run: checkpoint Complete failed", "error", err)
		}
		return
	}

	if outcome == checkpointFail {
		err := cp.Fail(detach(ctx), runErr)
		if err != nil {
			slog.Warn("run: checkpoint Fail failed", "error", err)
		}
	}
}