P4: checkpoint battery — durable-resume seam + run.Checkpointer handle
Plugs into run.Ports.Checkpointer (the executor's call site is a P2 follow-up;
this provides the seam + impls ahead of it):
- checkpoint.go: CheckpointStore seam + RunCheckpoint{Meta, Messages, Iteration,
ActivePhase} + RunCheckpointMeta (mirrors mort's agentexec types).
- handle.go: New(store, meta, throttle, now) -> run.Checkpointer. Save writes a
throttled snapshot; Complete/Fail delete it (a cleanly finished or terminally
failed run is NOT a recovery candidate; a shutdown-interrupted run never calls
them, so its checkpoint survives ListInterrupted at boot). nil store -> no-op.
- memory.go: NewMemory() default (with the honest caveat that an in-memory store
does not survive the restart it exists to recover from — providing a durable store is mort's job).
Tests: save+complete clears the recovery candidate; throttle skips in-window
saves; nil-store is a clean no-op. Core imports ZERO from checkpoint.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,55 @@
|
||||
package checkpoint
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Memory is a zero-dependency in-process CheckpointStore. NOTE: an in-memory
|
||||
// checkpoint store does NOT survive the process restart it exists to recover
|
||||
// from — it is the test/light-host default and makes ListInterrupted meaningful
|
||||
// only within a single process lifetime. A host that wants real
|
||||
// crash-recovery wires a durable CheckpointStore (mort's durable-job table).
|
||||
type Memory struct {
|
||||
mu sync.RWMutex
|
||||
cps map[string]RunCheckpoint // by run id
|
||||
}
|
||||
|
||||
// NewMemory returns an empty in-memory CheckpointStore.
|
||||
func NewMemory() *Memory { return &Memory{cps: map[string]RunCheckpoint{}} }
|
||||
|
||||
// Compile-time assertion that *Memory satisfies the CheckpointStore interface.
var _ CheckpointStore = (*Memory)(nil)
|
||||
|
||||
func (m *Memory) Save(_ context.Context, cp RunCheckpoint) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.cps[cp.Meta.RunID] = cp
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Memory) Load(_ context.Context, runID string) (*RunCheckpoint, error) {
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
cp, ok := m.cps[runID]
|
||||
if !ok {
|
||||
return nil, nil // no checkpoint (not an error — the run finished cleanly or never started)
|
||||
}
|
||||
return &cp, nil
|
||||
}
|
||||
|
||||
func (m *Memory) Delete(_ context.Context, runID string) error {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
delete(m.cps, runID)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Memory) ListInterrupted(_ context.Context) ([]RunCheckpoint, error) {
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
out := make([]RunCheckpoint, 0, len(m.cps))
|
||||
for _, cp := range m.cps {
|
||||
out = append(out, cp)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
Reference in New Issue
Block a user