// Package store provides a SQLite-backed durable queue for foreman jobs and artifacts.
//
// Why: jobs must survive daemon restarts so async callers and webhooks never lose
// work (ADR-0008). SQLite in WAL mode gives durable single-writer/multi-reader
// semantics with no external dependencies.
// What: opens a SQLite database, runs migrations, and exposes CRUD for jobs and
// artifacts.
// Test: use t.TempDir() for an isolated DB per test; verify all CRUD operations
// and state transitions.
package store

import (
	"database/sql"
	"encoding/json"
	"fmt"
	"time"

	// Imported for its side effects only; Open passes the driver name "sqlite"
	// to sql.Open.
	_ "modernc.org/sqlite"
)

// JobState represents the lifecycle state of a job. Values are written to and
// read from the jobs.state column as plain strings.
type JobState string

// Lifecycle states. The worker drives a job queued -> loading -> working ->
// done/failed (see UpdateJobState).
const (
	// JobStateQueued is assigned by CreateJob, restored by IncrementAttempt and
	// ResetInterruptedJobs, and is the only state NextJob selects from.
	JobStateQueued JobState = "queued"

	// JobStateLoading is an in-flight state: UpdateJobState stamps started_at on
	// entry and ResetInterruptedJobs moves such jobs back to queued.
	JobStateLoading JobState = "loading"

	// JobStateWorking is an in-flight state, handled like JobStateLoading by
	// UpdateJobState and ResetInterruptedJobs.
	JobStateWorking JobState = "working"

	// JobStateDone is a terminal state: UpdateJobState stamps completed_at on
	// entry and DeleteTerminalJobsBefore may prune the job.
	JobStateDone JobState = "done"

	// JobStateFailed is a terminal state, handled like JobStateDone by
	// UpdateJobState and DeleteTerminalJobsBefore.
	JobStateFailed JobState = "failed"
)

// Job represents a queued unit of work. Each field maps to the jobs column
// named in its JSON tag.
type Job struct {
	// ID is the primary key of the jobs table. CreateJob stores it as given; it
	// is not generated by this package.
	ID string `json:"id"`

	// Model names the model the job targets. NextJob prefers jobs whose Model
	// equals the model passed to it.
	Model string `json:"model"`

	// Payload is the raw JSON request, stored as a blob without being parsed.
	Payload json.RawMessage `json:"payload"`

	// State is the current lifecycle state.
	State JobState `json:"state"`

	// Result is the raw JSON result written by UpdateJobState; empty until one
	// has been stored.
	Result json.RawMessage `json:"result,omitempty"`

	// Error is the error message written by UpdateJobState; nil when the column
	// is NULL.
	Error *string `json:"error,omitempty"`

	// Attempt counts retries; IncrementAttempt adds one each time it re-queues
	// the job.
	Attempt int `json:"attempt"`

	// MaxAttempts is the retry budget. CreateJob substitutes 3 when it is zero.
	// Nothing in this file enforces the limit.
	MaxAttempts int `json:"max_attempts"`

	// StateWebhookURL is optional (nil is stored as NULL). It is only persisted
	// here; presumably another package calls it on state changes.
	StateWebhookURL *string `json:"state_webhook_url,omitempty"`

	// CreatedAt is set by CreateJob to the current UTC time.
	CreatedAt time.Time `json:"created_at"`

	// UpdatedAt is refreshed to the current UTC time by every method that
	// modifies the row.
	UpdatedAt time.Time `json:"updated_at"`

	// StartedAt is stamped by UpdateJobState when the job enters loading or
	// working; nil until then.
	StartedAt *time.Time `json:"started_at,omitempty"`

	// CompletedAt is stamped by UpdateJobState when the job enters done or
	// failed; nil until then.
	CompletedAt *time.Time `json:"completed_at,omitempty"`
}

// Artifact represents a named, typed blob attached to a completed job.
type Artifact struct {
	// ID is the auto-increment row id; CreateArtifact fills it in from the
	// insert result.
	ID int64 `json:"id"`

	// JobID references jobs.id (foreign key in the schema).
	JobID string `json:"job_id"`

	// Name identifies the artifact within its job; (job_id, name) is unique.
	Name string `json:"name"`

	// ContentType is stored and returned as given.
	ContentType string `json:"content_type"`

	// Data is the blob itself. It is excluded from JSON encoding.
	Data []byte `json:"-"`

	// Size is the length of Data in bytes; CreateArtifact overwrites whatever
	// the caller supplied.
	Size int64 `json:"size"`

	// CreatedAt is set by CreateArtifact to the current UTC time.
	CreatedAt time.Time `json:"created_at"`
}

// Store wraps a SQLite database with job and artifact operations.
type Store struct {
	// db is the connection pool opened by Open and released by Close.
	db *sql.DB
}

// migration is the DDL that creates the schema. It runs once on Open via
// IF NOT EXISTS guards.
const migration = ` CREATE TABLE IF NOT EXISTS jobs ( id TEXT PRIMARY KEY, model TEXT NOT NULL, payload BLOB NOT NULL, state TEXT NOT NULL DEFAULT 'queued', result BLOB, error TEXT, attempt INTEGER NOT NULL DEFAULT 0, max_attempts INTEGER NOT NULL DEFAULT 3, state_webhook_url TEXT, created_at DATETIME NOT NULL, updated_at DATETIME NOT NULL, started_at DATETIME, completed_at DATETIME ); CREATE INDEX IF NOT EXISTS idx_jobs_state ON jobs(state); CREATE INDEX IF NOT EXISTS idx_jobs_model_state ON jobs(model, state); CREATE TABLE IF NOT EXISTS artifacts ( id INTEGER PRIMARY KEY AUTOINCREMENT, job_id TEXT NOT NULL REFERENCES jobs(id), name TEXT NOT NULL, content_type TEXT NOT NULL, data BLOB NOT NULL, size INTEGER NOT NULL, created_at DATETIME NOT NULL, UNIQUE(job_id, name) ); ` // Open creates or opens a SQLite database at path, enables WAL mode, and runs // migrations. // // Why: single entry point ensures WAL mode and schema are always applied. // What: opens the DB, sets pragmas, runs CREATE TABLE IF NOT EXISTS. // Test: call Open with a temp dir path, assert no error and that tables exist. func Open(path string) (*Store, error) { // Append pragmas to the DSN so they apply to every connection in the pool. dsn := path + "?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)&_pragma=busy_timeout(5000)" db, err := sql.Open("sqlite", dsn) if err != nil { return nil, fmt.Errorf("open sqlite %q: %w", path, err) } if _, err := db.Exec(migration); err != nil { db.Close() return nil, fmt.Errorf("run migration: %w", err) } return &Store{db: db}, nil } // Close closes the underlying database connection. func (s *Store) Close() error { return s.db.Close() } // CreateJob inserts a new job into the queue. // // Why: the async /jobs endpoint and the sync passthrough both need to enqueue work. // What: inserts a job row with state "queued" and returns the stored Job. // Test: create a job, then GetJob by ID, assert fields match. 
func (s *Store) CreateJob(job Job) (Job, error) { now := time.Now().UTC() job.State = JobStateQueued job.CreatedAt = now job.UpdatedAt = now if job.MaxAttempts == 0 { job.MaxAttempts = 3 } _, err := s.db.Exec( `INSERT INTO jobs (id, model, payload, state, attempt, max_attempts, state_webhook_url, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, job.ID, job.Model, []byte(job.Payload), string(job.State), job.Attempt, job.MaxAttempts, job.StateWebhookURL, job.CreatedAt, job.UpdatedAt, ) if err != nil { return Job{}, fmt.Errorf("insert job %s: %w", job.ID, err) } return job, nil } // GetJob retrieves a job by ID. // // Why: callers need to poll job status via GET /jobs/{id} and the worker needs to // read jobs from the queue. // What: queries the jobs table by primary key and scans into a Job struct. // Test: create a job, GetJob, assert all fields round-trip correctly. func (s *Store) GetJob(id string) (Job, error) { var j Job var payload, result []byte err := s.db.QueryRow( `SELECT id, model, payload, state, result, error, attempt, max_attempts, state_webhook_url, created_at, updated_at, started_at, completed_at FROM jobs WHERE id = ?`, id, ).Scan( &j.ID, &j.Model, &payload, &j.State, &result, &j.Error, &j.Attempt, &j.MaxAttempts, &j.StateWebhookURL, &j.CreatedAt, &j.UpdatedAt, &j.StartedAt, &j.CompletedAt, ) if err != nil { return Job{}, fmt.Errorf("get job %s: %w", id, err) } j.Payload = json.RawMessage(payload) if result != nil { j.Result = json.RawMessage(result) } return j, nil } // UpdateJobState transitions a job to a new state and updates associated fields. // // Why: the worker loop drives jobs through their lifecycle (queued -> loading -> // working -> done/failed), and each transition must be persisted durably. // What: updates the state, updated_at, and optionally result/error/timestamps. // Test: create a job, advance through states, assert each transition persists. 
func (s *Store) UpdateJobState(id string, state JobState, result json.RawMessage, errMsg *string) error { now := time.Now().UTC() var resultBytes []byte if result != nil { resultBytes = []byte(result) } var startedAt, completedAt *time.Time switch state { case JobStateLoading, JobStateWorking: startedAt = &now case JobStateDone, JobStateFailed: completedAt = &now } res, err := s.db.Exec( `UPDATE jobs SET state = ?, result = ?, error = ?, updated_at = ?, started_at = COALESCE(?, started_at), completed_at = COALESCE(?, completed_at) WHERE id = ?`, string(state), resultBytes, errMsg, now, startedAt, completedAt, id, ) if err != nil { return fmt.Errorf("update job %s state to %s: %w", id, state, err) } rows, err := res.RowsAffected() if err != nil { return fmt.Errorf("check rows affected for job %s: %w", id, err) } if rows == 0 { return fmt.Errorf("job %s not found", id) } return nil } // ListJobs returns jobs, optionally filtered by state. If state is nil, all jobs // are returned ordered by created_at descending. // // Why: the GET /jobs endpoint needs to list jobs with optional state filtering. // What: queries the jobs table with an optional WHERE clause on state. // Test: create jobs in different states, list with and without filter, assert counts. func (s *Store) ListJobs(state *JobState) ([]Job, error) { var rows *sql.Rows var err error if state != nil { rows, err = s.db.Query( `SELECT id, model, payload, state, result, error, attempt, max_attempts, state_webhook_url, created_at, updated_at, started_at, completed_at FROM jobs WHERE state = ? 
ORDER BY created_at DESC`, string(*state), ) } else { rows, err = s.db.Query( `SELECT id, model, payload, state, result, error, attempt, max_attempts, state_webhook_url, created_at, updated_at, started_at, completed_at FROM jobs ORDER BY created_at DESC`, ) } if err != nil { return nil, fmt.Errorf("list jobs: %w", err) } defer rows.Close() var jobs []Job for rows.Next() { var j Job var payload, result []byte if err := rows.Scan( &j.ID, &j.Model, &payload, &j.State, &result, &j.Error, &j.Attempt, &j.MaxAttempts, &j.StateWebhookURL, &j.CreatedAt, &j.UpdatedAt, &j.StartedAt, &j.CompletedAt, ); err != nil { return nil, fmt.Errorf("scan job row: %w", err) } j.Payload = json.RawMessage(payload) if result != nil { j.Result = json.RawMessage(result) } jobs = append(jobs, j) } return jobs, rows.Err() } // CreateArtifact attaches a named artifact to a job. // // Why: completed jobs produce artifacts (the completion response, structured data, // etc.) that must be stored durably for webhook delivery and polling (ADR-0006). // What: inserts a row into the artifacts table with the blob data. // Test: create a job, attach an artifact, retrieve it, assert data matches. func (s *Store) CreateArtifact(artifact Artifact) (Artifact, error) { now := time.Now().UTC() artifact.CreatedAt = now artifact.Size = int64(len(artifact.Data)) res, err := s.db.Exec( `INSERT INTO artifacts (job_id, name, content_type, data, size, created_at) VALUES (?, ?, ?, ?, ?, ?)`, artifact.JobID, artifact.Name, artifact.ContentType, artifact.Data, artifact.Size, artifact.CreatedAt, ) if err != nil { return Artifact{}, fmt.Errorf("insert artifact %q for job %s: %w", artifact.Name, artifact.JobID, err) } id, err := res.LastInsertId() if err != nil { return Artifact{}, fmt.Errorf("get artifact id: %w", err) } artifact.ID = id return artifact, nil } // GetArtifact retrieves a single artifact by job ID and name. // // Why: the GET /jobs/{id}/artifacts/{name} endpoint serves individual artifacts. 
// What: queries by the (job_id, name) unique key and returns the full blob. // Test: create an artifact, get it by job_id+name, assert data round-trips. func (s *Store) GetArtifact(jobID, name string) (Artifact, error) { var a Artifact err := s.db.QueryRow( `SELECT id, job_id, name, content_type, data, size, created_at FROM artifacts WHERE job_id = ? AND name = ?`, jobID, name, ).Scan(&a.ID, &a.JobID, &a.Name, &a.ContentType, &a.Data, &a.Size, &a.CreatedAt) if err != nil { return Artifact{}, fmt.Errorf("get artifact %q for job %s: %w", name, jobID, err) } return a, nil } // GetArtifactsByJob returns all artifacts for a given job. // // Why: the GET /jobs/{id} response includes artifact metadata for the caller to // decide which to fetch. // What: queries all artifacts by job_id, ordered by name. // Test: attach multiple artifacts to a job, list them, assert all returned. func (s *Store) GetArtifactsByJob(jobID string) ([]Artifact, error) { rows, err := s.db.Query( `SELECT id, job_id, name, content_type, data, size, created_at FROM artifacts WHERE job_id = ? ORDER BY name`, jobID, ) if err != nil { return nil, fmt.Errorf("list artifacts for job %s: %w", jobID, err) } defer rows.Close() var artifacts []Artifact for rows.Next() { var a Artifact if err := rows.Scan(&a.ID, &a.JobID, &a.Name, &a.ContentType, &a.Data, &a.Size, &a.CreatedAt); err != nil { return nil, fmt.Errorf("scan artifact row: %w", err) } artifacts = append(artifacts, a) } return artifacts, rows.Err() } // NextJob returns the next queued job using drain-by-model ordering. Jobs for the // currently-resident model are preferred to avoid swap costs, then ordered by // creation time. // // Why: the worker loop must pick the optimal next job to minimize model swaps // (ADR-0009 drain-by-model heuristic). // What: queries for the first queued job, sorting by model affinity then FIFO. // Test: enqueue jobs for two models, set currentModel to one, verify it drains // that model first before switching. 
func (s *Store) NextJob(currentModel string) (Job, error) { var j Job var payload, result []byte err := s.db.QueryRow( `SELECT id, model, payload, state, result, error, attempt, max_attempts, state_webhook_url, created_at, updated_at, started_at, completed_at FROM jobs WHERE state = ? ORDER BY (CASE WHEN model = ? THEN 0 ELSE 1 END) ASC, created_at ASC LIMIT 1`, string(JobStateQueued), currentModel, ).Scan( &j.ID, &j.Model, &payload, &j.State, &result, &j.Error, &j.Attempt, &j.MaxAttempts, &j.StateWebhookURL, &j.CreatedAt, &j.UpdatedAt, &j.StartedAt, &j.CompletedAt, ) if err != nil { return Job{}, fmt.Errorf("next job: %w", err) } j.Payload = json.RawMessage(payload) if result != nil { j.Result = json.RawMessage(result) } return j, nil } // IncrementAttempt bumps the attempt counter on a job and resets it to queued. // // Why: retry logic needs to record each attempt while re-queuing the job. // What: increments attempt by 1 and sets state back to queued. // Test: create a job, increment twice, verify attempt=2 and state=queued. func (s *Store) IncrementAttempt(id string) error { now := time.Now().UTC() res, err := s.db.Exec( `UPDATE jobs SET attempt = attempt + 1, state = ?, updated_at = ? WHERE id = ?`, string(JobStateQueued), now, id, ) if err != nil { return fmt.Errorf("increment attempt for job %s: %w", id, err) } rows, err := res.RowsAffected() if err != nil { return fmt.Errorf("check rows affected for job %s: %w", id, err) } if rows == 0 { return fmt.Errorf("job %s not found", id) } return nil } // ResetInterruptedJobs moves any loading or working jobs back to queued. Called // on startup to recover from a crash mid-execution. // // Why: if the daemon restarts while a job is in-flight, the job must not be stuck // in a non-terminal, non-queued state forever. // What: updates all loading/working jobs to queued. // Test: create jobs in loading/working states, call Reset, verify all are queued. 
func (s *Store) ResetInterruptedJobs() (int64, error) { now := time.Now().UTC() res, err := s.db.Exec( `UPDATE jobs SET state = ?, updated_at = ? WHERE state IN (?, ?)`, string(JobStateQueued), now, string(JobStateLoading), string(JobStateWorking), ) if err != nil { return 0, fmt.Errorf("reset interrupted jobs: %w", err) } rows, err := res.RowsAffected() if err != nil { return 0, fmt.Errorf("check rows affected: %w", err) } return rows, nil } // DeleteTerminalJobsBefore deletes terminal jobs (done or failed) and their // artifacts older than the given cutoff time. // // Why: prevents unbounded storage growth by pruning old completed work (ADR-0008). // What: deletes artifacts first (FK), then jobs with completed_at before cutoff. // Test: create old terminal jobs, call with a recent cutoff, verify they are gone. func (s *Store) DeleteTerminalJobsBefore(cutoff time.Time) (int64, error) { // Delete artifacts for terminal jobs first (foreign key). _, err := s.db.Exec( `DELETE FROM artifacts WHERE job_id IN ( SELECT id FROM jobs WHERE state IN (?, ?) AND completed_at < ? )`, string(JobStateDone), string(JobStateFailed), cutoff, ) if err != nil { return 0, fmt.Errorf("delete old artifacts: %w", err) } res, err := s.db.Exec( `DELETE FROM jobs WHERE state IN (?, ?) AND completed_at < ?`, string(JobStateDone), string(JobStateFailed), cutoff, ) if err != nil { return 0, fmt.Errorf("delete old jobs: %w", err) } rows, err := res.RowsAffected() if err != nil { return 0, fmt.Errorf("check rows affected: %w", err) } return rows, nil }