// executus/tool/encryption.go

// Package tool — encryption.go: per-skill envelope encryption for
// KV values and file blobs. Values are sealed with AES-256-GCM under a
// per-skill key derived from a single master key (env var
// SKILLS_ENCRYPTION_MASTER_KEY) via HKDF-SHA256, using the skill ID as
// the salt.
//
// !!!!! CRITICAL OPERATIONAL WARNING !!!!!
//
// SKILLS_ENCRYPTION_MASTER_KEY MUST BE BACKED UP SEPARATELY FROM THE
// DATABASE. Losing the master key = losing every byte of encrypted
// KV value and every encrypted file blob, with no recovery path. The
// key is the ONLY thing that can decrypt rows whose
// encryption_key_version > 0.
//
// Operational rules:
//   - Store the master key in a secrets manager (Vault, 1Password,
//     KMS export) — NEVER in the same backup as the database dump.
//   - Rotating the master key without a versioned re-encrypt
//     migration WILL render existing encrypted rows unreadable. The
//     encryption_key_version column was added so a future rotation
//     migration can re-encrypt under a new (master, version)
//     pair; do not bump the version without that migration.
//   - When the env var is empty, encryption is OFF for the whole
//     instance. Skills with encryption_enabled=true still write
//     plaintext (with a logged WARNING). This is intentional — the
//     alternative is to refuse to start, which would break
//     deployment for everyone the moment the secret leaks during
//     rotation. Loud logging + the boot-time warning in mort.go is
//     the correct trade-off.
//
// Why HKDF-derived per-skill keys (vs one global key): a future
// "wipe this skill's data" admin action can be made auditable by
// recording the skill_id in the operation log without exposing the
// master key. Per-skill keys also cap blast radius if one key
// somehow leaks via a side channel — only that one skill's data is
// compromised, not the whole platform.
//
// Why AES-256-GCM: authenticated encryption catches tampered
// ciphertext at decrypt time. The GCM nonce is 12 random bytes per
// row; the auth tag is 16 bytes. Both are stored inline with the
// ciphertext so the storage layer's value/content column holds the
// full envelope (no separate nonce column).
//
// Wire format of an encrypted blob:
//
//	+-- 1 byte: format version (0x01)
//	+-- 12 bytes: GCM nonce
//	+-- N bytes: ciphertext + 16-byte GCM tag
//
// The format-version byte lets a future change to nonce length or
// auth tag handling be detected loudly rather than corrupting reads.
// Encrypt always writes 0x01; Decrypt rejects any other version with
// ErrEncryptionUnknownVersion.
package tool

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"crypto/sha256"
	"errors"
	"fmt"
	"io"
	"os"

	"golang.org/x/crypto/hkdf"
)

// Constants that pin where the master key comes from and how an
// encrypted envelope is laid out.
const (
	// EncryptionMasterKeyEnv names the environment variable that carries
	// the master key for skill envelope encryption. Any non-empty string
	// is accepted; MasterKeyFromEnv hashes it with SHA-256 into the
	// 32-byte master key.
	//
	// !!!!! LOSING THIS KEY = LOSING ALL ENCRYPTED DATA !!!!!
	//
	// Keep a backup that is separate from the database backups and never
	// commit the value. An empty value means encryption is OFF (with a
	// WARNING logged at boot).
	EncryptionMasterKeyEnv = "SKILLS_ENCRYPTION_MASTER_KEY"

	// CurrentKeyVersion is stamped on every newly-encrypted row.
	//
	//   - 0 is reserved for plaintext (legacy / encryption-off).
	//   - 1 means "AES-256-GCM with an HKDF(master, skill_id) per-skill
	//     key, envelope format 0x01".
	//
	// Do not bump this without a migration that re-encrypts existing
	// rows under the new (master, version) pair.
	CurrentKeyVersion = 1

	// envelopeFormatV1 is the leading byte of every envelope Encrypt
	// produces. Decrypt fails with ErrEncryptionUnknownVersion when the
	// first byte is anything else.
	envelopeFormatV1 byte = 0x01

	// gcmNonceSize is the AES-GCM nonce length in bytes. 12 bytes
	// (96 bits) is the size recommended by NIST SP 800-38D.
	gcmNonceSize = 12
)

// Sentinel errors for the encryption helpers. Match them with errors.Is
// so storage adapters can tell "no master key", "malformed envelope",
// "unknown format" and "authentication failed" apart. They are listed
// in the order Decrypt can report them, with the key-derivation error
// last.
var (
	// ErrEncryptionShortInput: Decrypt was handed fewer bytes than the
	// envelope header (format-version byte + nonce) requires. This
	// points at a bug or a malformed write.
	ErrEncryptionShortInput = errors.New("skilltools: encryption input too short")

	// ErrEncryptionUnknownVersion: the envelope's first byte is not
	// envelopeFormatV1. Treat a read that hits this as corruption and
	// surface it to the operator; do NOT silently fall back to
	// plaintext.
	ErrEncryptionUnknownVersion = errors.New("skilltools: encryption envelope has unknown format version")

	// ErrEncryptionTampered: GCM authentication failed while opening
	// the envelope. Either the nonce/ciphertext/tag bytes changed after
	// encryption or the wrong key was supplied. Surface it as "data
	// corruption" — the row is unreadable.
	ErrEncryptionTampered = errors.New("skilltools: encryption auth tag mismatch (data corruption or wrong key)")

	// ErrEncryptionDisabled: a key was requested but the master key is
	// empty (SKILLS_ENCRYPTION_MASTER_KEY unset). Storage adapters
	// treat this as "fall through to plaintext" and MUST log loudly
	// whenever they take that branch.
	ErrEncryptionDisabled = errors.New("skilltools: encryption disabled (master key empty)")
)

// MasterKeyFromEnv reads SKILLS_ENCRYPTION_MASTER_KEY and returns the
// 32-byte master key: the SHA-256 digest of the variable's raw string
// value. The result is the master key itself, NOT a per-skill key;
// pass it to DeriveSkillKey for that.
//
// Why hash instead of demanding exactly 32 raw bytes: operators tend
// to paste a random hex/base64 string of arbitrary length. Hashing
// accepts any non-empty input and always yields a fixed-length key.
// The hash only normalizes the length — per-skill diversification is
// still HKDF's job. The value is used exactly as found in the
// environment (it is not trimmed or decoded), so stray whitespace
// produces a different key.
//
// Returns (nil, false) when the variable is unset or empty.
func MasterKeyFromEnv() (key []byte, present bool) {
	if secret := os.Getenv(EncryptionMasterKeyEnv); secret != "" {
		digest := sha256.Sum256([]byte(secret))
		return digest[:], true
	}
	return nil, false
}

// DeriveSkillKey derives the 32-byte AES-256 key for one skill from
// the (master, skillID) pair using HKDF-SHA256.
//
// The skill ID is the HKDF salt, so every skill gets its own subkey.
// Skill IDs are not secret (they show up in logs); knowing one does
// not help an attacker who lacks the master key. The HKDF info
// parameter is a fixed label, which leaves room to derive other
// per-skill keys (e.g. a future HMAC key) from the same pair under a
// different label without colliding.
//
// master is expected to be the 32-byte output of MasterKeyFromEnv (or
// an equivalent length-normalized secret); only emptiness is checked
// here.
//
// Errors:
//   - ErrEncryptionDisabled when master is empty.
//   - a plain error when skillID is empty.
//   - a wrapped error if reading from the HKDF stream fails.
func DeriveSkillKey(master []byte, skillID string) ([]byte, error) {
	switch {
	case len(master) == 0:
		return nil, ErrEncryptionDisabled
	case skillID == "":
		return nil, errors.New("skilltools: DeriveSkillKey requires non-empty skillID")
	}

	const (
		keyLen = 32                    // AES-256 key size in bytes
		label  = "mort/skills/v1/aead" // HKDF info: domain-separates this use
	)
	kdf := hkdf.New(sha256.New, master, []byte(skillID), []byte(label))
	skillKey := make([]byte, keyLen)
	_, err := io.ReadFull(kdf, skillKey)
	if err != nil {
		return nil, fmt.Errorf("skilltools: HKDF derive: %w", err)
	}
	return skillKey, nil
}

// Encrypt seals plaintext with AES-256-GCM under skillKey and returns
// the wire envelope:
//
//	version byte || 12-byte nonce || ciphertext || GCM tag
//
// skillKey must be exactly 32 bytes; any other length is rejected. No
// additional authenticated data is used.
//
// Encrypt only produces bytes. Persisting them, and stamping the row's
// encryption_key_version column with CurrentKeyVersion after a
// successful call, is the storage layer's job.
//
// Every call draws a fresh random nonce from crypto/rand. Reusing a
// nonce under the same GCM key is catastrophic, and 96-bit random
// nonces collide with negligible probability at any realistic write
// rate.
func Encrypt(skillKey, plaintext []byte) ([]byte, error) {
	if n := len(skillKey); n != 32 {
		return nil, fmt.Errorf("skilltools: Encrypt requires 32-byte key, got %d", n)
	}

	blk, err := aes.NewCipher(skillKey)
	if err != nil {
		return nil, fmt.Errorf("skilltools: aes.NewCipher: %w", err)
	}
	aead, err := cipher.NewGCM(blk)
	if err != nil {
		return nil, fmt.Errorf("skilltools: cipher.NewGCM: %w", err)
	}

	// Build the header in place and reserve room for ciphertext + tag
	// so Seal can append without reallocating.
	const headerLen = 1 + gcmNonceSize
	envelope := make([]byte, headerLen, headerLen+len(plaintext)+aead.Overhead())
	envelope[0] = envelopeFormatV1

	// The nonce lives inside the header; Seal only appends past
	// len(envelope), so it never overwrites these bytes.
	nonce := envelope[1:headerLen]
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		return nil, fmt.Errorf("skilltools: rand.Read: %w", err)
	}

	return aead.Seal(envelope, nonce, plaintext, nil), nil
}

// Decrypt opens an envelope produced by Encrypt under the same
// skillKey and returns the plaintext.
//
// The storage row's encryption_key_version column is NOT consulted
// here; only the in-band envelope-format byte is checked. Callers MUST
// branch on that column first and should not call Decrypt for
// version-0 (plaintext) rows.
//
// Errors, in the order they are checked:
//   - a plain error when skillKey is not exactly 32 bytes.
//   - ErrEncryptionShortInput when envelope cannot hold the version
//     byte plus the nonce.
//   - ErrEncryptionUnknownVersion when the first byte is not
//     envelopeFormatV1.
//   - ErrEncryptionTampered when GCM refuses to open the sealed bytes.
func Decrypt(skillKey, envelope []byte) ([]byte, error) {
	const headerLen = 1 + gcmNonceSize

	switch {
	case len(skillKey) != 32:
		return nil, fmt.Errorf("skilltools: Decrypt requires 32-byte key, got %d", len(skillKey))
	case len(envelope) < headerLen:
		return nil, ErrEncryptionShortInput
	case envelope[0] != envelopeFormatV1:
		return nil, ErrEncryptionUnknownVersion
	}

	blk, err := aes.NewCipher(skillKey)
	if err != nil {
		return nil, fmt.Errorf("skilltools: aes.NewCipher: %w", err)
	}
	aead, err := cipher.NewGCM(blk)
	if err != nil {
		return nil, fmt.Errorf("skilltools: cipher.NewGCM: %w", err)
	}

	nonce, sealed := envelope[1:headerLen], envelope[headerLen:]
	opened, err := aead.Open(nil, nonce, sealed, nil)
	if err != nil {
		// Every Open failure is reported as ErrEncryptionTampered; the
		// underlying error is deliberately dropped. The likely causes
		// are a wrong key or modified/truncated bytes, and callers
		// only need the "data corruption" signal.
		return nil, ErrEncryptionTampered
	}
	return opened, nil
}