64642c43c4
- Unload: reject model ids containing path separators (/?#) so a model name can't redirect the request to another endpoint; ":" (common in ids) stays verbatim. - doJSON: take a model arg so image/management HTTP errors carry the target id (was always ""); add a base-URL guard so management methods fail clearly instead of building a bare-path request; cap the success-path JSON decode with io.LimitReader (64 MiB) and drain the body when out is nil for conn reuse. - image: reject negative Request.N before sending. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
262 lines
9.5 KiB
Go
262 lines
9.5 KiB
Go
// Package llamaswap implements majordomo's provider contract for llama-swap
|
|
// (https://github.com/mostlygeek/llama-swap), an on-demand model-swapping
|
|
// proxy that fronts llama.cpp (and stable-diffusion.cpp) servers, loading and
|
|
// hot-swapping the requested model per request.
|
|
//
|
|
// Chat is OpenAI Chat Completions, byte-for-byte: this package does NOT carry
|
|
// its own chat wire client. Provider.Model delegates to provider/openai
|
|
// pointed at {baseURL}/v1 (ADR-0007: reuse, don't duplicate). What this
|
|
// package adds beyond a bare OpenAI-compat endpoint is the "tailored" surface:
|
|
//
|
|
// - llama-swap management endpoints exposed as concrete methods — ListModels
|
|
// (GET /v1/models), Running (GET /running), Unload (POST /api/models/unload)
|
|
// — which have no place on the canonical llm.Provider interface;
|
|
// - image generation via the imagegen interface (see image.go); and
|
|
// - swap-aware defaults: the HTTP client carries NO timeout, because the
|
|
// first request to an unloaded model blocks while llama-swap spawns the
|
|
// upstream (its healthCheckTimeout is at least 15s). Bound a call with a
|
|
// context deadline, never a client timeout.
|
|
//
|
|
// DSN form (registered as the "llama-swap" scheme): llama-swap://token@host:port
|
|
// builds an http:// base URL (llama-swap is local-first; a TLS-fronted instance
|
|
// can use the openai:// scheme for chat instead).
|
|
package llamaswap
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
|
|
"gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai"
|
|
)
|
|
|
|
// Package-level constants.
const (
	// DefaultName is the registry name a Provider carries unless WithName
	// overrides it.
	DefaultName = "llama-swap"

	// maxResponseBytes bounds how much of a successful response body is
	// read. It is generous enough for a multi-image base64 payload, yet
	// bounded so a hostile or buggy upstream cannot force a decode to
	// allocate without limit.
	maxResponseBytes = 64 << 20
)
|
|
|
|
// Provider is the llama-swap client. Chat is delegated to provider/openai
// (satisfying llm.Provider), image generation satisfies imagegen.Provider, and
// llama-swap's management endpoints are exposed as concrete methods.
type Provider struct {
	// name is the registry name reported by Name.
	name string
	// baseURL is the server root with no trailing slash and no /v1 suffix,
	// e.g. "http://host:port".
	baseURL string
	// token is the bearer credential; empty means no auth (local).
	token string
	// client is the HTTP client used for management calls and handed to the
	// delegated chat client.
	client *http.Client
}
|
|
|
|
// Option configures the provider.
|
|
type Option func(*Provider)
|
|
|
|
// WithName overrides the registry name (default "llama-swap").
|
|
func WithName(name string) Option { return func(p *Provider) { p.name = name } }
|
|
|
|
// WithBaseURL sets the llama-swap base URL (scheme://host[:port]); the /v1 and
|
|
// management paths are appended internally. A trailing slash is trimmed.
|
|
func WithBaseURL(u string) Option {
|
|
return func(p *Provider) { p.baseURL = strings.TrimRight(u, "/") }
|
|
}
|
|
|
|
// WithToken sets the bearer token (llama-swap API key). Empty means no
|
|
// Authorization header.
|
|
func WithToken(token string) Option { return func(p *Provider) { p.token = token } }
|
|
|
|
// WithHTTPClient overrides the HTTP client. Prefer context deadlines over a
|
|
// client timeout: a cold model swap can legitimately take many seconds.
|
|
func WithHTTPClient(c *http.Client) Option {
|
|
return func(p *Provider) {
|
|
if c != nil {
|
|
p.client = c
|
|
}
|
|
}
|
|
}
|
|
|
|
// New creates a llama-swap provider. Construction never fails; a missing base
|
|
// URL surfaces at request time. The default client has no timeout (swap cold
|
|
// starts); bound calls with a context deadline.
|
|
func New(opts ...Option) *Provider {
|
|
p := &Provider{name: DefaultName, client: &http.Client{}}
|
|
for _, opt := range opts {
|
|
opt(p)
|
|
}
|
|
return p
|
|
}
|
|
|
|
// Name implements llm.Provider and imagegen.Provider.
|
|
func (p *Provider) Name() string { return p.name }
|
|
|
|
// BaseURL reports the configured base URL (diagnostics).
|
|
func (p *Provider) BaseURL() string { return p.baseURL }
|
|
|
|
// Model implements llm.Provider via llama-swap's OpenAI-compatible chat
|
|
// endpoint, delegating to provider/openai. The id is passed through verbatim
|
|
// and selects which upstream llama-swap loads.
|
|
func (p *Provider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) {
|
|
if p.baseURL == "" {
|
|
return nil, fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name)
|
|
}
|
|
return p.chatProvider().Model(id, opts...)
|
|
}
|
|
|
|
// chatProvider builds the OpenAI-compatible client for llama-swap's chat API.
|
|
// Why a placeholder key when token is empty: the openai client treats a blank
|
|
// key as a synthetic 401, but a local llama-swap may require no auth at all —
|
|
// a bearer it ignores is harmless. Why legacy max_tokens: llama.cpp's OpenAI
|
|
// shim honors "max_tokens", not "max_completion_tokens".
|
|
func (p *Provider) chatProvider() *openai.Provider {
|
|
key := p.token
|
|
if key == "" {
|
|
key = "no-key"
|
|
}
|
|
return openai.New(
|
|
openai.WithName(p.name),
|
|
openai.WithBaseURL(p.baseURL+"/v1"),
|
|
openai.WithAPIKey(key),
|
|
openai.WithLegacyMaxTokens(),
|
|
openai.WithHTTPClient(p.client),
|
|
)
|
|
}
|
|
|
|
// --- management endpoints ---
|
|
|
|
// ModelInfo describes a single entry of llama-swap's GET /v1/models response,
// which follows the OpenAI model-list shape. Any additional fields llama-swap
// includes are ignored when decoding.
type ModelInfo struct {
	// ID is decoded from the "id" key.
	ID string `json:"id"`
	// Object is decoded from the "object" key.
	Object string `json:"object"`
	// OwnedBy is decoded from the "owned_by" key.
	OwnedBy string `json:"owned_by"`
}
|
|
|
|
// ListModels returns the models llama-swap is configured to serve (GET
|
|
// /v1/models). Unlisted models are excluded by llama-swap itself.
|
|
func (p *Provider) ListModels(ctx context.Context) ([]ModelInfo, error) {
|
|
var out struct {
|
|
Data []ModelInfo `json:"data"`
|
|
}
|
|
if err := p.doJSON(ctx, http.MethodGet, "/v1/models", "", nil, &out); err != nil {
|
|
return nil, err
|
|
}
|
|
return out.Data, nil
|
|
}
|
|
|
|
// Running returns llama-swap's currently-loaded models as the raw GET /running
|
|
// payload. Why raw: llama-swap's /running shape is not a stable, OpenAI-style
|
|
// contract, so this exposes the endpoint without pinning a schema this package
|
|
// would have to guess.
|
|
func (p *Provider) Running(ctx context.Context) (json.RawMessage, error) {
|
|
var out json.RawMessage
|
|
if err := p.doJSON(ctx, http.MethodGet, "/running", "", nil, &out); err != nil {
|
|
return nil, err
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// Unload unloads a running model to free its resources (POST
|
|
// /api/models/unload/:model). An empty model unloads all running models (POST
|
|
// /api/models/unload).
|
|
func (p *Provider) Unload(ctx context.Context, model string) error {
|
|
path := "/api/models/unload"
|
|
if model != "" {
|
|
// Why reject rather than percent-escape: llama-swap model ids legitimately
|
|
// contain ":" (e.g. "qwen3:14b"), which is path-legal and must reach the
|
|
// server verbatim; only path-structure characters are dangerous (they'd
|
|
// redirect the request to another endpoint), and those never appear in a
|
|
// real model id.
|
|
if strings.ContainsAny(model, "/?#") {
|
|
return fmt.Errorf("llama-swap: invalid model id %q for unload (contains a path separator)", model)
|
|
}
|
|
path += "/" + model
|
|
}
|
|
return p.doJSON(ctx, http.MethodPost, path, "", nil, nil)
|
|
}
|
|
|
|
// --- shared HTTP helper for management + image endpoints ---
|
|
|
|
// doJSON performs a request to a llama-swap endpoint relative to baseURL,
|
|
// optionally encoding body and decoding into out (either may be nil). model
|
|
// labels the failing target in any *llm.APIError ("" for endpoints that aren't
|
|
// model-specific). Transport failures are wrapped raw so llm.Classify still
|
|
// sees the underlying net error; non-2xx responses become *llm.APIError.
|
|
func (p *Provider) doJSON(ctx context.Context, method, path, model string, body, out any) error {
|
|
if p.baseURL == "" {
|
|
return fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name)
|
|
}
|
|
var rdr io.Reader
|
|
if body != nil {
|
|
b, err := json.Marshal(body)
|
|
if err != nil {
|
|
return fmt.Errorf("llama-swap: encode request: %w", err)
|
|
}
|
|
rdr = bytes.NewReader(b)
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, method, p.baseURL+path, rdr)
|
|
if err != nil {
|
|
return fmt.Errorf("llama-swap: build request: %w", err)
|
|
}
|
|
if body != nil {
|
|
req.Header.Set("Content-Type", "application/json")
|
|
}
|
|
if p.token != "" {
|
|
req.Header.Set("Authorization", "Bearer "+p.token)
|
|
}
|
|
resp, err := p.client.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("llama-swap: do request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode/100 != 2 {
|
|
return p.apiError(resp, model)
|
|
}
|
|
if out != nil {
|
|
if err := json.NewDecoder(io.LimitReader(resp.Body, maxResponseBytes)).Decode(out); err != nil {
|
|
return fmt.Errorf("llama-swap: decode response: %w", err)
|
|
}
|
|
} else {
|
|
// Drain (bounded) so the connection can be reused.
|
|
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxResponseBytes))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// apiError converts a non-2xx response into *llm.APIError, tolerating the
|
|
// OpenAI {"error":{"message",...}} envelope, the Ollama-style {"error":"..."}
|
|
// string form, and a raw body.
|
|
func (p *Provider) apiError(resp *http.Response, model string) error {
|
|
e := &llm.APIError{Provider: p.name, Model: model, Status: resp.StatusCode}
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
|
|
|
|
var env struct {
|
|
Error json.RawMessage `json:"error"`
|
|
}
|
|
if json.Unmarshal(body, &env) == nil && len(env.Error) > 0 {
|
|
var obj struct {
|
|
Message string `json:"message"`
|
|
Type string `json:"type"`
|
|
Code string `json:"code"`
|
|
}
|
|
if json.Unmarshal(env.Error, &obj) == nil && (obj.Message != "" || obj.Code != "" || obj.Type != "") {
|
|
e.Message = obj.Message
|
|
e.Code = obj.Code
|
|
if e.Code == "" {
|
|
e.Code = obj.Type
|
|
}
|
|
return e
|
|
}
|
|
var msg string
|
|
if json.Unmarshal(env.Error, &msg) == nil && msg != "" {
|
|
e.Message = msg
|
|
return e
|
|
}
|
|
}
|
|
e.Message = strings.TrimSpace(string(body))
|
|
return e
|
|
}
|