// Package llamaswap implements majordomo's provider contract for llama-swap // (https://github.com/mostlygeek/llama-swap), an on-demand model-swapping // proxy that fronts llama.cpp (and stable-diffusion.cpp) servers, loading and // hot-swapping the requested model per request. // // Chat is OpenAI Chat Completions, byte-for-byte: this package does NOT carry // its own chat wire client. Provider.Model delegates to provider/openai // pointed at {baseURL}/v1 (ADR-0007: reuse, don't duplicate). What this // package adds beyond a bare OpenAI-compat endpoint is the "tailored" surface: // // - llama-swap management endpoints exposed as concrete methods — ListModels // (GET /v1/models), Running (GET /running), Unload (POST /api/models/unload) // — which have no place on the canonical llm.Provider interface; // - image generation via the imagegen interface (see image.go); and // - swap-aware defaults: the HTTP client carries NO timeout, because the // first request to an unloaded model blocks while llama-swap spawns the // upstream (its healthCheckTimeout is at least 15s). Bound a call with a // context deadline, never a client timeout. // // DSN form (registered as the "llama-swap" scheme): llama-swap://token@host:port // builds an http:// base URL (llama-swap is local-first; a TLS-fronted instance // can use the openai:// scheme for chat instead). package llamaswap import ( "bytes" "context" "encoding/json" "fmt" "io" "net/http" "strings" "gitea.stevedudenhoeffer.com/steve/majordomo/llm" "gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai" ) // DefaultName is the registry name used when WithName is not given. const DefaultName = "llama-swap" // Provider is a llama-swap client. It satisfies llm.Provider (chat, delegated // to provider/openai) and imagegen.Provider (image generation), and exposes // llama-swap's management endpoints as concrete methods. type Provider struct { name string baseURL string // no trailing slash, no /v1 suffix; e.g. "http://host:port" token string // bearer credential; empty = no auth (local) client *http.Client } // Option configures the provider. type Option func(*Provider) // WithName overrides the registry name (default "llama-swap"). func WithName(name string) Option { return func(p *Provider) { p.name = name } } // WithBaseURL sets the llama-swap base URL (scheme://host[:port]); the /v1 and // management paths are appended internally. A trailing slash is trimmed. func WithBaseURL(u string) Option { return func(p *Provider) { p.baseURL = strings.TrimRight(u, "/") } } // WithToken sets the bearer token (llama-swap API key). Empty means no // Authorization header. func WithToken(token string) Option { return func(p *Provider) { p.token = token } } // WithHTTPClient overrides the HTTP client. Prefer context deadlines over a // client timeout: a cold model swap can legitimately take many seconds. func WithHTTPClient(c *http.Client) Option { return func(p *Provider) { if c != nil { p.client = c } } } // New creates a llama-swap provider. Construction never fails; a missing base // URL surfaces at request time. The default client has no timeout (swap cold // starts); bound calls with a context deadline. func New(opts ...Option) *Provider { p := &Provider{name: DefaultName, client: &http.Client{}} for _, opt := range opts { opt(p) } return p } // Name implements llm.Provider and imagegen.Provider. func (p *Provider) Name() string { return p.name } // BaseURL reports the configured base URL (diagnostics). func (p *Provider) BaseURL() string { return p.baseURL } // Model implements llm.Provider via llama-swap's OpenAI-compatible chat // endpoint, delegating to provider/openai. The id is passed through verbatim // and selects which upstream llama-swap loads. func (p *Provider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) { if p.baseURL == "" { return nil, fmt.Errorf("llama-swap provider %q: no base URL configured (set one via WithBaseURL or an LLM_* env DSN)", p.name) } return p.chatProvider().Model(id, opts...) } // chatProvider builds the OpenAI-compatible client for llama-swap's chat API. // Why a placeholder key when token is empty: the openai client treats a blank // key as a synthetic 401, but a local llama-swap may require no auth at all — // a bearer it ignores is harmless. Why legacy max_tokens: llama.cpp's OpenAI // shim honors "max_tokens", not "max_completion_tokens". func (p *Provider) chatProvider() *openai.Provider { key := p.token if key == "" { key = "no-key" } return openai.New( openai.WithName(p.name), openai.WithBaseURL(p.baseURL+"/v1"), openai.WithAPIKey(key), openai.WithLegacyMaxTokens(), openai.WithHTTPClient(p.client), ) } // --- management endpoints --- // ModelInfo is one entry from llama-swap's GET /v1/models (the OpenAI model // list shape). Fields llama-swap adds beyond these are ignored. type ModelInfo struct { ID string `json:"id"` Object string `json:"object"` OwnedBy string `json:"owned_by"` } // ListModels returns the models llama-swap is configured to serve (GET // /v1/models). Unlisted models are excluded by llama-swap itself. func (p *Provider) ListModels(ctx context.Context) ([]ModelInfo, error) { var out struct { Data []ModelInfo `json:"data"` } if err := p.doJSON(ctx, http.MethodGet, "/v1/models", nil, &out); err != nil { return nil, err } return out.Data, nil } // Running returns llama-swap's currently-loaded models as the raw GET /running // payload. Why raw: llama-swap's /running shape is not a stable, OpenAI-style // contract, so this exposes the endpoint without pinning a schema this package // would have to guess. func (p *Provider) Running(ctx context.Context) (json.RawMessage, error) { var out json.RawMessage if err := p.doJSON(ctx, http.MethodGet, "/running", nil, &out); err != nil { return nil, err } return out, nil } // Unload unloads a running model to free its resources (POST // /api/models/unload/:model). An empty model unloads all running models (POST // /api/models/unload). func (p *Provider) Unload(ctx context.Context, model string) error { path := "/api/models/unload" if model != "" { path += "/" + model } return p.doJSON(ctx, http.MethodPost, path, nil, nil) } // --- shared HTTP helper for management + image endpoints --- // doJSON performs a request to a llama-swap endpoint relative to baseURL, // optionally encoding body and decoding into out (either may be nil). Transport // failures are wrapped raw so llm.Classify still sees the underlying net error; // non-2xx responses become *llm.APIError. func (p *Provider) doJSON(ctx context.Context, method, path string, body, out any) error { var rdr io.Reader if body != nil { b, err := json.Marshal(body) if err != nil { return fmt.Errorf("llama-swap: encode request: %w", err) } rdr = bytes.NewReader(b) } req, err := http.NewRequestWithContext(ctx, method, p.baseURL+path, rdr) if err != nil { return fmt.Errorf("llama-swap: build request: %w", err) } if body != nil { req.Header.Set("Content-Type", "application/json") } if p.token != "" { req.Header.Set("Authorization", "Bearer "+p.token) } resp, err := p.client.Do(req) if err != nil { return fmt.Errorf("llama-swap: do request: %w", err) } defer resp.Body.Close() if resp.StatusCode/100 != 2 { return p.apiError(resp, "") } if out != nil { if err := json.NewDecoder(resp.Body).Decode(out); err != nil { return fmt.Errorf("llama-swap: decode response: %w", err) } } return nil } // apiError converts a non-2xx response into *llm.APIError, tolerating the // OpenAI {"error":{"message",...}} envelope, the Ollama-style {"error":"..."} // string form, and a raw body. func (p *Provider) apiError(resp *http.Response, model string) error { e := &llm.APIError{Provider: p.name, Model: model, Status: resp.StatusCode} body, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) var env struct { Error json.RawMessage `json:"error"` } if json.Unmarshal(body, &env) == nil && len(env.Error) > 0 { var obj struct { Message string `json:"message"` Type string `json:"type"` Code string `json:"code"` } if json.Unmarshal(env.Error, &obj) == nil && (obj.Message != "" || obj.Code != "" || obj.Type != "") { e.Message = obj.Message e.Code = obj.Code if e.Code == "" { e.Code = obj.Type } return e } var msg string if json.Unmarshal(env.Error, &msg) == nil && msg != "" { e.Message = msg return e } } e.Message = strings.TrimSpace(string(body)) return e }