majordomo/provider/ollama/wire.go

package ollama

import (
	"bytes"
	"context"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strconv"
	"strings"

	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)

// ---- wire types (field names per ollama api/types.go) ----

// chatRequest is the JSON payload that do marshals and POSTs to /api/chat.
// buildRequest fills it in: Model from the model id, Messages from the
// canonical conversation (system prompts folded into one leading message),
// Tools unless the tool choice is "none", Format from the request schema,
// Options from the sampling parameters (temperature, top_p, num_predict,
// stop) and Think from the reasoning effort.
type chatRequest struct {
	Model    string          `json:"model"`
	Messages []chatMessage   `json:"messages"`
	Tools    []toolDef       `json:"tools,omitempty"`
	Format   json.RawMessage `json:"format,omitempty"`
	Options  map[string]any  `json:"options,omitempty"`
	// Stream has no omitempty on purpose: the server default is true, so
	// Generate must send an explicit false.
	Stream bool `json:"stream"`
	// Think is bool-or-string on the wire ("low"/"medium"/"high" or a bool).
	Think json.RawMessage `json:"think,omitempty"`
}

// chatMessage is one entry of chatRequest.Messages. buildRequest emits it
// with Role "system" for the folded system prompt, Role "tool" (plus
// ToolName) for each tool result, and the canonical role string for every
// other message, which may additionally carry Images and ToolCalls.
type chatMessage struct {
	Role      string     `json:"role"`
	Content   string     `json:"content"`
	Images    []string   `json:"images,omitempty"` // raw base64, no data: prefix
	ToolCalls []toolCall `json:"tool_calls,omitempty"`
	ToolName  string     `json:"tool_name,omitempty"` // on role:"tool" results
}

// toolDef is one entry of chatRequest.Tools: a tool offered to the model.
// buildRequest always sets Type to "function".
type toolDef struct {
	Type     string      `json:"type"`
	Function toolDefFunc `json:"function"`
}

// toolDefFunc describes the function behind a toolDef. Parameters is passed
// through as raw JSON (presumably a JSON Schema object); buildRequest
// substitutes {"type":"object","properties":{}} when the canonical tool
// declares no parameters.
type toolDefFunc struct {
	Name        string          `json:"name"`
	Description string          `json:"description,omitempty"`
	Parameters  json.RawMessage `json:"parameters,omitempty"`
}

// toolCall is a tool invocation as it appears on the wire. The same shape is
// used in both directions: inside outgoing chatMessage.ToolCalls and inside
// the decoded respMessage.ToolCalls. ID is optional (omitted when empty);
// convertToolCalls synthesizes one for incoming calls that lack it.
type toolCall struct {
	ID       string       `json:"id,omitempty"`
	Function toolCallFunc `json:"function"`
}

// toolCallFunc names the function a toolCall invokes and carries its
// arguments. Index is omitted from the encoded JSON when it is zero; the
// code in this file never sets or reads it.
type toolCallFunc struct {
	Index int    `json:"index,omitempty"`
	Name  string `json:"name"`
	// Arguments is a JSON OBJECT on the wire (unlike OpenAI's string).
	Arguments json.RawMessage `json:"arguments"`
}

// chatResponse is the JSON object Generate decodes from a successful
// /api/chat reply. toResponse maps PromptEvalCount and EvalCount to the
// input/output token usage and DoneReason (together with the presence of
// tool calls) to the finish reason. Model and Done are decoded but not read
// by the non-streaming path shown here.
type chatResponse struct {
	Model           string      `json:"model"`
	Message         respMessage `json:"message"`
	Done            bool        `json:"done"`
	DoneReason      string      `json:"done_reason"`
	PromptEvalCount int         `json:"prompt_eval_count"`
	EvalCount       int         `json:"eval_count"`
}

// respMessage is the assistant message inside a chatResponse. toResponse
// turns a non-empty Content into a text part and converts ToolCalls via
// convertToolCalls. Role and Thinking are decoded but not consumed by
// toResponse.
type respMessage struct {
	Role      string     `json:"role"`
	Content   string     `json:"content"`
	Thinking  string     `json:"thinking"`
	ToolCalls []toolCall `json:"tool_calls"`
}

// errorBody is the JSON error shape do tries to decode from a non-2xx
// reply. When decoding fails or Error is empty, do falls back to the raw
// (trimmed) body text as the error message.
type errorBody struct {
	Error string `json:"error"`
}

// ---- model ----

// model binds one model id to the Provider that serves it. provider supplies
// the HTTP client, base URL, optional bearer token and provider name used by
// do and qualified; id is sent as the wire "model" field; caps is the
// capability set returned by Capabilities and checked by
// enforceCapabilities.
type model struct {
	provider *Provider
	id       string
	caps     llm.Capabilities
}

// Capabilities reports the capability set recorded for this model.
func (m *model) Capabilities() llm.Capabilities {
	return m.caps
}

// qualified returns the provider-scoped model name in "<provider>/<id>"
// form. It is used in error messages and as the Model of canonical
// responses.
func (m *model) qualified() string {
	prefix := m.provider.name + "/"
	return prefix + m.id
}

// enforceCapabilities rejects requests whose image inputs this model cannot
// accept. It is a backstop only: the media layer normalizes requests before
// they get here (see ADR-0009).
//
// Images are visited in message order. Each one is checked for image
// support, allowed MIME type and the per-image byte limit; the per-request
// image count is checked last, once every individual image has passed.
// Every failure wraps llm.ErrUnsupported. Limits of zero or less mean
// "no limit".
func (m *model) enforceCapabilities(req llm.Request) error {
	images := 0
	for _, msg := range req.Messages {
		for _, part := range msg.Parts {
			img, isImage := part.(llm.ImagePart)
			if !isImage {
				continue
			}
			images++
			switch {
			case !m.caps.SupportsImages():
				return fmt.Errorf("%w: %s does not accept image input", llm.ErrUnsupported, m.qualified())
			case !m.caps.MIMEAllowed(img.MIME):
				return fmt.Errorf("%w: %s does not accept %s images", llm.ErrUnsupported, m.qualified(), img.MIME)
			case m.caps.MaxImageBytes > 0 && len(img.Data) > m.caps.MaxImageBytes:
				return fmt.Errorf("%w: image of %d bytes exceeds %s limit of %d",
					llm.ErrUnsupported, len(img.Data), m.qualified(), m.caps.MaxImageBytes)
			}
		}
	}
	// A positive limit exceeded implies at least one image was seen.
	if limit := m.caps.MaxImagesPerReq; limit > 0 && images > limit {
		return fmt.Errorf("%w: %d images exceed %s limit of %d",
			llm.ErrUnsupported, images, m.qualified(), limit)
	}
	return nil
}

// buildRequest translates a canonical llm.Request into the /api/chat wire
// payload for this model; stream is copied verbatim into the Stream field.
//
// Mapping rules:
//   - req.System and the text of every RoleSystem message are joined with
//     blank lines into a single leading "system" message.
//   - A RoleTool message expands to one "tool" message per tool result;
//     results flagged as errors get an "ERROR: " prefix.
//   - Any other message keeps its role and text, with image parts attached
//     as base64 and tool calls carried along (empty arguments become "{}").
//   - Tools are dropped when ToolChoice is "none". Ollama has no
//     tool_choice, so "required"/named choices have no wire equivalent and
//     are best-effort ignored (documented in the README support matrix).
//   - Schema maps to format, the sampling parameters to options, and
//     ReasoningEffort to think.
//
// The only error returned is for a ReasoningEffort other than "", "low",
// "medium" or "high".
func (m *model) buildRequest(req llm.Request, stream bool) (*chatRequest, error) {
	wire := &chatRequest{Model: m.id, Stream: stream}

	// Gather every system prompt fragment: the dedicated field first, then
	// the RoleSystem messages in conversation order.
	var prompts []string
	if req.System != "" {
		prompts = append(prompts, req.System)
	}
	for _, msg := range req.Messages {
		if msg.Role != llm.RoleSystem {
			continue
		}
		if text := msg.Text(); text != "" {
			prompts = append(prompts, text)
		}
	}
	if len(prompts) > 0 {
		wire.Messages = append(wire.Messages, chatMessage{
			Role:    "system",
			Content: strings.Join(prompts, "\n\n"),
		})
	}

	for _, msg := range req.Messages {
		if msg.Role == llm.RoleSystem {
			// Folded into the leading system message above.
			continue
		}
		if msg.Role == llm.RoleTool {
			for _, res := range msg.ToolResults {
				entry := chatMessage{Role: "tool", Content: res.Content, ToolName: res.Name}
				if res.IsError {
					entry.Content = "ERROR: " + res.Content
				}
				wire.Messages = append(wire.Messages, entry)
			}
			continue
		}

		entry := chatMessage{Role: string(msg.Role), Content: msg.Text()}
		for _, part := range msg.Parts {
			img, isImage := part.(llm.ImagePart)
			if !isImage {
				continue
			}
			entry.Images = append(entry.Images, base64.StdEncoding.EncodeToString(img.Data))
		}
		for _, call := range msg.ToolCalls {
			payload := call.Arguments
			if len(payload) == 0 {
				payload = json.RawMessage("{}")
			}
			entry.ToolCalls = append(entry.ToolCalls, toolCall{
				ID:       call.ID,
				Function: toolCallFunc{Name: call.Name, Arguments: payload},
			})
		}
		wire.Messages = append(wire.Messages, entry)
	}

	// "none" is expressed by not advertising any tools at all.
	if req.ToolChoice != "none" && len(req.Tools) > 0 {
		wire.Tools = make([]toolDef, 0, len(req.Tools))
		for _, tool := range req.Tools {
			schema := tool.Parameters
			if len(schema) == 0 {
				schema = json.RawMessage(`{"type":"object","properties":{}}`)
			}
			wire.Tools = append(wire.Tools, toolDef{
				Type: "function",
				Function: toolDefFunc{
					Name:        tool.Name,
					Description: tool.Description,
					Parameters:  schema,
				},
			})
		}
	}

	if len(req.Schema) > 0 {
		wire.Format = req.Schema
	}

	// Only attach options when at least one sampling parameter was given.
	options := map[string]any{}
	if t := req.Temperature; t != nil {
		options["temperature"] = *t
	}
	if p := req.TopP; p != nil {
		options["top_p"] = *p
	}
	if req.MaxTokens > 0 {
		options["num_predict"] = req.MaxTokens
	}
	if len(req.StopSequences) > 0 {
		options["stop"] = req.StopSequences
	}
	if len(options) != 0 {
		wire.Options = options
	}

	switch effort := req.ReasoningEffort; effort {
	case "low", "medium", "high":
		// Encoded as a JSON string; think also accepts a bool on the wire.
		wire.Think = json.RawMessage(strconv.Quote(effort))
	case "":
		// No reasoning effort requested: leave think unset.
	default:
		return nil, fmt.Errorf("ollama: invalid reasoning effort %q (want low/medium/high)", effort)
	}

	return wire, nil
}

// do marshals wireReq, POSTs it to <baseURL>/api/chat and returns the raw
// HTTP response when the status is 2xx. In that case the caller owns the
// response and must close its Body.
//
// Any other status is converted into an *llm.APIError: up to 8 KiB of the
// body is read, the "error" field of a JSON error payload is preferred as
// the message, and the trimmed raw text is used otherwise. The body is
// closed before returning. Provider readiness, encoding, request
// construction and transport failures are returned as errors as well.
func (m *model) do(ctx context.Context, wireReq *chatRequest) (*http.Response, error) {
	prov := m.provider
	if err := prov.checkReady(); err != nil {
		return nil, err
	}

	payload, err := json.Marshal(wireReq)
	if err != nil {
		return nil, fmt.Errorf("ollama: encode request: %w", err)
	}

	endpoint := prov.baseURL + "/api/chat"
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(payload))
	if err != nil {
		return nil, fmt.Errorf("ollama: build request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/json")
	if prov.token != "" {
		httpReq.Header.Set("Authorization", "Bearer "+prov.token)
	}

	resp, err := prov.client.Do(httpReq)
	if err != nil {
		return nil, fmt.Errorf("ollama %s: do request: %w", m.qualified(), err)
	}
	if resp.StatusCode >= 200 && resp.StatusCode < 300 {
		return resp, nil
	}

	// Failure path: the body is consumed here, so close it before returning.
	defer resp.Body.Close()

	// Best effort: a short or unreadable body still yields an APIError,
	// just with a less specific (possibly empty) message.
	raw, _ := io.ReadAll(io.LimitReader(resp.Body, 8<<10))
	var parsed errorBody
	_ = json.Unmarshal(raw, &parsed)

	detail := parsed.Error
	if detail == "" {
		detail = strings.TrimSpace(string(raw))
	}
	return nil, &llm.APIError{
		Provider: prov.name,
		Model:    m.id,
		Status:   resp.StatusCode,
		Message:  detail,
	}
}

// Generate implements llm.Model. It applies opts to req, runs the
// capability backstop, sends one non-streaming /api/chat call and converts
// the single JSON reply into the canonical response.
func (m *model) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
	effective := req.Apply(opts...)
	if err := m.enforceCapabilities(effective); err != nil {
		return nil, err
	}

	// stream=false: the server answers with exactly one JSON object.
	payload, err := m.buildRequest(effective, false)
	if err != nil {
		return nil, err
	}

	httpResp, err := m.do(ctx, payload)
	if err != nil {
		return nil, err
	}
	defer httpResp.Body.Close()

	decoded := new(chatResponse)
	if err := json.NewDecoder(httpResp.Body).Decode(decoded); err != nil {
		return nil, fmt.Errorf("ollama %s: decode response: %w", m.qualified(), err)
	}
	return m.toResponse(decoded), nil
}

// toResponse converts a final wire chunk into the canonical response.
// Usage comes from the prompt/eval token counts, a non-empty message
// content becomes a single text part, tool calls go through
// convertToolCalls, and the finish reason is derived from done_reason plus
// whether any tool call is present. cr itself is kept as Raw.
func (m *model) toResponse(cr *chatResponse) *llm.Response {
	usage := llm.Usage{
		InputTokens:  cr.PromptEvalCount,
		OutputTokens: cr.EvalCount,
	}
	resp := &llm.Response{Model: m.qualified(), Usage: usage, Raw: cr}

	if text := cr.Message.Content; text != "" {
		resp.Parts = append(resp.Parts, llm.Text(text))
	}

	resp.ToolCalls = convertToolCalls(cr.Message.ToolCalls)
	calledTools := len(resp.ToolCalls) > 0
	resp.FinishReason = finishReason(cr.DoneReason, calledTools)
	return resp
}

// convertToolCalls maps wire tool calls onto canonical llm.ToolCall values.
//
// Ids are optional in Ollama's shape but required by our agent loop to
// match results to calls, so a missing id is synthesized as
// "call_<position>", where position is the call's index within calls.
//
// Arguments are normalized to "{}" when they are absent or the JSON literal
// null, so consumers never see an empty or null argument payload.
//
// Returns nil when calls is empty.
func convertToolCalls(calls []toolCall) []llm.ToolCall {
	if len(calls) == 0 {
		return nil
	}
	out := make([]llm.ToolCall, 0, len(calls))
	for i, tc := range calls {
		id := tc.ID
		if id == "" {
			id = "call_" + strconv.Itoa(i)
		}
		args := tc.Function.Arguments
		// encoding/json stores `"arguments": null` in a RawMessage as the
		// 4-byte literal null rather than leaving it empty, so a length
		// check alone does not catch it.
		if len(args) == 0 || string(args) == "null" {
			args = json.RawMessage("{}")
		}
		out = append(out, llm.ToolCall{ID: id, Name: tc.Function.Name, Arguments: args})
	}
	return out
}

// finishReason maps Ollama's done_reason onto the canonical finish reason.
// The presence of tool calls takes precedence over done_reason; otherwise
// "stop" or an empty reason means a normal stop, "length" means the output
// limit was hit, and anything else is reported as FinishOther.
func finishReason(doneReason string, hasToolCalls bool) llm.FinishReason {
	switch {
	case hasToolCalls:
		return llm.FinishToolCalls
	case doneReason == "" || doneReason == "stop":
		return llm.FinishStop
	case doneReason == "length":
		return llm.FinishLength
	}
	return llm.FinishOther
}