feat: OpenAI, Anthropic, and native-Ollama providers + media pipeline

Phase 3:
- provider/openai: Chat Completions for OpenAI + compat endpoints (SSE
  streaming with by-index tool-call assembly, response_format json_schema,
  legacy max_tokens option, reasoning_effort)
- provider/anthropic: Messages API (tool_use/tool_result, GA structured
  output via output_config.format, full SSE event parser, 529 transient)
- provider/ollama: one native /api/chat client behind the ollama,
  ollama-cloud, and foreman built-ins (presets; NDJSON streaming tolerant
  of foreman's buffered single-object responses; object tool arguments;
  format-schema structured output; think mapping)
- media/: capability normalization (sniff, downscale, transcode, byte
  ladder, ErrUnsupported), wired into the chain executor per target with
  penalty-free advance past incapable elements
- registry: real provider + scheme wiring, WithHTTPClient option, required
  env-foreman TLS chat round-trip test
- ADR-0009 multimodal strategy, ADR-0010 tools/structured mapping; README
  matrix + CLAUDE.md synced

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 12:58:08 +02:00
parent 323558ed72
commit 043249e0e1
31 changed files with 6194 additions and 74 deletions
+140
View File
@@ -0,0 +1,140 @@
package ollama
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"strconv"
"sync"
"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
)
// Stream implements llm.Model on top of Ollama's NDJSON streaming reply.
//
// Options are folded into the request, capabilities are enforced, and the
// wire request is built and sent; any failure along the way is returned
// as-is. The response body is handed to the returned stream, which is
// responsible for closing it. foreman's non-streaming degradation (one
// buffered JSON object) needs no special handling here: that single JSON
// line is parsed by the stream as the final chunk.
func (m *model) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
	// A single NDJSON line can be far larger than bufio's 64KB default
	// (thinking dumps, tool payloads, foreman's whole-response-as-one-line
	// degradation), so start at 64KB and allow growth up to 16MB.
	const (
		initialLineBuf = 64 << 10
		maxLineBuf     = 16 << 20
	)

	req = req.Apply(opts...)
	err := m.enforceCapabilities(req)
	if err != nil {
		return nil, err
	}

	payload, err := m.buildRequest(req, true)
	if err != nil {
		return nil, err
	}

	resp, err := m.do(ctx, payload)
	if err != nil {
		return nil, err
	}

	lines := bufio.NewScanner(resp.Body)
	lines.Buffer(make([]byte, initialLineBuf), maxLineBuf)

	out := &stream{
		model:   m,
		body:    resp.Body,
		scanner: lines,
	}
	return out, nil
}
// stream is the value returned by (*model).Stream. It reads the response
// body one line at a time, accumulates text and tool calls, and queues the
// events that Next hands out.
type stream struct {
	// Set at construction: the owning model (used for the qualified name in
	// errors and in the final response), the response body to close, and
	// the line scanner reading from that body.
	model   *model
	body    io.Closer
	scanner *bufio.Scanner

	// mu is taken by both Next and Close.
	mu sync.Mutex

	// closed records that Close already ran; finished records that the
	// final Response event has been queued.
	closed   bool
	finished bool

	// pending holds events that were produced but not yet returned by Next.
	pending []llm.StreamEvent

	// Accumulated across chunks and used to build the final Response.
	text       []byte
	toolCalls  []llm.ToolCall
	usage      llm.Usage
	doneReason string
}
// Next returns the next event of the stream: text deltas and tool calls in
// arrival order, then one final Response event, then io.EOF.
//
// Every non-empty line of the body is decoded as one chat chunk. A line that
// carries an "error" property — how Ollama reports a failure once streaming
// has already begun — ends the stream and is returned as an error instead of
// being silently skipped. If the body ends without a done chunk, the final
// Response is synthesized from whatever was accumulated so far.
//
// Read and decode failures are returned wrapped with the qualified model
// name. The mutex is held for the whole call, including the blocking read.
func (s *stream) Next() (llm.StreamEvent, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for {
		// Hand out queued events before reading any further input.
		if len(s.pending) > 0 {
			ev := s.pending[0]
			s.pending = s.pending[1:]
			return ev, nil
		}
		if s.finished {
			return llm.StreamEvent{}, io.EOF
		}
		if !s.scanner.Scan() {
			if err := s.scanner.Err(); err != nil {
				return llm.StreamEvent{}, fmt.Errorf("ollama %s: read stream: %w", s.model.qualified(), err)
			}
			// EOF without a done chunk: synthesize the final response from
			// what we accumulated rather than losing it.
			s.queueFinal()
			continue
		}
		line := s.scanner.Bytes()
		if len(line) == 0 {
			continue
		}
		var chunk chatResponse
		if err := json.Unmarshal(line, &chunk); err != nil {
			return llm.StreamEvent{}, fmt.Errorf("ollama %s: decode stream chunk: %w", s.model.qualified(), err)
		}

		// A failure after the response has started arrives as a JSON line
		// with an "error" property. Without this check the line would decode
		// into an empty chunk, and the EOF that follows would turn a failed
		// generation into a truncated "successful" response. Only lines
		// without payload are probed, so normal chunks are decoded once.
		if !chunk.Done && chunk.Message.Content == "" && len(chunk.Message.ToolCalls) == 0 {
			var failure struct {
				Error string `json:"error"`
			}
			// The line is already known to be valid JSON; a decode error
			// here only means "error" is not a string, which is treated as
			// "no error reported".
			if err := json.Unmarshal(line, &failure); err == nil && failure.Error != "" {
				// End the stream so later calls report io.EOF rather than
				// synthesizing a final response after the failure.
				s.finished = true
				return llm.StreamEvent{}, fmt.Errorf("ollama %s: stream error: %s", s.model.qualified(), failure.Error)
			}
		}

		if chunk.Message.Content != "" {
			s.text = append(s.text, chunk.Message.Content...)
			s.pending = append(s.pending, llm.StreamEvent{TextDelta: chunk.Message.Content})
		}
		// Tool calls arrive complete per chunk (no partial-argument deltas
		// in the native protocol).
		base := len(s.toolCalls)
		for i, tc := range chunk.Message.ToolCalls {
			id := tc.ID
			if id == "" {
				// No ID supplied: derive one from the call's position in
				// the whole stream so IDs stay unique across chunks.
				id = "call_" + strconv.Itoa(base+i)
			}
			args := tc.Function.Arguments
			if len(args) == 0 {
				args = json.RawMessage("{}")
			}
			call := llm.ToolCall{ID: id, Name: tc.Function.Name, Arguments: args}
			s.toolCalls = append(s.toolCalls, call)
			// The event points at this iteration's own copy, not at an
			// element of s.toolCalls: that slice keeps growing, so a pointer
			// into it would alias the stream's state only until the next
			// reallocation.
			s.pending = append(s.pending, llm.StreamEvent{ToolCall: &call})
		}
		if chunk.Done {
			s.usage = llm.Usage{InputTokens: chunk.PromptEvalCount, OutputTokens: chunk.EvalCount}
			s.doneReason = chunk.DoneReason
			s.queueFinal()
		}
	}
}
// queueFinal builds the closing Response from everything accumulated so far
// (text, tool calls, usage, done reason), queues it as the last pending
// event, and flags the stream as finished. It takes no lock itself; the
// visible caller, Next, already holds s.mu.
func (s *stream) queueFinal() {
	hasCalls := len(s.toolCalls) > 0

	final := &llm.Response{
		Model:        s.model.qualified(),
		Usage:        s.usage,
		FinishReason: finishReason(s.doneReason, hasCalls),
	}

	// Text is attached only when something was actually received.
	if len(s.text) > 0 {
		final.Parts = append(final.Parts, llm.Text(string(s.text)))
	}

	// The response gets its own slice, separate from the stream's.
	if hasCalls {
		calls := make([]llm.ToolCall, len(s.toolCalls))
		copy(calls, s.toolCalls)
		final.ToolCalls = calls
	}

	s.finished = true
	s.pending = append(s.pending, llm.StreamEvent{Response: final})
}
// Close closes the underlying response body. Only the first call reaches
// the body and returns its result; every later call is a no-op that
// returns nil.
func (s *stream) Close() error {
	s.mu.Lock()
	defer s.mu.Unlock()

	if !s.closed {
		s.closed = true
		return s.body.Close()
	}
	return nil
}