feat(v2/ollama): implement native Stream() with NDJSON parsing

Reads Ollama's NDJSON stream (one JSON object per line) and emits
provider.StreamEvent values for text, thinking, tool-call start/delta/end,
and a final Done event carrying assembled Response and Usage. Uses
bufio.Scanner with a 4 MiB max-line buffer so multi-KB tool-call deltas
parse cleanly, and accepts tool-call arguments delivered either as
escaped string fragments (delta-style) or a complete JSON object
(one-shot).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-01 18:29:04 +00:00
parent 583f8724b2
commit f70c7c0842
2 changed files with 427 additions and 2 deletions
+201 -2
View File
@@ -5,6 +5,7 @@
package ollama
import (
"bufio"
"bytes"
"context"
"encoding/base64"
@@ -168,9 +169,207 @@ func (p *Provider) Complete(ctx context.Context, req provider.Request) (provider
}
// Stream performs a streaming chat completion via /api/chat with
// `stream: true`, parsing NDJSON line-by-line. Tool-call argument deltas are
// accumulated across chunks keyed by id (or function index) and finalized
// when the upstream Done flag arrives. The events channel is always closed
// before Stream returns, whether it succeeds or fails.
func (p *Provider) Stream(ctx context.Context, req provider.Request, events chan<- provider.StreamEvent) error {
	defer close(events)
	body, err := p.buildChatRequest(req, true)
	if err != nil {
		return err
	}
	httpResp, err := p.doChatRequest(ctx, body)
	if err != nil {
		return err
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
		// Error bodies are small; drain them fully for the message.
		b, _ := io.ReadAll(httpResp.Body)
		return fmt.Errorf("ollama: HTTP %d: %s", httpResp.StatusCode, string(b))
	}
	scanner := bufio.NewScanner(httpResp.Body)
	// Ollama can emit multi-KB lines on tool-call deltas. Generous buffer.
	const maxLineSize = 4 * 1024 * 1024
	scanner.Buffer(make([]byte, 0, 64*1024), maxLineSize)
	// toolAcc accumulates one tool call's fragments across stream chunks.
	type toolAcc struct {
		id    string
		name  string
		args  strings.Builder
		index int // ToolIndex emitted on stream events
	}
	tools := map[string]*toolAcc{}       // keyed by streamToolKey
	var toolOrder []*toolAcc             // preserves first-seen order for finalization
	var (
		fullText     strings.Builder
		fullThinking strings.Builder
		usage        *provider.Usage
		streamErr    error
	)
	for scanner.Scan() {
		line := scanner.Bytes()
		if len(bytes.TrimSpace(line)) == 0 {
			continue // NDJSON streams may contain keep-alive blank lines
		}
		var chunk nativeChatResponse
		if err := json.Unmarshal(line, &chunk); err != nil {
			streamErr = fmt.Errorf("ollama: decode stream chunk: %w", err)
			break
		}
		if chunk.Message.Thinking != "" {
			fullThinking.WriteString(chunk.Message.Thinking)
			events <- provider.StreamEvent{
				Type: provider.StreamEventThinking,
				Text: chunk.Message.Thinking,
			}
		}
		if chunk.Message.Content != "" {
			fullText.WriteString(chunk.Message.Content)
			events <- provider.StreamEvent{
				Type: provider.StreamEventText,
				Text: chunk.Message.Content,
			}
		}
		for pos, tc := range chunk.Message.ToolCalls {
			key := streamToolKey(tc, pos)
			acc, exists := tools[key]
			if !exists {
				// First sighting of this tool call: register it and
				// announce a ToolStart carrying id and (possibly empty) name.
				acc = &toolAcc{
					id:    tc.ID,
					name:  tc.Function.Name,
					index: len(toolOrder),
				}
				if acc.id == "" {
					// Synthesize a stable id when the wire omits one.
					acc.id = fmt.Sprintf("tc_%d", acc.index)
				}
				tools[key] = acc
				toolOrder = append(toolOrder, acc)
				events <- provider.StreamEvent{
					Type:      provider.StreamEventToolStart,
					ToolIndex: acc.index,
					ToolCall: &provider.ToolCall{
						ID:   acc.id,
						Name: acc.name,
					},
				}
			} else {
				// Continuation chunk may carry the tool's name late; capture it.
				if tc.Function.Name != "" && acc.name == "" {
					acc.name = tc.Function.Name
				}
			}
			delta := decodeArgumentDelta(tc.Function.Arguments)
			if delta != "" {
				acc.args.WriteString(delta)
				events <- provider.StreamEvent{
					Type:      provider.StreamEventToolDelta,
					ToolIndex: acc.index,
					ToolCall: &provider.ToolCall{
						Arguments: delta,
					},
				}
			}
		}
		if chunk.Done {
			// Final chunk carries token accounting; zero counts mean
			// the server did not report usage, so leave usage nil.
			if chunk.PromptEvalCount > 0 || chunk.EvalCount > 0 {
				usage = &provider.Usage{
					InputTokens:  chunk.PromptEvalCount,
					OutputTokens: chunk.EvalCount,
					TotalTokens:  chunk.PromptEvalCount + chunk.EvalCount,
				}
			}
			break
		}
	}
	if err := scanner.Err(); err != nil && streamErr == nil {
		streamErr = fmt.Errorf("ollama: stream read: %w", err)
	}
	if streamErr != nil {
		events <- provider.StreamEvent{
			Type:  provider.StreamEventError,
			Error: streamErr,
		}
		return streamErr
	}
	// Finalize accumulated tool calls in first-seen order, emitting one
	// ToolEnd per call with the fully assembled arguments.
	finalCalls := make([]provider.ToolCall, 0, len(toolOrder))
	for _, acc := range toolOrder {
		args := acc.args.String()
		if args == "" {
			args = "{}" // callers expect valid JSON even for no-arg tools
		}
		final := provider.ToolCall{
			ID:        acc.id,
			Name:      acc.name,
			Arguments: args,
		}
		finalCalls = append(finalCalls, final)
		events <- provider.StreamEvent{
			Type:      provider.StreamEventToolEnd,
			ToolIndex: acc.index,
			ToolCall:  &final,
		}
	}
	events <- provider.StreamEvent{
		Type: provider.StreamEventDone,
		Response: &provider.Response{
			Text:      fullText.String(),
			Thinking:  fullThinking.String(),
			ToolCalls: finalCalls,
			Usage:     usage,
		},
	}
	return nil
}
// streamToolKey returns a stable key used to correlate one tool call's
// chunks across the NDJSON stream. The wire-level id takes precedence;
// failing that, the function index; failing both, the call's position
// within the chunk's tool_calls array (a single-tool stream collapses
// cleanly under any of the three strategies).
func streamToolKey(tc nativeToolCall, position int) string {
	switch {
	case tc.ID != "":
		return "id:" + tc.ID
	case tc.Function.Index != nil:
		return fmt.Sprintf("idx:%d", *tc.Function.Index)
	default:
		return fmt.Sprintf("pos:%d", position)
	}
}
// decodeArgumentDelta returns the string fragment to append when a streamed
// tool-call chunk includes arguments. Ollama may emit arguments either as a
// JSON-encoded string fragment (chunk-by-chunk concatenation, openaicompat
// style) or as a complete object value (one-shot delivery). We accept both:
// strings are unwrapped, objects/arrays pass through verbatim.
func decodeArgumentDelta(raw json.RawMessage) string {
if len(raw) == 0 {
return ""
}
trimmed := bytes.TrimSpace(raw)
if len(trimmed) == 0 || string(trimmed) == "null" {
return ""
}
if trimmed[0] == '"' {
var s string
if err := json.Unmarshal(trimmed, &s); err == nil {
return s
}
}
return string(trimmed)
}
// buildChatRequest converts a provider.Request into the native wire body