proxy,ui-svelte: improve support for v1/messages and v1/responses (#758)

This improves support for activity logging from the v1/responses and v1/messages endpoints. Changes: (1) add chat endpoint selection to Playground > Chat > Settings; (2) improve metrics extraction for streaming v1/messages and v1/responses endpoints (tested with llama-server). Fixes #742
2026-05-14 21:53:57 -07:00
parent aac7b8745a
commit fe71e8a6ea
4 changed files with 537 additions and 119 deletions
@@ -424,110 +424,159 @@ func (mp *metricsMonitor) wrapHandler(
 	return nil
 }
 // usagePaths lists the JSON paths, relative to one decoded event, where a
 // usage object can live:
 //   - "usage":          v1/chat/completions, and v1/messages on message_delta
 //   - "response.usage": v1/responses
 //   - "message.usage":  v1/messages on message_start
 var usagePaths = []string{"usage", "response.usage", "message.usage"}
 // extractUsageTokens pulls the input, output and cached token counts out of a
 // usage object, accepting the field names used by each supported endpoint.
 // For every count the first field that exists wins:
 //   - input:  "prompt_tokens" (v1/chat/completions), then "input_tokens"
 //             (v1/messages, v1/responses)
 //   - output: "completion_tokens" (v1/chat/completions), then "output_tokens"
 //             (v1/messages, v1/responses)
 //   - cached: "cache_read_input_tokens" (v1/messages), then
 //             "input_tokens_details.cached_tokens" (v1/responses), then
 //             "prompt_tokens_details.cached_tokens" (v1/chat/completions)
 //
 // cached is -1 when none of its fields is present. ok reports whether at
 // least one recognised field was found.
 func extractUsageTokens(usage gjson.Result) (input, output, cached int64, ok bool) {
 	cached = -1
 	if !usage.Exists() {
 		return input, output, cached, ok
 	}
 	// firstPresent returns the integer at the first path that exists in usage.
 	firstPresent := func(paths ...string) (int64, bool) {
 		for _, path := range paths {
 			if field := usage.Get(path); field.Exists() {
 				return field.Int(), true
 			}
 		}
 		return 0, false
 	}
 	if n, found := firstPresent("prompt_tokens", "input_tokens"); found {
 		input, ok = n, true
 	}
 	if n, found := firstPresent("completion_tokens", "output_tokens"); found {
 		output, ok = n, true
 	}
 	if n, found := firstPresent(
 		"cache_read_input_tokens",
 		"input_tokens_details.cached_tokens",
 		"prompt_tokens_details.cached_tokens",
 	); found {
 		cached, ok = n, true
 	}
 	return input, output, cached, ok
 }
 func processStreamingResponse(modelID string, start time.Time, body []byte) (ActivityLogEntry, error) {
-	// Iterate **backwards** through the body looking for the data payload with
+	// Walk SSE "data:" lines forward, merging usage info from every event.
-	// usage data. This avoids allocating a slice of all lines via bytes.Split.
+	// Different endpoints split usage across events:
 	//   - v1/chat/completions: usage on the final chunk before [DONE]
 	//   - v1/responses:        usage on response.completed (response.usage)
 	//   - v1/messages:         input + cache on message_start (message.usage),
 	//                          output_tokens on message_delta (usage)
 	// We take the latest informative value per field so all three are covered.
-	// Start from the end of the body and scan backwards for newlines
+	var (
-	pos := len(body)
+		inputTokens, outputTokens int64
-	for pos > 0 {
+		cachedTokens              int64 = -1
-		// Find the previous newline (or start of body)
+		hasAny                    bool
-		lineStart := bytes.LastIndexByte(body[:pos], '\n')
+		timings                   gjson.Result
-		if lineStart == -1 {
+	)
 			lineStart = 0
 		} else {
 			lineStart++ // Move past the newline
 		}
 		line := bytes.TrimSpace(body[lineStart:pos])
 		pos = lineStart - 1 // Move position before the newline for next iteration
 		if len(line) == 0 {
 			continue
 		}
 		// SSE payload always follows "data:"
 	prefix := []byte("data:")
-		if !bytes.HasPrefix(line, prefix) {
+	for offset := 0; offset < len(body); {
 		nl := bytes.IndexByte(body[offset:], '\n')
 		var line []byte
 		if nl == -1 {
 			line = body[offset:]
 			offset = len(body)
 		} else {
 			line = body[offset : offset+nl]
 			offset += nl + 1
 		}
 		line = bytes.TrimSpace(line)
 		if len(line) == 0 || !bytes.HasPrefix(line, prefix) {
 			continue
 		}
 		data := bytes.TrimSpace(line[len(prefix):])
-
+		if len(data) == 0 || bytes.Equal(data, []byte("[DONE]")) {
 		if len(data) == 0 {
 			continue
 		}
-
+		if !gjson.ValidBytes(data) {
 		if bytes.Equal(data, []byte("[DONE]")) {
 			// [DONE] line itself contains nothing of interest.
 			continue
 		}
 		if gjson.ValidBytes(data) {
 		parsed := gjson.ParseBytes(data)
 			usage := parsed.Get("usage")
 			timings := parsed.Get("timings")
-			// v1/responses format nests usage under response.usage
+		for _, path := range usagePaths {
-			if !usage.Exists() {
+			u := parsed.Get(path)
-				usage = parsed.Get("response.usage")
+			if !u.Exists() {
 				continue
 			}
-
+			i, o, c, ok := extractUsageTokens(u)
-			if usage.Exists() || timings.Exists() {
+			if !ok {
-				return parseMetrics(modelID, start, usage, timings)
+				continue
 			}
 			hasAny = true
 			// Take the latest non-zero value so message_start's input_tokens
 			// is preserved when message_delta's usage omits it, and vice versa
 			// for output_tokens.
 			if i > 0 {
 				inputTokens = i
 			}
 			if o > 0 {
 				outputTokens = o
 			}
 			if c >= 0 {
 				cachedTokens = c
 			}
 		}
 		if t := parsed.Get("timings"); t.Exists() {
 			timings = t
 			hasAny = true
 		}
 	}
 	if !hasAny {
 		return ActivityLogEntry{}, fmt.Errorf("no valid JSON data found in stream")
 	}
 	return buildMetrics(modelID, start, inputTokens, outputTokens, cachedTokens, timings), nil
 }
 // parseMetrics builds an ActivityLogEntry from a single usage object plus
 // optional timings. Whether any usage field was found is deliberately
 // ignored, so an absent usage object yields zero input/output counts and a
 // cached count of -1. The returned error is always nil.
 func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result) (ActivityLogEntry, error) {
 	inTokens, outTokens, cacheTokens, _ := extractUsageTokens(usage)
 	entry := buildMetrics(modelID, start, inTokens, outTokens, cacheTokens, timings)
 	return entry, nil
 }
 // buildMetrics composes an ActivityLogEntry from accumulated token counts and
 // optional llama-server timings (which override input/output and provide rates).
 func buildMetrics(modelID string, start time.Time, inputTokens, outputTokens, cachedTokens int64, timings gjson.Result) ActivityLogEntry {
 	wallDurationMs := int(time.Since(start).Milliseconds())
-
+	durationMs := wallDurationMs
 	// default values
 	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
 	inputTokens := 0
 	// timings data
 	tokensPerSecond := -1.0
 	promptPerSecond := -1.0
 	durationMs := wallDurationMs
 	if usage.Exists() {
 		if pt := usage.Get("prompt_tokens"); pt.Exists() {
 			// v1/chat/completions
 			inputTokens = int(pt.Int())
 		} else if it := usage.Get("input_tokens"); it.Exists() {
 			// v1/messages
 			inputTokens = int(it.Int())
 		}
 		if ct := usage.Get("completion_tokens"); ct.Exists() {
 			// v1/chat/completions
 			outputTokens = int(ct.Int())
 		} else if ot := usage.Get("output_tokens"); ot.Exists() {
 			outputTokens = int(ot.Int())
 		}
 		if ct := usage.Get("cache_read_input_tokens"); ct.Exists() {
 			cachedTokens = int(ct.Int())
 		}
 	}
 	// use llama-server's timing data for tok/sec and duration as it is more accurate
 	if timings.Exists() {
-		inputTokens = int(timings.Get("prompt_n").Int())
+		inputTokens = timings.Get("prompt_n").Int()
-		outputTokens = int(timings.Get("predicted_n").Int())
+		outputTokens = timings.Get("predicted_n").Int()
 		promptPerSecond = timings.Get("prompt_per_second").Float()
 		tokensPerSecond = timings.Get("predicted_per_second").Float()
 		timingsDurationMs := int(timings.Get("prompt_ms").Float() + timings.Get("predicted_ms").Float())
 		if timingsDurationMs > durationMs {
 			durationMs = timingsDurationMs
 		}
 		if cachedValue := timings.Get("cache_n"); cachedValue.Exists() {
-			cachedTokens = int(cachedValue.Int())
+			cachedTokens = cachedValue.Int()
 		}
 	}
@@ -535,14 +584,14 @@ func parseMetrics(modelID string, start time.Time, usage, timings gjson.Result)
 		Timestamp: time.Now(),
 		Model:     modelID,
 		Tokens: TokenMetrics{
-			CachedTokens:    cachedTokens,
+			CachedTokens:    int(cachedTokens),
-			InputTokens:     inputTokens,
+			InputTokens:     int(inputTokens),
-			OutputTokens:    outputTokens,
+			OutputTokens:    int(outputTokens),
 			PromptPerSecond: promptPerSecond,
 			TokensPerSecond: tokensPerSecond,
 		},
 		DurationMs: durationMs,
-	}, nil
+	}
 }
 // decompressBody decompresses the body based on Content-Encoding header
@@ -777,6 +777,124 @@ data: [DONE]
 		assert.Equal(t, 23, metrics[0].Tokens.OutputTokens)
 	})
 	t.Run("v1/responses full stream with deltas, output, and cached tokens", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)
 		// Realistic v1/responses stream: multiple delta events followed by
 		// done/completed events. Usage lives on response.completed and includes
 		// the OpenAI Responses cached-token shape (input_tokens_details.cached_tokens).
 		responseBody := "event: response.created\n" +
 			`data: {"type":"response.created","response":{"id":"resp_1","status":"in_progress"}}` + "\n\n" +
 			"event: response.output_item.added\n" +
 			`data: {"type":"response.output_item.added","item":{"id":"msg_1","role":"assistant","status":"in_progress","type":"message"}}` + "\n\n" +
 			"event: response.content_part.added\n" +
 			`data: {"type":"response.content_part.added","item_id":"msg_1","part":{"type":"output_text","text":""}}` + "\n\n" +
 			"event: response.output_text.delta\n" +
 			`data: {"type":"response.output_text.delta","item_id":"msg_1","delta":"Hello"}` + "\n\n" +
 			"event: response.output_text.delta\n" +
 			`data: {"type":"response.output_text.delta","item_id":"msg_1","delta":" world"}` + "\n\n" +
 			"event: response.output_text.done\n" +
 			`data: {"type":"response.output_text.done","item_id":"msg_1","text":"Hello world"}` + "\n\n" +
 			"event: response.content_part.done\n" +
 			`data: {"type":"response.content_part.done","item_id":"msg_1","part":{"type":"output_text","text":"Hello world"}}` + "\n\n" +
 			"event: response.output_item.done\n" +
 			`data: {"type":"response.output_item.done","item":{"type":"message","status":"completed","id":"msg_1","content":[{"type":"output_text","text":"Hello world"}],"role":"assistant"}}` + "\n\n" +
 			"event: response.completed\n" +
 			`data: {"type":"response.completed","response":{"id":"resp_1","object":"response","status":"completed","model":"test-model","output":[{"type":"message","status":"completed","id":"msg_1","content":[{"type":"output_text","text":"Hello world"}],"role":"assistant"}],"usage":{"input_tokens":14,"output_tokens":24,"total_tokens":38,"input_tokens_details":{"cached_tokens":13}}}}` + "\n\n"
 		nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
 			w.Header().Set("Content-Type", "text/event-stream")
 			w.WriteHeader(http.StatusOK)
 			w.Write([]byte(responseBody))
 			return nil
 		}
 		req := httptest.NewRequest("POST", "/v1/responses", nil)
 		rec := httptest.NewRecorder()
 		ginCtx, _ := gin.CreateTestContext(rec)
 		err := mm.wrapHandler("test-model", ginCtx.Writer, req, captureAll, nextHandler)
 		assert.NoError(t, err)
 		metrics := mm.getMetrics()
 		assert.Equal(t, 1, len(metrics))
 		assert.Equal(t, "test-model", metrics[0].Model)
 		assert.Equal(t, 14, metrics[0].Tokens.InputTokens)
 		assert.Equal(t, 24, metrics[0].Tokens.OutputTokens)
 		assert.Equal(t, 13, metrics[0].Tokens.CachedTokens)
 	})
 	t.Run("v1/messages merges usage from message_start and message_delta", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)
 		// v1/messages splits usage across two events:
 		//   message_start.message.usage has input_tokens + cache_read_input_tokens
 		//   message_delta.usage has the final output_tokens
 		// Without merging, output_tokens (last seen) would clobber the input fields.
 		responseBody := "event: message_start\n" +
 			`data: {"type":"message_start","message":{"id":"m1","type":"message","role":"assistant","content":[],"model":"test-model","usage":{"cache_read_input_tokens":5,"input_tokens":9,"output_tokens":0}}}` + "\n\n" +
 			"event: content_block_start\n" +
 			`data: {"type":"content_block_start","index":0,"content_block":{"type":"text","text":""}}` + "\n\n" +
 			"event: content_block_delta\n" +
 			`data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hi"}}` + "\n\n" +
 			"event: content_block_delta\n" +
 			`data: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" there"}}` + "\n\n" +
 			"event: content_block_stop\n" +
 			`data: {"type":"content_block_stop","index":0}` + "\n\n" +
 			"event: message_delta\n" +
 			`data: {"type":"message_delta","delta":{"stop_reason":"end_turn"},"usage":{"output_tokens":24}}` + "\n\n" +
 			"event: message_stop\n" +
 			`data: {"type":"message_stop"}` + "\n\n"
 		nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
 			w.Header().Set("Content-Type", "text/event-stream")
 			w.WriteHeader(http.StatusOK)
 			w.Write([]byte(responseBody))
 			return nil
 		}
 		req := httptest.NewRequest("POST", "/v1/messages", nil)
 		rec := httptest.NewRecorder()
 		ginCtx, _ := gin.CreateTestContext(rec)
 		err := mm.wrapHandler("test-model", ginCtx.Writer, req, captureAll, nextHandler)
 		assert.NoError(t, err)
 		metrics := mm.getMetrics()
 		assert.Equal(t, 1, len(metrics))
 		assert.Equal(t, 9, metrics[0].Tokens.InputTokens)
 		assert.Equal(t, 24, metrics[0].Tokens.OutputTokens)
 		assert.Equal(t, 5, metrics[0].Tokens.CachedTokens)
 	})
 	t.Run("v1/chat/completions OpenAI prompt_tokens_details.cached_tokens", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)
 		responseBody := `data: {"choices":[{"delta":{"content":"hi"}}]}` + "\n\n" +
 			`data: {"choices":[{"delta":{}}],"usage":{"prompt_tokens":50,"completion_tokens":12,"prompt_tokens_details":{"cached_tokens":42}}}` + "\n\n" +
 			"data: [DONE]\n\n"
 		nextHandler := func(modelID string, w http.ResponseWriter, r *http.Request) error {
 			w.Header().Set("Content-Type", "text/event-stream")
 			w.WriteHeader(http.StatusOK)
 			w.Write([]byte(responseBody))
 			return nil
 		}
 		req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
 		rec := httptest.NewRecorder()
 		ginCtx, _ := gin.CreateTestContext(rec)
 		err := mm.wrapHandler("test-model", ginCtx.Writer, req, captureAll, nextHandler)
 		assert.NoError(t, err)
 		metrics := mm.getMetrics()
 		assert.Equal(t, 1, len(metrics))
 		assert.Equal(t, 50, metrics[0].Tokens.InputTokens)
 		assert.Equal(t, 12, metrics[0].Tokens.OutputTokens)
 		assert.Equal(t, 42, metrics[0].Tokens.CachedTokens)
 	})
 	t.Run("handles empty streaming response records minimal metrics", func(t *testing.T) {
 		mm := newMetricsMonitor(testLogger, 10, 0)
@@ -1,7 +1,7 @@
 <script lang="ts">
  import { models } from "../../stores/api";
  import { persistentStore } from "../../stores/persistent";
-  import { streamChatCompletion } from "../../lib/chatApi";
+  import { streamChatCompletion, type Endpoint } from "../../lib/chatApi";
  import { playgroundStores } from "../../stores/playgroundActivity";
  import type { ChatMessage, ContentPart } from "../../lib/types";
  import ChatMessageComponent from "./ChatMessage.svelte";
@@ -11,6 +11,8 @@
  const selectedModelStore = persistentStore<string>("playground-selected-model", "");
  const systemPromptStore = persistentStore<string>("playground-system-prompt", "");
  const temperatureStore = persistentStore<number>("playground-temperature", 0.7);
  const endpointStore = persistentStore<Endpoint>("playground-endpoint", "v1/chat/completions");
  const maxTokensStore = persistentStore<number>("playground-max-tokens", 4096);
  function loadMessages(): ChatMessage[] {
    try {
@@ -142,7 +144,7 @@
        $selectedModelStore,
        apiMessages,
        abortController.signal,
-        { temperature: $temperatureStore }
+        { temperature: $temperatureStore, endpoint: $endpointStore, max_tokens: $maxTokensStore }
      );
      for await (const chunk of stream) {
@@ -319,6 +321,19 @@
  <!-- Settings panel -->
  {#if showSettings}
    <div class="shrink-0 mb-4 p-4 bg-surface border border-gray-200 dark:border-white/10 rounded">
      <div class="mb-4">
        <label class="block text-sm font-medium mb-1" for="endpoint">Endpoint</label>
        <select
          id="endpoint"
          class="w-full px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-card focus:outline-none focus:ring-2 focus:ring-primary"
          bind:value={$endpointStore}
          disabled={isStreaming}
        >
          <option value="v1/chat/completions">/v1/chat/completions</option>
          <option value="v1/messages">/v1/messages</option>
          <option value="v1/responses">/v1/responses</option>
        </select>
      </div>
      <div class="mb-4">
        <label class="block text-sm font-medium mb-1" for="system-prompt">System Prompt</label>
        <textarea
@@ -330,7 +345,7 @@
          disabled={isStreaming}
        ></textarea>
      </div>
-      <div>
+      <div class="mb-4">
        <label class="block text-sm font-medium mb-1" for="temperature">
          Temperature: {$temperatureStore.toFixed(2)}
        </label>
@@ -349,6 +364,18 @@
          <span>Creative (2)</span>
        </div>
      </div>
      <div>
        <label class="block text-sm font-medium mb-1" for="max-tokens">Max Tokens</label>
        <input
          id="max-tokens"
          type="number"
          min="1"
          class="w-full px-3 py-2 rounded border border-gray-200 dark:border-white/10 bg-card focus:outline-none focus:ring-2 focus:ring-primary"
          bind:value={$maxTokensStore}
          disabled={isStreaming}
        />
        <p class="text-xs text-txtsecondary mt-1">Required for /v1/messages.</p>
      </div>
    </div>
  {/if}
@@ -1,4 +1,6 @@
-import type { ChatMessage, ChatCompletionRequest } from "./types";
+import type { ChatMessage, ContentPart } from "./types";
 // API endpoints the playground can stream from. The value is the request
 // path without its leading slash.
 export type Endpoint = "v1/chat/completions" | "v1/messages" | "v1/responses";
 export interface StreamChunk {
  content: string;
@@ -8,9 +10,126 @@ export interface StreamChunk {
 // Optional per-request settings accepted by streamChatCompletion.
 export interface ChatOptions {
   // Sampling temperature forwarded in the request body.
   temperature?: number;
   // Endpoint to call; streamChatCompletion uses "v1/chat/completions" when omitted.
   endpoint?: Endpoint;
   // Cap on generated tokens. Sent as max_tokens to v1/chat/completions and
   // v1/messages, and as max_output_tokens to v1/responses. The v1/messages
   // body falls back to 4096 when this is unset.
   max_tokens?: number;
 }
-function parseSSELine(line: string): StreamChunk | null {
+function parseDataUrl(url: string): { media_type: string; data: string } {
  const match = /^data:([^;]+);base64,(.*)$/i.exec(url);
  if (!match) {
    throw new Error("Image is not a base64 data URL");
  }
  return { media_type: match[1], data: match[2] };
 }
 // Separates system messages from the rest of the conversation. Text from
 // every system message (string content, or the text parts of multi-part
 // content) is joined with blank lines into `system`; all other messages are
 // returned unchanged, in order, as `rest`.
 function splitSystemMessages(messages: ChatMessage[]): { system: string; rest: ChatMessage[] } {
   const systemTexts: string[] = [];
   const others: ChatMessage[] = [];
   messages.forEach((message) => {
     if (message.role !== "system") {
       others.push(message);
       return;
     }
     if (typeof message.content === "string") {
       systemTexts.push(message.content);
       return;
     }
     // Non-text parts of a system message are dropped.
     message.content.forEach((part) => {
       if (part.type === "text") systemTexts.push(part.text);
     });
   });
   return { system: systemTexts.join("\n\n"), rest: others };
 }
 // Builds the streaming request body for /v1/chat/completions. Messages are
 // reduced to their role and content; max_tokens is only included when a
 // truthy value was supplied.
 function buildChatCompletionsBody(model: string, messages: ChatMessage[], options?: ChatOptions): object {
   const body: Record<string, unknown> = {
     model,
     messages: messages.map(({ role, content }) => ({ role, content })),
     stream: true,
     temperature: options?.temperature,
   };
   const limit = options?.max_tokens;
   if (limit) {
     body.max_tokens = limit;
   }
   return body;
 }
 // Builds the streaming request body for /v1/messages.
 //
 // System messages are lifted into the top-level `system` field. String
 // content is passed through; multi-part content becomes text blocks and
 // base64 image blocks (parseDataUrl throws when an image is not a base64
 // data URL). Image parts on assistant messages are skipped.
 //
 // max_tokens is always sent. It falls back to 4096 unless the supplied
 // value is a finite number >= 1, so a cleared or zeroed input field cannot
 // produce an invalid request; fractional values are rounded down.
 function buildMessagesBody(model: string, messages: ChatMessage[], options?: ChatOptions): object {
   const defaultMaxTokens = 4096;
   const { system, rest } = splitSystemMessages(messages);
   const mapped = rest.map((m) => {
     if (typeof m.content === "string") {
       return { role: m.role, content: m.content };
     }
     const blocks: object[] = [];
     for (const part of m.content as ContentPart[]) {
       if (part.type === "text") {
         blocks.push({ type: "text", text: part.text });
       } else if (m.role !== "assistant") {
         const { media_type, data } = parseDataUrl(part.image_url.url);
         blocks.push({ type: "image", source: { type: "base64", media_type, data } });
       }
     }
     return { role: m.role, content: blocks };
   });
   const requested = options?.max_tokens;
   const maxTokens =
     typeof requested === "number" && Number.isFinite(requested) && requested >= 1
       ? Math.floor(requested)
       : defaultMaxTokens;
   const body: Record<string, unknown> = {
     model,
     messages: mapped,
     stream: true,
     max_tokens: maxTokens,
   };
   if (system) body.system = system;
   if (options?.temperature !== undefined) body.temperature = options.temperature;
   return body;
 }
 // Builds the streaming request body for /v1/responses.
 //
 // System messages become `instructions`. Every remaining message is sent
 // with a content array: text is tagged "output_text" for assistant messages
 // and "input_text" otherwise, and image parts become "input_image" entries
 // carrying the original URL. temperature is included when defined and
 // max_output_tokens when max_tokens is truthy.
 function buildResponsesBody(model: string, messages: ChatMessage[], options?: ChatOptions): object {
   const { system, rest } = splitSystemMessages(messages);
   const input = rest.map((message) => {
     const textType = message.role === "assistant" ? "output_text" : "input_text";
     const asText = (text: string) => ({ type: textType, text });
     const content =
       typeof message.content === "string"
         ? [asText(message.content)]
         : message.content.map((part: ContentPart) =>
             part.type === "text" ? asText(part.text) : { type: "input_image", image_url: part.image_url.url }
           );
     return { role: message.role, content };
   });
   const temperature = options?.temperature;
   const maxTokens = options?.max_tokens;
   const body: Record<string, unknown> = {
     model,
     input,
     stream: true,
     ...(system ? { instructions: system } : {}),
     ...(temperature !== undefined ? { temperature } : {}),
     ...(maxTokens ? { max_output_tokens: maxTokens } : {}),
   };
   return body;
 }
 // Resolves the request URL ("/" + endpoint) and the endpoint-specific JSON
 // body. Any endpoint other than v1/messages or v1/responses gets the
 // chat-completions body.
 function buildRequest(
   endpoint: Endpoint,
   model: string,
   messages: ChatMessage[],
   options?: ChatOptions
 ): { url: string; body: object } {
   const url = `/${endpoint}`;
   if (endpoint === "v1/messages") {
     return { url, body: buildMessagesBody(model, messages, options) };
   }
   if (endpoint === "v1/responses") {
     return { url, body: buildResponsesBody(model, messages, options) };
   }
   return { url, body: buildChatCompletionsBody(model, messages, options) };
 }
 function parseChatCompletionsLine(line: string): StreamChunk | null {
  const trimmed = line.trim();
  if (!trimmed || !trimmed.startsWith("data: ")) {
    return null;
@@ -36,25 +155,158 @@ function parseSSELine(line: string): StreamChunk | null {
  }
 }
 // Reads a chat-completions SSE stream line by line and yields each chunk
 // produced by parseChatCompletionsLine. Stops right after yielding a chunk
 // marked done. Text left over when the reader finishes is parsed once more
 // and yielded unless it is a done chunk.
 async function* parseChatCompletionsStream(
   reader: ReadableStreamDefaultReader<Uint8Array>
 ): AsyncGenerator<StreamChunk> {
   const decoder = new TextDecoder();
   let pending = "";
   for (let step = await reader.read(); !step.done; step = await reader.read()) {
     pending += decoder.decode(step.value, { stream: true });
     // Everything before the last newline is complete; the rest stays buffered.
     const cut = pending.lastIndexOf("\n");
     if (cut === -1) continue;
     const complete = pending.slice(0, cut).split("\n");
     pending = pending.slice(cut + 1);
     for (const line of complete) {
       const chunk = parseChatCompletionsLine(line);
       if (!chunk) continue;
       yield chunk;
       if (chunk.done) return;
     }
   }
   const tail = parseChatCompletionsLine(pending);
   if (tail && !tail.done) {
     yield tail;
   }
 }
 // Parses one SSE event block into its event name and data payload.
 // A trailing "\r" is stripped from each line; empty lines, comment lines
 // (starting with ":") and fields other than "event"/"data" are ignored.
 // The last "event:" line wins and "data:" lines are joined with "\n".
 // Returns null when the block has neither an event name nor any data line.
 function parseSSEEventBlock(block: string): { event: string; data: string } | null {
   let eventName = "";
   const payload: string[] = [];
   block.split("\n").forEach((raw) => {
     const line = raw.replace(/\r$/, "");
     const colon = line.indexOf(":");
     // No colon, or a colon in first position (comment): nothing to read.
     if (colon <= 0) return;
     const field = line.slice(0, colon);
     const value = line.slice(colon + 1).trim();
     if (field === "event") {
       eventName = value;
     } else if (field === "data") {
       payload.push(value);
     }
   });
   if (eventName || payload.length > 0) {
     return { event: eventName, data: payload.join("\n") };
   }
   return null;
 }
 // Reads a /v1/messages SSE stream and yields text and thinking deltas.
 //
 // Only "content_block_delta" events produce content: text_delta becomes
 // `content`, thinking_delta becomes `reasoning_content`. A "message_stop"
 // event yields a done chunk and ends the generator. Malformed JSON payloads
 // are ignored.
 //
 // Events are split on a blank line using either LF or CRLF line endings,
 // and whatever remains buffered when the reader finishes is parsed too, so
 // a final event without a trailing blank line is not lost.
 async function* parseMessagesStream(
   reader: ReadableStreamDefaultReader<Uint8Array>
 ): AsyncGenerator<StreamChunk> {
   const decoder = new TextDecoder();
   let buffer = "";
   // Converts one event block into a chunk, or null when it has nothing to emit.
   const toChunk = (block: string): StreamChunk | null => {
     const parsed = parseSSEEventBlock(block);
     if (!parsed) return null;
     if (parsed.event === "message_stop") {
       return { content: "", done: true };
     }
     if (parsed.event !== "content_block_delta" || !parsed.data) return null;
     try {
       const delta = JSON.parse(parsed.data).delta;
       if (!delta) return null;
       if (delta.type === "text_delta" && delta.text) {
         return { content: delta.text, done: false };
       }
       if (delta.type === "thinking_delta" && delta.thinking) {
         return { content: "", reasoning_content: delta.thinking, done: false };
       }
     } catch {
       // ignore malformed event
     }
     return null;
   };
   while (true) {
     const { done, value } = await reader.read();
     if (done) break;
     buffer += decoder.decode(value, { stream: true });
     const blocks = buffer.split(/\r?\n\r?\n/);
     buffer = blocks.pop() || "";
     for (const block of blocks) {
       const chunk = toChunk(block);
       if (!chunk) continue;
       yield chunk;
       if (chunk.done) return;
     }
   }
   // Flush bytes still held by the decoder, then handle an unterminated final event.
   buffer += decoder.decode();
   const last = buffer.trim() ? toChunk(buffer) : null;
   if (last) {
     yield last;
   }
 }
 // Reads a /v1/responses SSE stream and yields text and reasoning deltas.
 //
 // "response.output_text.delta" becomes `content` and
 // "response.reasoning_summary_text.delta" becomes `reasoning_content`.
 // A "response.completed" event yields a done chunk and ends the generator.
 // Malformed JSON payloads are ignored.
 //
 // Events are split on a blank line using either LF or CRLF line endings,
 // and whatever remains buffered when the reader finishes is parsed too, so
 // a final event without a trailing blank line is not lost.
 async function* parseResponsesStream(
   reader: ReadableStreamDefaultReader<Uint8Array>
 ): AsyncGenerator<StreamChunk> {
   const decoder = new TextDecoder();
   let buffer = "";
   // Converts one event block into a chunk, or null when it has nothing to emit.
   const toChunk = (block: string): StreamChunk | null => {
     const parsed = parseSSEEventBlock(block);
     if (!parsed) return null;
     if (parsed.event === "response.completed") {
       return { content: "", done: true };
     }
     if (!parsed.data) return null;
     try {
       const json = JSON.parse(parsed.data);
       if (parsed.event === "response.output_text.delta" && json.delta) {
         return { content: json.delta, done: false };
       }
       if (parsed.event === "response.reasoning_summary_text.delta" && json.delta) {
         return { content: "", reasoning_content: json.delta, done: false };
       }
     } catch {
       // ignore malformed event
     }
     return null;
   };
   while (true) {
     const { done, value } = await reader.read();
     if (done) break;
     buffer += decoder.decode(value, { stream: true });
     const blocks = buffer.split(/\r?\n\r?\n/);
     buffer = blocks.pop() || "";
     for (const block of blocks) {
       const chunk = toChunk(block);
       if (!chunk) continue;
       yield chunk;
       if (chunk.done) return;
     }
   }
   // Flush bytes still held by the decoder, then handle an unterminated final event.
   buffer += decoder.decode();
   const last = buffer.trim() ? toChunk(buffer) : null;
   if (last) {
     yield last;
   }
 }
 // Picks the stream parser matching the endpoint. Anything other than
 // v1/messages or v1/responses is parsed as a chat-completions stream.
 function parseStream(
   endpoint: Endpoint,
   reader: ReadableStreamDefaultReader<Uint8Array>
 ): AsyncGenerator<StreamChunk> {
   if (endpoint === "v1/messages") {
     return parseMessagesStream(reader);
   }
   if (endpoint === "v1/responses") {
     return parseResponsesStream(reader);
   }
   return parseChatCompletionsStream(reader);
 }
 export async function* streamChatCompletion(
  model: string,
  messages: ChatMessage[],
  signal?: AbortSignal,
  options?: ChatOptions
 ): AsyncGenerator<StreamChunk> {
-  const request: ChatCompletionRequest = {
+  const endpoint = options?.endpoint ?? "v1/chat/completions";
-    model,
+  const { url, body } = buildRequest(endpoint, model, messages, options);
    messages,
    stream: true,
    temperature: options?.temperature,
  };
-  const response = await fetch("/v1/chat/completions", {
+  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
-    body: JSON.stringify(request),
+    body: JSON.stringify(body),
    signal,
  });
@@ -68,39 +320,11 @@ export async function* streamChatCompletion(
    throw new Error("Response body is not readable");
  }
  const decoder = new TextDecoder();
  let buffer = "";
  try {
-    while (true) {
+    for await (const chunk of parseStream(endpoint, reader)) {
-      const { done, value } = await reader.read();
+      yield chunk;
-
+      if (chunk.done) return;
      if (done) {
        break;
    }
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";
      for (const line of lines) {
        const result = parseSSELine(line);
        if (result?.done) {
          yield result;
          return;
        }
        if (result) {
          yield result;
        }
      }
    }
    // Process any remaining buffer
    const result = parseSSELine(buffer);
    if (result && !result.done) {
      yield result;
    }
    yield { content: "", done: true };
  } finally {
    reader.releaseLock();