proxy: add support for anthropic v1/messages api (#417)

* proxy: add support for anthropic v1/messages api
* proxy: restrict loading message to /v1/chat/completions
This commit is contained in:
Benson Wong
2025-11-29 22:09:07 -08:00
committed by GitHub
parent a883d68d4f
commit c968da1b73
2 changed files with 17 additions and 12 deletions
+4 -1
View File
@@ -507,7 +507,10 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
// add a sync so the streaming client only runs when the goroutine has exited // add a sync so the streaming client only runs when the goroutine has exited
isStreaming, _ := r.Context().Value(proxyCtxKey("streaming")).(bool) isStreaming, _ := r.Context().Value(proxyCtxKey("streaming")).(bool)
if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming {
// PR #417 (no support for anthropic v1/messages yet)
isChatCompletions := strings.HasPrefix(r.URL.Path, "/v1/chat/completions")
if p.config.SendLoadingState != nil && *p.config.SendLoadingState && isStreaming && isChatCompletions {
srw = newStatusResponseWriter(p, w) srw = newStatusResponseWriter(p, w)
go srw.statusUpdates(swapCtx) go srw.statusUpdates(swapCtx)
} else { } else {
+13 -11
View File
@@ -236,27 +236,29 @@ func (pm *ProxyManager) setupGinEngine() {
}) })
// Set up routes using the Gin engine // Set up routes using the Gin engine
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/chat/completions", pm.proxyInferenceHandler)
// Support legacy /v1/completions api, see issue #12 // Support legacy /v1/completions api, see issue #12
pm.ginEngine.POST("/v1/completions", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/completions", pm.proxyInferenceHandler)
// Support anthropic /v1/messages (added https://github.com/ggml-org/llama.cpp/pull/17570)
pm.ginEngine.POST("/v1/messages", pm.proxyInferenceHandler)
// Support embeddings and reranking // Support embeddings and reranking
pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/embeddings", pm.proxyInferenceHandler)
// llama-server's /reranking endpoint + aliases // llama-server's /reranking endpoint + aliases
pm.ginEngine.POST("/reranking", pm.proxyOAIHandler) pm.ginEngine.POST("/reranking", pm.proxyInferenceHandler)
pm.ginEngine.POST("/rerank", pm.proxyOAIHandler) pm.ginEngine.POST("/rerank", pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/rerank", pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/reranking", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/reranking", pm.proxyInferenceHandler)
// llama-server's /infill endpoint for code infilling // llama-server's /infill endpoint for code infilling
pm.ginEngine.POST("/infill", pm.proxyOAIHandler) pm.ginEngine.POST("/infill", pm.proxyInferenceHandler)
// llama-server's /completion endpoint // llama-server's /completion endpoint
pm.ginEngine.POST("/completion", pm.proxyOAIHandler) pm.ginEngine.POST("/completion", pm.proxyInferenceHandler)
// Support audio/speech endpoint // Support audio/speech endpoint
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler) pm.ginEngine.POST("/v1/audio/speech", pm.proxyInferenceHandler)
pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler) pm.ginEngine.POST("/v1/audio/transcriptions", pm.proxyOAIPostFormHandler)
pm.ginEngine.GET("/v1/models", pm.listModelsHandler) pm.ginEngine.GET("/v1/models", pm.listModelsHandler)
@@ -545,7 +547,7 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
} }
} }
func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) { func (pm *ProxyManager) proxyInferenceHandler(c *gin.Context) {
bodyBytes, err := io.ReadAll(c.Request.Body) bodyBytes, err := io.ReadAll(c.Request.Body)
if err != nil { if err != nil {
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body") pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")