Add /upstream endpoint (#30 )

* remove catch-all route to upstream proxy (it was broken anyways) * add /upstream/:model_id to swap and route to upstream path * add /upstream HTML endpoint and unlisted option * add /upstream endpoint to show a list of available models * add `unlisted` configuration option to omit a model from /v1/models and /upstream lists * add favicon.ico
fix bad logging due to wrong []byte used #28
2024-12-17 14:37:44 -08:00 · 2024-12-16 16:22:14 -08:00 · 2024-12-16 12:30:31 -08:00 · 2024-12-16 12:29:25 -08:00 · 2024-12-14 15:46:10 -08:00
12 changed files with 102 additions and 41 deletions
@@ -3,3 +3,4 @@
 build/
 dist/
 .vscode
+.DS_Store
@@ -9,9 +9,6 @@ ifneq ($(shell git status --porcelain),)
    GIT_HASH := $(GIT_HASH)+
 endif

-# Get the build number from the commit count on the main branch
-COMMIT_COUNT := $(shell git rev-list --count HEAD)
-
 # Capture the current build date in RFC3339 format
 BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")

@@ -31,12 +28,12 @@ test-all:
 # Build OSX binary
 mac:
 	@echo "Building Mac binary..."
-	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
+	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64

 # Build Linux binary
 linux:
 	@echo "Building Linux binary..."
-	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64

 # for testing proxy.Process
 simple-responder:
@@ -55,9 +52,12 @@ release:
 		echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
 		exit 1; \
 	fi
-	@echo "Creating release tag v$(COMMIT_COUNT)..."
-	git tag v$(COMMIT_COUNT)
-	git push origin v$(COMMIT_COUNT)
+
+# Get the highest tag in v{number} format, increment it, and create a new tag
+	@highest_tag=$$(git tag --sort=-v:refname | grep -E '^v[0-9]+$$' | head -n 1 || echo "v0"); \
+	new_tag="v$$(( $${highest_tag#v} + 1 ))"; \
+	echo "tagging new version: $$new_tag"; \
+	git tag "$$new_tag";

 # Phony targets
 .PHONY: all clean osx linux
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:

 - ✅ Easy to deploy: single binary with no dependencies
- ✅ Single yaml configuration file
+- ✅ Easy to config: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`

 ## Releases

@@ -73,6 +74,12 @@ models:
      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:8999

+  # unlisted models do not show up in /v1/models or /upstream lists
+  # but they can still be requested as normal
+  "qwen-unlisted":
+    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+    unlisted: true
+
 # profiles make it easy to managing multi model (and gpu) configurations.
 #
 # Tips:
@@ -33,6 +33,7 @@ models:
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
+    unlisted: true

    # use "none" to skip check. Caution this may cause some requests to fail
    # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
+    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
+    unlisted: true

 # creating a coding profile with models for code generation and general questions
 profiles:
@@ -22,7 +22,7 @@ func main() {
 	flag.Parse() // Parse the command-line flags

 	if *showVersion {
-		fmt.Printf("version: v%s (%s), built at %s\n", version, commit, date)
+		fmt.Printf("version: %s (%s), built at %s\n", version, commit, date)
 		os.Exit(0)
 	}

@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
 }

 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
@@ -46,7 +46,7 @@ func (w *LogMonitor) Write(p []byte) (n int, err error) {
 	w.buffer = w.buffer.Next()
 	w.bufferMu.Unlock()

-	w.broadcast(p)
+	w.broadcast(bufferCopy)
 	return n, nil
 }

@@ -125,11 +125,15 @@ func (p *Process) start() error {
 			maxDuration := time.Duration(p.config.UnloadAfter) * time.Second

 			for range time.Tick(time.Second) {
+				if p.state != StateReady {
+					return
+				}
+
 				// wait for all inflight requests to complete and ticker
 				p.inFlightRequests.Wait()

 				if time.Since(p.lastRequestHandled) > maxDuration {
-					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter)
+					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
 					p.Stop()
 				}
 			}
@@ -162,25 +166,25 @@ func (p *Process) Stop() {
 	// will be a source of pain in the future.

 	p.cmd.Process.Signal(syscall.SIGTERM)
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()

-	done := make(chan error, 1)
+	sigtermNormal := make(chan error, 1)
 	go func() {
-		done <- p.cmd.Wait()
+		sigtermNormal <- p.cmd.Wait()
 	}()

 	select {
-	case <-ctx.Done():
-		fmt.Printf("!!! process for %s timed out waiting to stop\n", p.ID)
+	case <-sigtermTimeout.Done():
+		fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
 		p.cmd.Process.Kill()
 		p.cmd.Wait()
-	case err := <-done:
+	case err := <-sigtermNormal:
 		if err != nil {
 			if err.Error() != "wait: no child processes" {
 				// possible that simple-responder for testing is just not
 				// existing right, so suppress those errors.
-				fmt.Printf("!!! process for %s stopped with error > %v\n", p.ID, err)
+				fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
 			}
 		}
 	}
@@ -2,10 +2,12 @@ package proxy

 import (
 	"bytes"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )

+//go:embed html/favicon.ico
+var faviconData []byte
+
+//go:embed html/logs.html
+var logsHTML []byte
+
+// make sure embed is kept there by the IDE auto-package importer
+var _ = embed.FS{}
+
 type ProxyManager struct {
 	sync.Mutex

@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)

-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
+	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+
+	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
+		c.Data(http.StatusOK, "image/x-icon", faviconData)
+	})

 	// Disable console color for testing
 	gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {

 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
 		data = append(data, map[string]interface{}{
 			"id":       id,
 			"object":   "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()

-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }

+func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
+	requestedModel := c.Param("model_id")
+
+	if requestedModel == "" {
+		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
+		return
+	}
+
+	if process, err := pm.swapModel(requestedModel); err != nil {
+		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
+	} else {
+		// rewrite the path
+		c.Request.URL.Path = c.Param("upstreamPath")
+		process.ProxyRequest(c.Writer, c.Request)
+	}
+}
+
+func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
+	var html strings.Builder
+
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+
+	// Extract keys and sort them
+	var modelIDs []string
+	for modelID, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
+		modelIDs = append(modelIDs, modelID)
+	}
+	sort.Strings(modelIDs)
+
+	// Iterate over sorted keys
+	for _, modelID := range modelIDs {
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+	}
+	html.WriteString("</ul></body></html>")
+	c.Header("Content-Type", "text/html")
+	c.String(http.StatusOK, html.String())
+}
+
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }

-func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
-	// since maps are unordered, just use the first available process if one exists
-	for _, process := range pm.currentProcesses {
-		process.ProxyRequest(c.Writer, c.Request)
-		return
-	}
-
-	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
-}
-
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }
@@ -1,7 +1,6 @@
 package proxy

 import (
-	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )

-//go:embed html/logs.html
-var logsHTML []byte
-
-// make sure embed is kept there by the IDE auto-package importer
-var _ = embed.FS{}
-
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {

 	accept := c.GetHeader("Accept")
Author	SHA1	Message	Date
Benson Wong	891f6a5b5a	Add /upstream endpoint (#30 ) * remove catch-all route to upstream proxy (it was broken anyways) * add /upstream/:model_id to swap and route to upstream path * add /upstream HTML endpoint and unlisted option * add /upstream endpoint to show a list of available models * add `unlisted` configuration option to omit a model from /v1/models and /upstream lists * add favicon.ico	2024-12-17 14:37:44 -08:00
Benson Wong	7183f6b43d	fix bad logging due to wrong []byte used #28	2024-12-16 16:22:14 -08:00
Benson Wong	d89bfeb441	add .DS_Store to .gitignore	2024-12-16 12:30:31 -08:00
Benson Wong	9a0c6bed40	Improve stop exceptions (#28 ) (#29 ) Stop Process TTL goroutine when process is not ready (#28) - fix issue where the goroutine will continue even though the child process is no longer running and the Process' state is not Ready - fix issue where some logs were going to stdout instead of p.logMonitor causing them to not show up in the /logs - add units to unloading model message	2024-12-16 12:29:25 -08:00
Benson Wong	d6ca535939	tweak release tagging so it is not based on number of commits	2024-12-14 15:46:10 -08:00