Add /upstream endpoint (#30 )

* remove catch-all route to upstream proxy (it was broken anyways) * add /upstream/:model_id to swap and route to upstream path * add /upstream HTML endpoint and unlisted option * add /upstream endpoint to show a list of available models * add `unlisted` configuration option to omit a model from /v1/models and /upstream lists * add favicon.ico
fix bad logging due to wrong []byte used #28
2024-12-17 14:37:44 -08:00 · 2024-12-16 16:22:14 -08:00 · 2024-12-16 12:30:31 -08:00 · 2024-12-16 12:29:25 -08:00 · 2024-12-14 15:46:10 -08:00
12 changed files with 102 additions and 41 deletions
@@ -2,4 +2,5 @@
 .env
 build/
 dist/
-.vscode
+.vscode
 .DS_Store
@@ -9,9 +9,6 @@ ifneq ($(shell git status --porcelain),)
    GIT_HASH := $(GIT_HASH)+
 endif
 # Get the build number from the commit count on the main branch
 COMMIT_COUNT := $(shell git rev-list --count HEAD)
 # Capture the current build date in RFC3339 format
 BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
@@ -31,12 +28,12 @@ test-all:
 # Build OSX binary
 mac:
 	@echo "Building Mac binary..."
-	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
+	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
 # Build Linux binary
 linux:
 	@echo "Building Linux binary..."
-	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
 # for testing proxy.Process
 simple-responder:
@@ -55,9 +52,12 @@ release:
 		echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
 		exit 1; \
 	fi
-	@echo "Creating release tag v$(COMMIT_COUNT)..."
+
-	git tag v$(COMMIT_COUNT)
+# Get the highest tag in v{number} format, increment it, and create a new tag
-	git push origin v$(COMMIT_COUNT)
+	@highest_tag=$$(git tag --sort=-v:refname | grep -E '^v[0-9]+$$' | head -n 1 || echo "v0"); \
 	new_tag="v$$(( $${highest_tag#v} + 1 ))"; \
 	echo "tagging new version: $$new_tag"; \
 	git tag "$$new_tag";
 # Phony targets
 .PHONY: all clean osx linux
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:
 - ✅ Easy to deploy: single binary with no dependencies
- ✅ Single yaml configuration file
+- ✅ Easy to config: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
 - ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
 ## Releases
@@ -73,6 +74,12 @@ models:
      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:8999
  # unlisted models do not show up in /v1/models or /upstream lists
  # but they can still be requested as normal
  "qwen-unlisted":
    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
    unlisted: true
 # profiles make it easy to managing multi model (and gpu) configurations.
 #
 # Tips:
@@ -33,6 +33,7 @@ models:
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true
    # use "none" to skip check. Caution this may cause some requests to fail
    # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true
 # creating a coding profile with models for code generation and general questions
 profiles:
@@ -22,7 +22,7 @@ func main() {
 	flag.Parse() // Parse the command-line flags
 	if *showVersion {
-		fmt.Printf("version: v%s (%s), built at %s\n", version, commit, date)
+		fmt.Printf("version: %s (%s), built at %s\n", version, commit, date)
 		os.Exit(0)
 	}
@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
 	Unlisted      bool     `yaml:"unlisted"`
 }
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
@@ -46,7 +46,7 @@ func (w *LogMonitor) Write(p []byte) (n int, err error) {
 	w.buffer = w.buffer.Next()
 	w.bufferMu.Unlock()
-	w.broadcast(p)
+	w.broadcast(bufferCopy)
 	return n, nil
 }
@@ -125,11 +125,15 @@ func (p *Process) start() error {
 			maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
 			for range time.Tick(time.Second) {
 				if p.state != StateReady {
 					return
 				}
 				// wait for all inflight requests to complete and ticker
 				p.inFlightRequests.Wait()
 				if time.Since(p.lastRequestHandled) > maxDuration {
-					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter)
+					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
 					p.Stop()
 				}
 			}
@@ -162,25 +166,25 @@ func (p *Process) Stop() {
 	// will be a source of pain in the future.
 	p.cmd.Process.Signal(syscall.SIGTERM)
-	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
-	done := make(chan error, 1)
+	sigtermNormal := make(chan error, 1)
 	go func() {
-		done <- p.cmd.Wait()
+		sigtermNormal <- p.cmd.Wait()
 	}()
 	select {
-	case <-ctx.Done():
+	case <-sigtermTimeout.Done():
-		fmt.Printf("!!! process for %s timed out waiting to stop\n", p.ID)
+		fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
 		p.cmd.Process.Kill()
 		p.cmd.Wait()
-	case err := <-done:
+	case err := <-sigtermNormal:
 		if err != nil {
 			if err.Error() != "wait: no child processes" {
 				// possible that simple-responder for testing is just not
 				// existing right, so suppress those errors.
-				fmt.Printf("!!! process for %s stopped with error > %v\n", p.ID, err)
+				fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
 			}
 		}
 	}
@@ -2,10 +2,12 @@ package proxy
 import (
 	"bytes"
 	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 //go:embed html/favicon.ico
 var faviconData []byte
 //go:embed html/logs.html
 var logsHTML []byte
 // make sure embed is kept there by the IDE auto-package importer
 var _ = embed.FS{}
 type ProxyManager struct {
 	sync.Mutex
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
 	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
 	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
 		c.Data(http.StatusOK, "image/x-icon", faviconData)
 	})
 	// Disable console color for testing
 	gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
 		if modelConfig.Unlisted {
 			continue
 		}
 		data = append(data, map[string]interface{}{
 			"id":       id,
 			"object":   "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()
-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }
 func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
 	requestedModel := c.Param("model_id")
 	if requestedModel == "" {
 		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
 		return
 	}
 	if process, err := pm.swapModel(requestedModel); err != nil {
 		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
 	} else {
 		// rewrite the path
 		c.Request.URL.Path = c.Param("upstreamPath")
 		process.ProxyRequest(c.Writer, c.Request)
 	}
 }
 func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
 	var html strings.Builder
 	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
 	// Extract keys and sort them
 	var modelIDs []string
 	for modelID, modelConfig := range pm.config.Models {
 		if modelConfig.Unlisted {
 			continue
 		}
 		modelIDs = append(modelIDs, modelID)
 	}
 	sort.Strings(modelIDs)
 	// Iterate over sorted keys
 	for _, modelID := range modelIDs {
 		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
 	}
 	html.WriteString("</ul></body></html>")
 	c.Header("Content-Type", "text/html")
 	c.String(http.StatusOK, html.String())
 }
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }
 func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
 	// since maps are unordered, just use the first available process if one exists
 	for _, process := range pm.currentProcesses {
 		process.ProxyRequest(c.Writer, c.Request)
 		return
 	}
 	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
 }
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }
@@ -1,7 +1,6 @@
 package proxy
 import (
 	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )
 //go:embed html/logs.html
 var logsHTML []byte
 // make sure embed is kept there by the IDE auto-package importer
 var _ = embed.FS{}
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
 	accept := c.GetHeader("Accept")
Author	SHA1	Message	Date
Benson Wong	891f6a5b5a	Add /upstream endpoint (#30 ) * remove catch-all route to upstream proxy (it was broken anyways) * add /upstream/:model_id to swap and route to upstream path * add /upstream HTML endpoint and unlisted option * add /upstream endpoint to show a list of available models * add `unlisted` configuration option to omit a model from /v1/models and /upstream lists * add favicon.ico	2024-12-17 14:37:44 -08:00
Benson Wong	7183f6b43d	fix bad logging due to wrong []byte used #28	2024-12-16 16:22:14 -08:00
Benson Wong	d89bfeb441	add .DS_Store to .gitignore	2024-12-16 12:30:31 -08:00
Benson Wong	9a0c6bed40	Improve stop exceptions (#28 ) (#29 ) Stop Process TTL goroutine when process is not ready (#28) - fix issue where the goroutine will continue even though the child process is no longer running and the Process' state is not Ready - fix issue where some logs were going to stdout instead of p.logMonitor causing them to not show up in the /logs - add units to unloading model message	2024-12-16 12:29:25 -08:00
Benson Wong	d6ca535939	tweak release tagging so it is not based on number of commits	2024-12-14 15:46:10 -08:00