Compare commits

...

5 Commits

Author SHA1 Message Date
Benson Wong 891f6a5b5a Add /upstream endpoint (#30)
* remove catch-all route to upstream proxy (it was broken anyways)
* add /upstream/:model_id to swap and route to upstream path
* add /upstream HTML endpoint and unlisted option
* add /upstream endpoint to show a list of available models
* add `unlisted` configuration option to omit a model from /v1/models and /upstream lists
* add favicon.ico
2024-12-17 14:37:44 -08:00
Benson Wong 7183f6b43d fix bad logging due to wrong []byte used #28 2024-12-16 16:22:14 -08:00
Benson Wong d89bfeb441 add .DS_Store to .gitignore 2024-12-16 12:30:31 -08:00
Benson Wong 9a0c6bed40 Improve stop exceptions (#28) (#29)
Stop Process TTL goroutine when process is not ready (#28)

- fix issue where the goroutine will continue even though the child
  process is no longer running and the Process' state is not Ready
- fix issue where some logs were going to stdout instead of p.logMonitor
  causing them to not show up in the /logs
- add units to unloading model message
2024-12-16 12:29:25 -08:00
Benson Wong d6ca535939 tweak release tagging so it is not based on number of commits 2024-12-14 15:46:10 -08:00
12 changed files with 102 additions and 41 deletions
+2 -1
View File
@@ -2,4 +2,5 @@
.env .env
build/ build/
dist/ dist/
.vscode .vscode
.DS_Store
+8 -8
View File
@@ -9,9 +9,6 @@ ifneq ($(shell git status --porcelain),)
GIT_HASH := $(GIT_HASH)+ GIT_HASH := $(GIT_HASH)+
endif endif
# Get the build number from the commit count on the main branch
COMMIT_COUNT := $(shell git rev-list --count HEAD)
# Capture the current build date in RFC3339 format # Capture the current build date in RFC3339 format
BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
@@ -31,12 +28,12 @@ test-all:
# Build OSX binary # Build OSX binary
mac: mac:
@echo "Building Mac binary..." @echo "Building Mac binary..."
GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64 GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
# Build Linux binary # Build Linux binary
linux: linux:
@echo "Building Linux binary..." @echo "Building Linux binary..."
GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64 GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
# for testing proxy.Process # for testing proxy.Process
simple-responder: simple-responder:
@@ -55,9 +52,12 @@ release:
echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \ echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
exit 1; \ exit 1; \
fi fi
@echo "Creating release tag v$(COMMIT_COUNT)..."
git tag v$(COMMIT_COUNT) # Get the highest tag in v{number} format, increment it, and create a new tag
git push origin v$(COMMIT_COUNT) @highest_tag=$$(git tag --sort=-v:refname | grep -E '^v[0-9]+$$' | head -n 1 || echo "v0"); \
new_tag="v$$(( $${highest_tag#v} + 1 ))"; \
echo "tagging new version: $$new_tag"; \
git tag "$$new_tag";
# Phony targets # Phony targets
.PHONY: all clean osx linux .PHONY: all clean osx linux
+9 -2
View File
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
Features: Features:
- ✅ Easy to deploy: single binary with no dependencies - ✅ Easy to deploy: single binary with no dependencies
-Single yaml configuration file -Easy to config: single yaml file
- ✅ On-demand model switching - ✅ On-demand model switching
- ✅ Full control over server settings per model - ✅ Full control over server settings per model
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`) - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
- ✅ Run multiple models at once with `profiles` - ✅ Run multiple models at once with `profiles`
- ✅ Remote log monitoring at `/log` - ✅ Remote log monitoring at `/log`
- ✅ Automatic unloading of models from GPUs after timeout - ✅ Automatic unloading of models from GPUs after timeout
- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc) - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
## Releases ## Releases
@@ -73,6 +74,12 @@ models:
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:8999 proxy: http://127.0.0.1:8999
# unlisted models do not show up in /v1/models or /upstream lists
# but they can still be requested as normal
"qwen-unlisted":
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
unlisted: true
# profiles make it easy to managing multi model (and gpu) configurations. # profiles make it easy to managing multi model (and gpu) configurations.
# #
# Tips: # Tips:
+3
View File
@@ -33,6 +33,7 @@ models:
- env1=hello - env1=hello
cmd: build/simple-responder --port 8999 cmd: build/simple-responder --port 8999
proxy: http://127.0.0.1:8999 proxy: http://127.0.0.1:8999
unlisted: true
# use "none" to skip check. Caution this may cause some requests to fail # use "none" to skip check. Caution this may cause some requests to fail
# until the upstream server is ready for traffic # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
"broken": "broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999 proxy: http://127.0.0.1:8999
unlisted: true
"broken_timeout": "broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000 proxy: http://127.0.0.1:9000
unlisted: true
# creating a coding profile with models for code generation and general questions # creating a coding profile with models for code generation and general questions
profiles: profiles:
+1 -1
View File
@@ -22,7 +22,7 @@ func main() {
flag.Parse() // Parse the command-line flags flag.Parse() // Parse the command-line flags
if *showVersion { if *showVersion {
fmt.Printf("version: v%s (%s), built at %s\n", version, commit, date) fmt.Printf("version: %s (%s), built at %s\n", version, commit, date)
os.Exit(0) os.Exit(0)
} }
Binary file not shown.

After

Width:  |  Height:  |  Size: 51 KiB

+1
View File
@@ -16,6 +16,7 @@ type ModelConfig struct {
Env []string `yaml:"env"` Env []string `yaml:"env"`
CheckEndpoint string `yaml:"checkEndpoint"` CheckEndpoint string `yaml:"checkEndpoint"`
UnloadAfter int `yaml:"ttl"` UnloadAfter int `yaml:"ttl"`
Unlisted bool `yaml:"unlisted"`
} }
func (m *ModelConfig) SanitizedCommand() ([]string, error) { func (m *ModelConfig) SanitizedCommand() ([]string, error) {
Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

+1 -1
View File
@@ -46,7 +46,7 @@ func (w *LogMonitor) Write(p []byte) (n int, err error) {
w.buffer = w.buffer.Next() w.buffer = w.buffer.Next()
w.bufferMu.Unlock() w.bufferMu.Unlock()
w.broadcast(p) w.broadcast(bufferCopy)
return n, nil return n, nil
} }
+12 -8
View File
@@ -125,11 +125,15 @@ func (p *Process) start() error {
maxDuration := time.Duration(p.config.UnloadAfter) * time.Second maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
for range time.Tick(time.Second) { for range time.Tick(time.Second) {
if p.state != StateReady {
return
}
// wait for all inflight requests to complete and ticker // wait for all inflight requests to complete and ticker
p.inFlightRequests.Wait() p.inFlightRequests.Wait()
if time.Since(p.lastRequestHandled) > maxDuration { if time.Since(p.lastRequestHandled) > maxDuration {
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter) fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
p.Stop() p.Stop()
} }
} }
@@ -162,25 +166,25 @@ func (p *Process) Stop() {
// will be a source of pain in the future. // will be a source of pain in the future.
p.cmd.Process.Signal(syscall.SIGTERM) p.cmd.Process.Signal(syscall.SIGTERM)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel() defer cancel()
done := make(chan error, 1) sigtermNormal := make(chan error, 1)
go func() { go func() {
done <- p.cmd.Wait() sigtermNormal <- p.cmd.Wait()
}() }()
select { select {
case <-ctx.Done(): case <-sigtermTimeout.Done():
fmt.Printf("!!! process for %s timed out waiting to stop\n", p.ID) fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
p.cmd.Process.Kill() p.cmd.Process.Kill()
p.cmd.Wait() p.cmd.Wait()
case err := <-done: case err := <-sigtermNormal:
if err != nil { if err != nil {
if err.Error() != "wait: no child processes" { if err.Error() != "wait: no child processes" {
// possible that simple-responder for testing is just not // possible that simple-responder for testing is just not
// existing right, so suppress those errors. // existing right, so suppress those errors.
fmt.Printf("!!! process for %s stopped with error > %v\n", p.ID, err) fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
} }
} }
} }
+65 -13
View File
@@ -2,10 +2,12 @@ package proxy
import ( import (
"bytes" "bytes"
"embed"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
"sort"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -18,6 +20,15 @@ const (
PROFILE_SPLIT_CHAR = ":" PROFILE_SPLIT_CHAR = ":"
) )
//go:embed html/favicon.ico
var faviconData []byte
//go:embed html/logs.html
var logsHTML []byte
// make sure embed is kept there by the IDE auto-package importer
var _ = embed.FS{}
type ProxyManager struct { type ProxyManager struct {
sync.Mutex sync.Mutex
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler) pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE) pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
pm.ginEngine.NoRoute(pm.proxyNoRouteHandler) pm.ginEngine.GET("/upstream", pm.upstreamIndex)
pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
c.Data(http.StatusOK, "image/x-icon", faviconData)
})
// Disable console color for testing // Disable console color for testing
gin.DisableConsoleColor() gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
func (pm *ProxyManager) listModelsHandler(c *gin.Context) { func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
data := []interface{}{} data := []interface{}{}
for id := range pm.config.Models { for id, modelConfig := range pm.config.Models {
if modelConfig.Unlisted {
continue
}
data = append(data, map[string]interface{}{ data = append(data, map[string]interface{}{
"id": id, "id": id,
"object": "model", "object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
pm.Lock() pm.Lock()
defer pm.Unlock() defer pm.Unlock()
// Check if requestedModel contains a / // Check if requestedModel contains a PROFILE_SPLIT_CHAR
profileName, modelName := "", requestedModel profileName, modelName := "", requestedModel
if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 { if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
profileName = requestedModel[:idx] profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
return pm.currentProcesses[requestedProcessKey], nil return pm.currentProcesses[requestedProcessKey], nil
} }
func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
requestedModel := c.Param("model_id")
if requestedModel == "" {
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
return
}
if process, err := pm.swapModel(requestedModel); err != nil {
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
} else {
// rewrite the path
c.Request.URL.Path = c.Param("upstreamPath")
process.ProxyRequest(c.Writer, c.Request)
}
}
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
var html strings.Builder
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
// Extract keys and sort them
var modelIDs []string
for modelID, modelConfig := range pm.config.Models {
if modelConfig.Unlisted {
continue
}
modelIDs = append(modelIDs, modelID)
}
sort.Strings(modelIDs)
// Iterate over sorted keys
for _, modelID := range modelIDs {
html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
}
html.WriteString("</ul></body></html>")
c.Header("Content-Type", "text/html")
c.String(http.StatusOK, html.String())
}
func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) { func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
bodyBytes, err := io.ReadAll(c.Request.Body) bodyBytes, err := io.ReadAll(c.Request.Body)
if err != nil { if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
} }
} }
func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
// since maps are unordered, just use the first available process if one exists
for _, process := range pm.currentProcesses {
process.ProxyRequest(c.Writer, c.Request)
return
}
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
}
func ProcessKeyName(groupName, modelName string) string { func ProcessKeyName(groupName, modelName string) string {
return groupName + PROFILE_SPLIT_CHAR + modelName return groupName + PROFILE_SPLIT_CHAR + modelName
} }
-7
View File
@@ -1,7 +1,6 @@
package proxy package proxy
import ( import (
"embed"
"fmt" "fmt"
"net/http" "net/http"
"strings" "strings"
@@ -9,12 +8,6 @@ import (
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
) )
//go:embed html/logs.html
var logsHTML []byte
// make sure embed is kept there by the IDE auto-package importer
var _ = embed.FS{}
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) { func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
accept := c.GetHeader("Accept") accept := c.GetHeader("Accept")