Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 891f6a5b5a | |||
| 7183f6b43d | |||
| d89bfeb441 | |||
| 9a0c6bed40 | |||
| d6ca535939 |
+2
-1
@@ -2,4 +2,5 @@
|
|||||||
.env
|
.env
|
||||||
build/
|
build/
|
||||||
dist/
|
dist/
|
||||||
.vscode
|
.vscode
|
||||||
|
.DS_Store
|
||||||
|
|||||||
@@ -9,9 +9,6 @@ ifneq ($(shell git status --porcelain),)
|
|||||||
GIT_HASH := $(GIT_HASH)+
|
GIT_HASH := $(GIT_HASH)+
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Get the build number from the commit count on the main branch
|
|
||||||
COMMIT_COUNT := $(shell git rev-list --count HEAD)
|
|
||||||
|
|
||||||
# Capture the current build date in RFC3339 format
|
# Capture the current build date in RFC3339 format
|
||||||
BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
|
BUILD_DATE := $(shell date -u +"%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
@@ -31,12 +28,12 @@ test-all:
|
|||||||
# Build OSX binary
|
# Build OSX binary
|
||||||
mac:
|
mac:
|
||||||
@echo "Building Mac binary..."
|
@echo "Building Mac binary..."
|
||||||
GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
|
GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
|
||||||
|
|
||||||
# Build Linux binary
|
# Build Linux binary
|
||||||
linux:
|
linux:
|
||||||
@echo "Building Linux binary..."
|
@echo "Building Linux binary..."
|
||||||
GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=${COMMIT_COUNT} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
|
GOOS=linux GOARCH=amd64 go build -ldflags="-X main.commit=${GIT_HASH} -X main.version=local_${GIT_HASH} -X main.date=${BUILD_DATE}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
|
||||||
|
|
||||||
# for testing proxy.Process
|
# for testing proxy.Process
|
||||||
simple-responder:
|
simple-responder:
|
||||||
@@ -55,9 +52,12 @@ release:
|
|||||||
echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
|
echo "Error: There are unstaged changes. Please commit or stash your changes before creating a release tag." >&2; \
|
||||||
exit 1; \
|
exit 1; \
|
||||||
fi
|
fi
|
||||||
@echo "Creating release tag v$(COMMIT_COUNT)..."
|
|
||||||
git tag v$(COMMIT_COUNT)
|
# Get the highest tag in v{number} format, increment it, and create a new tag
|
||||||
git push origin v$(COMMIT_COUNT)
|
@highest_tag=$$(git tag --sort=-v:refname | grep -E '^v[0-9]+$$' | head -n 1 || echo "v0"); \
|
||||||
|
new_tag="v$$(( $${highest_tag#v} + 1 ))"; \
|
||||||
|
echo "tagging new version: $$new_tag"; \
|
||||||
|
git tag "$$new_tag";
|
||||||
|
|
||||||
# Phony targets
|
# Phony targets
|
||||||
.PHONY: all clean osx linux
|
.PHONY: all clean osx linux
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
|
|||||||
Features:
|
Features:
|
||||||
|
|
||||||
- ✅ Easy to deploy: single binary with no dependencies
|
- ✅ Easy to deploy: single binary with no dependencies
|
||||||
- ✅ Single yaml configuration file
|
- ✅ Easy to config: single yaml file
|
||||||
- ✅ On-demand model switching
|
- ✅ On-demand model switching
|
||||||
- ✅ Full control over server settings per model
|
- ✅ Full control over server settings per model
|
||||||
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
||||||
@@ -16,7 +16,8 @@ Features:
|
|||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
|
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
|
||||||
|
- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
|
||||||
|
|
||||||
## Releases
|
## Releases
|
||||||
|
|
||||||
@@ -73,6 +74,12 @@ models:
|
|||||||
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
|
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
|
||||||
proxy: http://127.0.0.1:8999
|
proxy: http://127.0.0.1:8999
|
||||||
|
|
||||||
|
# unlisted models do not show up in /v1/models or /upstream lists
|
||||||
|
# but they can still be requested as normal
|
||||||
|
"qwen-unlisted":
|
||||||
|
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
|
unlisted: true
|
||||||
|
|
||||||
# profiles make it easy to managing multi model (and gpu) configurations.
|
# profiles make it easy to managing multi model (and gpu) configurations.
|
||||||
#
|
#
|
||||||
# Tips:
|
# Tips:
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ models:
|
|||||||
- env1=hello
|
- env1=hello
|
||||||
cmd: build/simple-responder --port 8999
|
cmd: build/simple-responder --port 8999
|
||||||
proxy: http://127.0.0.1:8999
|
proxy: http://127.0.0.1:8999
|
||||||
|
unlisted: true
|
||||||
|
|
||||||
# use "none" to skip check. Caution this may cause some requests to fail
|
# use "none" to skip check. Caution this may cause some requests to fail
|
||||||
# until the upstream server is ready for traffic
|
# until the upstream server is ready for traffic
|
||||||
@@ -42,9 +43,11 @@ models:
|
|||||||
"broken":
|
"broken":
|
||||||
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
|
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
|
||||||
proxy: http://127.0.0.1:8999
|
proxy: http://127.0.0.1:8999
|
||||||
|
unlisted: true
|
||||||
"broken_timeout":
|
"broken_timeout":
|
||||||
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
|
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
|
||||||
proxy: http://127.0.0.1:9000
|
proxy: http://127.0.0.1:9000
|
||||||
|
unlisted: true
|
||||||
|
|
||||||
# creating a coding profile with models for code generation and general questions
|
# creating a coding profile with models for code generation and general questions
|
||||||
profiles:
|
profiles:
|
||||||
|
|||||||
+1
-1
@@ -22,7 +22,7 @@ func main() {
|
|||||||
flag.Parse() // Parse the command-line flags
|
flag.Parse() // Parse the command-line flags
|
||||||
|
|
||||||
if *showVersion {
|
if *showVersion {
|
||||||
fmt.Printf("version: v%s (%s), built at %s\n", version, commit, date)
|
fmt.Printf("version: %s (%s), built at %s\n", version, commit, date)
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 51 KiB |
@@ -16,6 +16,7 @@ type ModelConfig struct {
|
|||||||
Env []string `yaml:"env"`
|
Env []string `yaml:"env"`
|
||||||
CheckEndpoint string `yaml:"checkEndpoint"`
|
CheckEndpoint string `yaml:"checkEndpoint"`
|
||||||
UnloadAfter int `yaml:"ttl"`
|
UnloadAfter int `yaml:"ttl"`
|
||||||
|
Unlisted bool `yaml:"unlisted"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *ModelConfig) SanitizedCommand() ([]string, error) {
|
func (m *ModelConfig) SanitizedCommand() ([]string, error) {
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
+1
-1
@@ -46,7 +46,7 @@ func (w *LogMonitor) Write(p []byte) (n int, err error) {
|
|||||||
w.buffer = w.buffer.Next()
|
w.buffer = w.buffer.Next()
|
||||||
w.bufferMu.Unlock()
|
w.bufferMu.Unlock()
|
||||||
|
|
||||||
w.broadcast(p)
|
w.broadcast(bufferCopy)
|
||||||
return n, nil
|
return n, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+12
-8
@@ -125,11 +125,15 @@ func (p *Process) start() error {
|
|||||||
maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
|
maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
|
||||||
|
|
||||||
for range time.Tick(time.Second) {
|
for range time.Tick(time.Second) {
|
||||||
|
if p.state != StateReady {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// wait for all inflight requests to complete and ticker
|
// wait for all inflight requests to complete and ticker
|
||||||
p.inFlightRequests.Wait()
|
p.inFlightRequests.Wait()
|
||||||
|
|
||||||
if time.Since(p.lastRequestHandled) > maxDuration {
|
if time.Since(p.lastRequestHandled) > maxDuration {
|
||||||
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter)
|
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
|
||||||
p.Stop()
|
p.Stop()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -162,25 +166,25 @@ func (p *Process) Stop() {
|
|||||||
// will be a source of pain in the future.
|
// will be a source of pain in the future.
|
||||||
|
|
||||||
p.cmd.Process.Signal(syscall.SIGTERM)
|
p.cmd.Process.Signal(syscall.SIGTERM)
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
done := make(chan error, 1)
|
sigtermNormal := make(chan error, 1)
|
||||||
go func() {
|
go func() {
|
||||||
done <- p.cmd.Wait()
|
sigtermNormal <- p.cmd.Wait()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-sigtermTimeout.Done():
|
||||||
fmt.Printf("!!! process for %s timed out waiting to stop\n", p.ID)
|
fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
|
||||||
p.cmd.Process.Kill()
|
p.cmd.Process.Kill()
|
||||||
p.cmd.Wait()
|
p.cmd.Wait()
|
||||||
case err := <-done:
|
case err := <-sigtermNormal:
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err.Error() != "wait: no child processes" {
|
if err.Error() != "wait: no child processes" {
|
||||||
// possible that simple-responder for testing is just not
|
// possible that simple-responder for testing is just not
|
||||||
// existing right, so suppress those errors.
|
// existing right, so suppress those errors.
|
||||||
fmt.Printf("!!! process for %s stopped with error > %v\n", p.ID, err)
|
fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
+65
-13
@@ -2,10 +2,12 @@ package proxy
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"embed"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -18,6 +20,15 @@ const (
|
|||||||
PROFILE_SPLIT_CHAR = ":"
|
PROFILE_SPLIT_CHAR = ":"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
//go:embed html/favicon.ico
|
||||||
|
var faviconData []byte
|
||||||
|
|
||||||
|
//go:embed html/logs.html
|
||||||
|
var logsHTML []byte
|
||||||
|
|
||||||
|
// make sure embed is kept there by the IDE auto-package importer
|
||||||
|
var _ = embed.FS{}
|
||||||
|
|
||||||
type ProxyManager struct {
|
type ProxyManager struct {
|
||||||
sync.Mutex
|
sync.Mutex
|
||||||
|
|
||||||
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
|
|||||||
pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
|
pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
|
||||||
pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
|
pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
|
||||||
|
|
||||||
pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
|
pm.ginEngine.GET("/upstream", pm.upstreamIndex)
|
||||||
|
pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
|
||||||
|
|
||||||
|
pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
|
||||||
|
c.Data(http.StatusOK, "image/x-icon", faviconData)
|
||||||
|
})
|
||||||
|
|
||||||
// Disable console color for testing
|
// Disable console color for testing
|
||||||
gin.DisableConsoleColor()
|
gin.DisableConsoleColor()
|
||||||
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
|
|||||||
|
|
||||||
func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
|
func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
|
||||||
data := []interface{}{}
|
data := []interface{}{}
|
||||||
for id := range pm.config.Models {
|
for id, modelConfig := range pm.config.Models {
|
||||||
|
if modelConfig.Unlisted {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
data = append(data, map[string]interface{}{
|
data = append(data, map[string]interface{}{
|
||||||
"id": id,
|
"id": id,
|
||||||
"object": "model",
|
"object": "model",
|
||||||
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
|
|||||||
pm.Lock()
|
pm.Lock()
|
||||||
defer pm.Unlock()
|
defer pm.Unlock()
|
||||||
|
|
||||||
// Check if requestedModel contains a /
|
// Check if requestedModel contains a PROFILE_SPLIT_CHAR
|
||||||
profileName, modelName := "", requestedModel
|
profileName, modelName := "", requestedModel
|
||||||
if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
|
if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
|
||||||
profileName = requestedModel[:idx]
|
profileName = requestedModel[:idx]
|
||||||
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
|
|||||||
return pm.currentProcesses[requestedProcessKey], nil
|
return pm.currentProcesses[requestedProcessKey], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
||||||
|
requestedModel := c.Param("model_id")
|
||||||
|
|
||||||
|
if requestedModel == "" {
|
||||||
|
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if process, err := pm.swapModel(requestedModel); err != nil {
|
||||||
|
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
|
||||||
|
} else {
|
||||||
|
// rewrite the path
|
||||||
|
c.Request.URL.Path = c.Param("upstreamPath")
|
||||||
|
process.ProxyRequest(c.Writer, c.Request)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
||||||
|
var html strings.Builder
|
||||||
|
|
||||||
|
html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
|
||||||
|
|
||||||
|
// Extract keys and sort them
|
||||||
|
var modelIDs []string
|
||||||
|
for modelID, modelConfig := range pm.config.Models {
|
||||||
|
if modelConfig.Unlisted {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
modelIDs = append(modelIDs, modelID)
|
||||||
|
}
|
||||||
|
sort.Strings(modelIDs)
|
||||||
|
|
||||||
|
// Iterate over sorted keys
|
||||||
|
for _, modelID := range modelIDs {
|
||||||
|
html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
|
||||||
|
}
|
||||||
|
html.WriteString("</ul></body></html>")
|
||||||
|
c.Header("Content-Type", "text/html")
|
||||||
|
c.String(http.StatusOK, html.String())
|
||||||
|
}
|
||||||
|
|
||||||
func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
|
func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
|
||||||
bodyBytes, err := io.ReadAll(c.Request.Body)
|
bodyBytes, err := io.ReadAll(c.Request.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
|
|
||||||
// since maps are unordered, just use the first available process if one exists
|
|
||||||
for _, process := range pm.currentProcesses {
|
|
||||||
process.ProxyRequest(c.Writer, c.Request)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
|
|
||||||
}
|
|
||||||
|
|
||||||
func ProcessKeyName(groupName, modelName string) string {
|
func ProcessKeyName(groupName, modelName string) string {
|
||||||
return groupName + PROFILE_SPLIT_CHAR + modelName
|
return groupName + PROFILE_SPLIT_CHAR + modelName
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
package proxy
|
package proxy
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"embed"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -9,12 +8,6 @@ import (
|
|||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
)
|
)
|
||||||
|
|
||||||
//go:embed html/logs.html
|
|
||||||
var logsHTML []byte
|
|
||||||
|
|
||||||
// make sure embed is kept there by the IDE auto-package importer
|
|
||||||
var _ = embed.FS{}
|
|
||||||
|
|
||||||
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
|
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
|
||||||
|
|
||||||
accept := c.GetHeader("Accept")
|
accept := c.GetHeader("Accept")
|
||||||
|
|||||||
Reference in New Issue
Block a user