Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 84b667ca7a | |||
| 29657106fc | |||
| 9c8860471e | |||
| 9b4e3f307e | |||
| 6fe37c3abf | |||
| 7f45493a37 |
@@ -11,13 +11,13 @@ Features:
|
|||||||
- ✅ Easy to config: single yaml file
|
- ✅ Easy to config: single yaml file
|
||||||
- ✅ On-demand model switching
|
- ✅ On-demand model switching
|
||||||
- ✅ Full control over server settings per model
|
- ✅ Full control over server settings per model
|
||||||
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
- ✅ OpenAI API support (`v1/completions`, `v1/chat/completions`, `v1/embeddings` and `v1/rerank`)
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
|
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
|
||||||
- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
|
- ✅ Direct access to upstream HTTP server via `/upstream/:model_id` ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
|
||||||
|
|
||||||
## Releases
|
## Releases
|
||||||
|
|
||||||
@@ -39,6 +39,9 @@ llama-swap's configuration is purposefully simple.
|
|||||||
# Default (and minimum) is 15 seconds
|
# Default (and minimum) is 15 seconds
|
||||||
healthCheckTimeout: 60
|
healthCheckTimeout: 60
|
||||||
|
|
||||||
|
# Write HTTP logs (useful for troubleshooting), defaults to false
|
||||||
|
logRequests: true
|
||||||
|
|
||||||
# define valid model values and the upstream server start
|
# define valid model values and the upstream server start
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
@@ -92,7 +95,11 @@ profiles:
|
|||||||
- "llama"
|
- "llama"
|
||||||
```
|
```
|
||||||
|
|
||||||
More [examples](examples/README.md) are available for different use cases.
|
**Advanced examples**
|
||||||
|
|
||||||
|
- [config.example.yaml](config.example.yaml) includes example for supporting `v1/embeddings` and `v1/rerank` endpoints
|
||||||
|
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds by 20% to 40%. This example includes configurations for Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
|
||||||
|
- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,9 @@
|
|||||||
# Default (and minimum): 15 seconds
|
# Default (and minimum): 15 seconds
|
||||||
healthCheckTimeout: 15
|
healthCheckTimeout: 15
|
||||||
|
|
||||||
|
# Log HTTP requests (helpful for troubleshooting), defaults to false
|
||||||
|
logRequests: true
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
cmd: >
|
cmd: >
|
||||||
@@ -26,6 +29,31 @@ models:
|
|||||||
aliases:
|
aliases:
|
||||||
- gpt-3.5-turbo
|
- gpt-3.5-turbo
|
||||||
|
|
||||||
|
# Embedding example with Nomic
|
||||||
|
# https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
|
||||||
|
"nomic":
|
||||||
|
proxy: http://127.0.0.1:9005
|
||||||
|
cmd: >
|
||||||
|
models/llama-server-osx --port 9005
|
||||||
|
-m models/nomic-embed-text-v1.5.Q8_0.gguf
|
||||||
|
--ctx-size 8192
|
||||||
|
--batch-size 8192
|
||||||
|
--rope-scaling yarn
|
||||||
|
--rope-freq-scale 0.75
|
||||||
|
-ngl 99
|
||||||
|
--embeddings
|
||||||
|
|
||||||
|
# Reranking example with bge-reranker
|
||||||
|
# https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
|
||||||
|
"bge-reranker":
|
||||||
|
proxy: http://127.0.0.1:9006
|
||||||
|
cmd: >
|
||||||
|
models/llama-server-osx --port 9006
|
||||||
|
-m models/bge-reranker-v2-m3-Q4_K_M.gguf
|
||||||
|
--ctx-size 8192
|
||||||
|
--reranking
|
||||||
|
|
||||||
|
|
||||||
"simple":
|
"simple":
|
||||||
# example of setting environment variables
|
# example of setting environment variables
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
The reranker-test.json data is from https://github.com/ggerganov/llama.cpp/pull/9510
|
||||||
|
|
||||||
|
To run it:
|
||||||
|
> curl http://127.0.0.1:8080/v1/rerank -H "Content-Type: application/json" -d @reranker-test.json -v | jq .
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"model": "bge-reranker",
|
||||||
|
"query": "Organic skincare products for sensitive skin",
|
||||||
|
"top_n": 3,
|
||||||
|
"documents": [
|
||||||
|
"Organic skincare for sensitive skin with aloe vera and chamomile: Imagine the soothing embrace of nature with our organic skincare range, crafted specifically for sensitive skin. Infused with the calming properties of aloe vera and chamomile, each product provides gentle nourishment and protection. Say goodbye to irritation and hello to a glowing, healthy complexion.",
|
||||||
|
"New makeup trends focus on bold colors and innovative techniques: Step into the world of cutting-edge beauty with this seasons makeup trends. Bold, vibrant colors and groundbreaking techniques are redefining the art of makeup. From neon eyeliners to holographic highlighters, unleash your creativity and make a statement with every look.",
|
||||||
|
"Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille: Erleben Sie die wohltuende Wirkung unserer Bio-Hautpflege, speziell für empfindliche Haut entwickelt. Mit den beruhigenden Eigenschaften von Aloe Vera und Kamille pflegen und schützen unsere Produkte Ihre Haut auf natürliche Weise. Verabschieden Sie sich von Hautirritationen und genießen Sie einen strahlenden Teint.",
|
||||||
|
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken: Tauchen Sie ein in die Welt der modernen Schönheit mit den neuesten Make-up-Trends. Kräftige, lebendige Farben und innovative Techniken setzen neue Maßstäbe. Von auffälligen Eyelinern bis hin zu holografischen Highlightern – lassen Sie Ihrer Kreativität freien Lauf und setzen Sie jedes Mal ein Statement.",
|
||||||
|
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla: Descubre el poder de la naturaleza con nuestra línea de cuidado de la piel orgánico, diseñada especialmente para pieles sensibles. Enriquecidos con aloe vera y manzanilla, estos productos ofrecen una hidratación y protección suave. Despídete de las irritaciones y saluda a una piel radiante y saludable.",
|
||||||
|
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras: Entra en el fascinante mundo del maquillaje con las tendencias más actuales. Colores vivos y técnicas innovadoras están revolucionando el arte del maquillaje. Desde delineadores neón hasta iluminadores holográficos, desata tu creatividad y destaca en cada look.",
|
||||||
|
"针对敏感肌专门设计的天然有机护肤产品:体验由芦荟和洋甘菊提取物带来的自然呵护。我们的护肤产品特别为敏感肌设计,温和滋润,保护您的肌肤不受刺激。让您的肌肤告别不适,迎来健康光彩。",
|
||||||
|
"新的化妆趋势注重鲜艳的颜色和创新的技巧:进入化妆艺术的新纪元,本季的化妆趋势以大胆的颜色和创新的技巧为主。无论是霓虹眼线还是全息高光,每一款妆容都能让您脱颖而出,展现独特魅力。",
|
||||||
|
"敏感肌のために特別に設計された天然有機スキンケア製品: アロエベラとカモミールのやさしい力で、自然の抱擁を感じてください。敏感肌用に特別に設計された私たちのスキンケア製品は、肌に優しく栄養を与え、保護します。肌トラブルにさようなら、輝く健康な肌にこんにちは。",
|
||||||
|
"新しいメイクのトレンドは鮮やかな色と革新的な技術に焦点を当てています: 今シーズンのメイクアップトレンドは、大胆な色彩と革新的な技術に注目しています。ネオンアイライナーからホログラフィックハイライターまで、クリエイティビティを解き放ち、毎回ユニークなルックを演出しましょう。"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -25,6 +25,7 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
|
|||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
|
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
|
||||||
|
LogRequests bool `yaml:"logRequests"`
|
||||||
Models map[string]ModelConfig `yaml:"models"`
|
Models map[string]ModelConfig `yaml:"models"`
|
||||||
Profiles map[string][]string `yaml:"profiles"`
|
Profiles map[string][]string `yaml:"profiles"`
|
||||||
|
|
||||||
|
|||||||
+57
-11
@@ -46,11 +46,46 @@ func New(config *Config) *ProxyManager {
|
|||||||
ginEngine: gin.New(),
|
ginEngine: gin.New(),
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set up routes using the Gin engine
|
if config.LogRequests {
|
||||||
pm.ginEngine.POST("/v1/chat/completions", pm.proxyChatRequestHandler)
|
pm.ginEngine.Use(func(c *gin.Context) {
|
||||||
|
// Start timer
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
// Process request
|
||||||
|
c.Next()
|
||||||
|
|
||||||
|
// Stop timer
|
||||||
|
duration := time.Since(start)
|
||||||
|
|
||||||
|
// Log request details
|
||||||
|
clientIP := c.ClientIP()
|
||||||
|
method := c.Request.Method
|
||||||
|
path := c.Request.URL.Path
|
||||||
|
statusCode := c.Writer.Status()
|
||||||
|
bodySize := c.Writer.Size()
|
||||||
|
|
||||||
|
fmt.Fprintf(pm.logMonitor, "[llama-swap] %s [%s] \"%s %s %s\" %d %d \"%s\" %v\n",
|
||||||
|
clientIP,
|
||||||
|
time.Now().Format("2006-01-02 15:04:05"),
|
||||||
|
method,
|
||||||
|
path,
|
||||||
|
c.Request.Proto,
|
||||||
|
statusCode,
|
||||||
|
bodySize,
|
||||||
|
c.Request.UserAgent(),
|
||||||
|
duration,
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set up routes using the Gin engine
|
||||||
|
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
pm.ginEngine.POST("/v1/completions", pm.proxyChatRequestHandler)
|
pm.ginEngine.POST("/v1/completions", pm.proxyOAIHandler)
|
||||||
|
|
||||||
|
// Support embeddings
|
||||||
|
pm.ginEngine.POST("/v1/embeddings", pm.proxyOAIHandler)
|
||||||
|
pm.ginEngine.POST("/v1/rerank", pm.proxyOAIHandler)
|
||||||
|
|
||||||
pm.ginEngine.GET("/v1/models", pm.listModelsHandler)
|
pm.ginEngine.GET("/v1/models", pm.listModelsHandler)
|
||||||
|
|
||||||
@@ -124,7 +159,7 @@ func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
|
|||||||
|
|
||||||
// Encode the data as JSON and write it to the response writer
|
// Encode the data as JSON and write it to the response writer
|
||||||
if err := json.NewEncoder(c.Writer).Encode(map[string]interface{}{"data": data}); err != nil {
|
if err := json.NewEncoder(c.Writer).Encode(map[string]interface{}{"data": data}); err != nil {
|
||||||
c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("error encoding JSON"))
|
pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error encoding JSON %s", err.Error()))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -194,12 +229,12 @@ func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
|
|||||||
requestedModel := c.Param("model_id")
|
requestedModel := c.Param("model_id")
|
||||||
|
|
||||||
if requestedModel == "" {
|
if requestedModel == "" {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "model id required in path")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if process, err := pm.swapModel(requestedModel); err != nil {
|
if process, err := pm.swapModel(requestedModel); err != nil {
|
||||||
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
|
pm.sendErrorResponse(c, http.StatusNotFound, fmt.Sprintf("unable to swap to model, %s", err.Error()))
|
||||||
} else {
|
} else {
|
||||||
// rewrite the path
|
// rewrite the path
|
||||||
c.Request.URL.Path = c.Param("upstreamPath")
|
c.Request.URL.Path = c.Param("upstreamPath")
|
||||||
@@ -232,25 +267,26 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
|||||||
c.String(http.StatusOK, html.String())
|
c.String(http.StatusOK, html.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
|
func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
|
||||||
bodyBytes, err := io.ReadAll(c.Request.Body)
|
bodyBytes, err := io.ReadAll(c.Request.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("invalid JSON"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
var requestBody map[string]interface{}
|
var requestBody map[string]interface{}
|
||||||
if err := json.Unmarshal(bodyBytes, &requestBody); err != nil {
|
if err := json.Unmarshal(bodyBytes, &requestBody); err != nil {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("invalid JSON"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("invalid JSON: %s", err.Error()))
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
model, ok := requestBody["model"].(string)
|
model, ok := requestBody["model"].(string)
|
||||||
if !ok {
|
if !ok {
|
||||||
c.AbortWithError(http.StatusBadRequest, fmt.Errorf("missing or invalid 'model' key"))
|
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if process, err := pm.swapModel(model); err != nil {
|
if process, err := pm.swapModel(model); err != nil {
|
||||||
c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
|
pm.sendErrorResponse(c, http.StatusNotFound, fmt.Sprintf("unable to swap to model, %s", err.Error()))
|
||||||
return
|
return
|
||||||
} else {
|
} else {
|
||||||
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
|
||||||
@@ -263,6 +299,16 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (pm *ProxyManager) sendErrorResponse(c *gin.Context, statusCode int, message string) {
|
||||||
|
acceptHeader := c.GetHeader("Accept")
|
||||||
|
|
||||||
|
if strings.Contains(acceptHeader, "application/json") {
|
||||||
|
c.JSON(statusCode, gin.H{"error": message})
|
||||||
|
} else {
|
||||||
|
c.String(statusCode, message)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func ProcessKeyName(groupName, modelName string) string {
|
func ProcessKeyName(groupName, modelName string) string {
|
||||||
return groupName + PROFILE_SPLIT_CHAR + modelName
|
return groupName + PROFILE_SPLIT_CHAR + modelName
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user