Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 314d2f2212 | |||
| fad25f3e11 | |||
| 2c3e3e27f7 | |||
| baeb0c4e7f |
@@ -30,6 +30,7 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
|
|||||||
- `v1/rerank`
|
- `v1/rerank`
|
||||||
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
|
- ✅ Docker and Podman support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
@@ -89,6 +90,15 @@ models:
|
|||||||
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
unlisted: true
|
unlisted: true
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"docker-llama":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
# profiles make it easy to manage multi-model (and GPU) configurations.
|
# profiles make it easy to manage multi-model (and GPU) configurations.
|
||||||
#
|
#
|
||||||
# Tips:
|
# Tips:
|
||||||
|
|||||||
@@ -53,6 +53,14 @@ models:
|
|||||||
--ctx-size 8192
|
--ctx-size 8192
|
||||||
--reranking
|
--reranking
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"dockertest":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
"simple":
|
"simple":
|
||||||
# example of setting environment variables
|
# example of setting environment variables
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import (
|
|||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/mostlygeek/llama-swap/proxy"
|
"github.com/mostlygeek/llama-swap/proxy"
|
||||||
@@ -39,6 +41,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
proxyManager := proxy.New(config)
|
proxyManager := proxy.New(config)
|
||||||
|
|
||||||
|
sigChan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
go func() {
|
||||||
|
<-sigChan
|
||||||
|
fmt.Println("Shutting down llama-swap")
|
||||||
|
proxyManager.StopProcesses()
|
||||||
|
os.Exit(0)
|
||||||
|
}()
|
||||||
|
|
||||||
fmt.Println("llama-swap listening on " + *listenStr)
|
fmt.Println("llama-swap listening on " + *listenStr)
|
||||||
if err := proxyManager.Run(*listenStr); err != nil {
|
if err := proxyManager.Run(*listenStr); err != nil {
|
||||||
fmt.Printf("Server error: %v\n", err)
|
fmt.Printf("Server error: %v\n", err)
|
||||||
|
|||||||
+4
-5
@@ -153,19 +153,17 @@ func (p *Process) Stop() {
|
|||||||
defer p.stateMutex.Unlock()
|
defer p.stateMutex.Unlock()
|
||||||
|
|
||||||
if p.state != StateReady {
|
if p.state != StateReady {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.cmd == nil || p.cmd.Process == nil {
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
// this situation should never happen... but if it does just update the state
|
// this situation should never happen... but if it does just update the state
|
||||||
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
|
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pretty sure this stopping code needs some work for windows and
|
|
||||||
// will be a source of pain in the future.
|
|
||||||
|
|
||||||
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
@@ -190,6 +188,7 @@ func (p *Process) Stop() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -296,7 +295,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
proxyTo := p.config.Proxy
|
proxyTo := p.config.Proxy
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
|
req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// see: https://github.com/mostlygeek/llama-swap/issues/42
|
||||||
|
// respond with permissive OPTIONS for any endpoint
|
||||||
|
pm.ginEngine.Use(func(c *gin.Context) {
|
||||||
|
if c.Request.Method == "OPTIONS" {
|
||||||
|
c.Header("Access-Control-Allow-Origin", "*")
|
||||||
|
c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
|
||||||
|
c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
|
||||||
|
c.AbortWithStatus(204)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Next()
|
||||||
|
})
|
||||||
|
|
||||||
// Set up routes using the Gin engine
|
// Set up routes using the Gin engine
|
||||||
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
|
|||||||
Reference in New Issue
Block a user