Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 314d2f2212 | |||
| fad25f3e11 | |||
| 2c3e3e27f7 | |||
| baeb0c4e7f | |||
| 2833517eef |
@@ -30,6 +30,7 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
|
|||||||
- `v1/rerank`
|
- `v1/rerank`
|
||||||
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
|
- ✅ Docker and Podman support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
@@ -89,6 +90,15 @@ models:
|
|||||||
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
unlisted: true
|
unlisted: true
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"docker-llama":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
# profiles make it easy to manage multi-model (and GPU) configurations.
|
# profiles make it easy to manage multi-model (and GPU) configurations.
|
||||||
#
|
#
|
||||||
# Tips:
|
# Tips:
|
||||||
|
|||||||
@@ -53,6 +53,14 @@ models:
|
|||||||
--ctx-size 8192
|
--ctx-size 8192
|
||||||
--reranking
|
--reranking
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"dockertest":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
"simple":
|
"simple":
|
||||||
# example of setting environment variables
|
# example of setting environment variables
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import (
|
|||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/mostlygeek/llama-swap/proxy"
|
"github.com/mostlygeek/llama-swap/proxy"
|
||||||
@@ -39,6 +41,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
proxyManager := proxy.New(config)
|
proxyManager := proxy.New(config)
|
||||||
|
|
||||||
|
sigChan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
go func() {
|
||||||
|
<-sigChan
|
||||||
|
fmt.Println("Shutting down llama-swap")
|
||||||
|
proxyManager.StopProcesses()
|
||||||
|
os.Exit(0)
|
||||||
|
}()
|
||||||
|
|
||||||
fmt.Println("llama-swap listening on " + *listenStr)
|
fmt.Println("llama-swap listening on " + *listenStr)
|
||||||
if err := proxyManager.Run(*listenStr); err != nil {
|
if err := proxyManager.Run(*listenStr); err != nil {
|
||||||
fmt.Printf("Server error: %v\n", err)
|
fmt.Printf("Server error: %v\n", err)
|
||||||
|
|||||||
+8
-7
@@ -135,6 +135,7 @@ func (p *Process) start() error {
|
|||||||
if time.Since(p.lastRequestHandled) > maxDuration {
|
if time.Since(p.lastRequestHandled) > maxDuration {
|
||||||
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
|
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
|
||||||
p.Stop()
|
p.Stop()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@@ -152,20 +153,17 @@ func (p *Process) Stop() {
|
|||||||
defer p.stateMutex.Unlock()
|
defer p.stateMutex.Unlock()
|
||||||
|
|
||||||
if p.state != StateReady {
|
if p.state != StateReady {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.cmd == nil || p.cmd.Process == nil {
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
// this situation should never happen... but if it does just update the state
|
// this situation should never happen... but if it does just update the state
|
||||||
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
|
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pretty sure this stopping code needs some work for windows and
|
|
||||||
// will be a source of pain in the future.
|
|
||||||
|
|
||||||
p.cmd.Process.Signal(syscall.SIGTERM)
|
|
||||||
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
@@ -174,9 +172,11 @@ func (p *Process) Stop() {
|
|||||||
sigtermNormal <- p.cmd.Wait()
|
sigtermNormal <- p.cmd.Wait()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
p.cmd.Process.Signal(syscall.SIGTERM)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-sigtermTimeout.Done():
|
case <-sigtermTimeout.Done():
|
||||||
fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
|
fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
|
||||||
p.cmd.Process.Kill()
|
p.cmd.Process.Kill()
|
||||||
p.cmd.Wait()
|
p.cmd.Wait()
|
||||||
case err := <-sigtermNormal:
|
case err := <-sigtermNormal:
|
||||||
@@ -188,6 +188,7 @@ func (p *Process) Stop() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -294,7 +295,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
proxyTo := p.config.Proxy
|
proxyTo := p.config.Proxy
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
|
req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
|
|||||||
+28
-2
@@ -67,7 +67,6 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
|
|||||||
assert.Contains(t, w.Body.String(), "unable to start process")
|
assert.Contains(t, w.Body.String(), "unable to start process")
|
||||||
}
|
}
|
||||||
|
|
||||||
// test that the process unloads after the TTL
|
|
||||||
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
t.Skip("skipping long auto unload TTL test")
|
t.Skip("skipping long auto unload TTL test")
|
||||||
@@ -79,7 +78,7 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
|
|||||||
config.UnloadAfter = 3 // seconds
|
config.UnloadAfter = 3 // seconds
|
||||||
assert.Equal(t, 3, config.UnloadAfter)
|
assert.Equal(t, 3, config.UnloadAfter)
|
||||||
|
|
||||||
process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
|
process := NewProcess("ttl_test", 2, config, NewLogMonitorWriter(io.Discard))
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
// this should take 4 seconds
|
// this should take 4 seconds
|
||||||
@@ -111,6 +110,33 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
|
|||||||
assert.Equal(t, StateStopped, process.CurrentState())
|
assert.Equal(t, StateStopped, process.CurrentState())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProcess_LowTTLValue(t *testing.T) {
|
||||||
|
if true { // change this code to run this ...
|
||||||
|
t.Skip("skipping test, edit process_test.go to run it ")
|
||||||
|
}
|
||||||
|
|
||||||
|
config := getTestSimpleResponderConfig("fast_ttl")
|
||||||
|
assert.Equal(t, 0, config.UnloadAfter)
|
||||||
|
config.UnloadAfter = 1 // second
|
||||||
|
assert.Equal(t, 1, config.UnloadAfter)
|
||||||
|
|
||||||
|
process := NewProcess("ttl", 2, config, NewLogMonitorWriter(os.Stdout))
|
||||||
|
defer process.Stop()
|
||||||
|
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
t.Logf("Waiting before sending request %d", i)
|
||||||
|
time.Sleep(1500 * time.Millisecond)
|
||||||
|
|
||||||
|
expected := fmt.Sprintf("echo=test_%d", i)
|
||||||
|
req := httptest.NewRequest("GET", fmt.Sprintf("/slow-respond?echo=%s&delay=50ms", expected), nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
process.ProxyRequest(w, req)
|
||||||
|
assert.Equal(t, http.StatusOK, w.Code)
|
||||||
|
assert.Contains(t, w.Body.String(), expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// issue #19
|
// issue #19
|
||||||
func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
|
func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
|
|||||||
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// see: https://github.com/mostlygeek/llama-swap/issues/42
|
||||||
|
// respond with permissive OPTIONS for any endpoint
|
||||||
|
pm.ginEngine.Use(func(c *gin.Context) {
|
||||||
|
if c.Request.Method == "OPTIONS" {
|
||||||
|
c.Header("Access-Control-Allow-Origin", "*")
|
||||||
|
c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
|
||||||
|
c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
|
||||||
|
c.AbortWithStatus(204)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Next()
|
||||||
|
})
|
||||||
|
|
||||||
// Set up routes using the Gin engine
|
// Set up routes using the Gin engine
|
||||||
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
|
|||||||
Reference in New Issue
Block a user