remove cmd_stop configuration and functionality from PR #40 (#44 )

* remove cmd_stop functionality from #40
Use client request context in proxy request (#43 )
2025-01-31 12:42:44 -08:00 · 2025-01-31 10:21:49 -08:00 · 2025-01-31 10:09:07 -08:00 · 2025-01-30 16:59:57 -08:00 · 2025-01-20 14:39:52 -08:00
6 changed files with 80 additions and 10 deletions
@@ -5,7 +5,7 @@
 # Introduction
 llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.

-Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). 
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file).

 Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or build it yourself from source with `make clean all`.

@@ -30,6 +30,7 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
  - `v1/rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
 - ✅ Multiple GPU support
+- ✅ Docker and Podman support
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
@@ -89,6 +90,15 @@ models:
    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
    unlisted: true

+  # Docker Support (v26.1.4+ required!)
+  "docker-llama":
+    proxy: "http://127.0.0.1:9790"
+    cmd: >
+      docker run --name dockertest
+      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggerganov/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
+
 # profiles make it easy to manage multi-model (and GPU) configurations.
 #
 # Tips:
@@ -53,6 +53,14 @@ models:
      --ctx-size 8192
      --reranking

+  # Docker Support (v26.1.4+ required!)
+  "dockertest":
+    proxy: "http://127.0.0.1:9790"
+    cmd: >
+      docker run --name dockertest
+      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggerganov/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

  "simple":
    # example of setting environment variables
@@ -4,6 +4,8 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"os/signal"
+	"syscall"

 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/proxy"
@@ -39,6 +41,16 @@ func main() {
 	}

 	proxyManager := proxy.New(config)
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	go func() {
+		<-sigChan
+		fmt.Println("Shutting down llama-swap")
+		proxyManager.StopProcesses()
+		os.Exit(0)
+	}()
+
 	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := proxyManager.Run(*listenStr); err != nil {
 		fmt.Printf("Server error: %v\n", err)
@@ -135,6 +135,7 @@ func (p *Process) start() error {
 				if time.Since(p.lastRequestHandled) > maxDuration {
 					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
 					p.Stop()
+					return
 				}
 			}
 		}()
@@ -152,20 +153,17 @@ func (p *Process) Stop() {
 	defer p.stateMutex.Unlock()

 	if p.state != StateReady {
+		fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
 		return
 	}

 	if p.cmd == nil || p.cmd.Process == nil {
 		// this situation should never happen... but if it does just update the state
-		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
+		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
 		p.state = StateStopped
 		return
 	}

-	// Pretty sure this stopping code needs some work for windows and
-	// will be a source of pain in the future.
-
-	p.cmd.Process.Signal(syscall.SIGTERM)
 	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()

@@ -174,9 +172,11 @@ func (p *Process) Stop() {
 		sigtermNormal <- p.cmd.Wait()
 	}()

+	p.cmd.Process.Signal(syscall.SIGTERM)
+
 	select {
 	case <-sigtermTimeout.Done():
-		fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
+		fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
 		p.cmd.Process.Kill()
 		p.cmd.Wait()
 	case err := <-sigtermNormal:
@@ -188,6 +188,7 @@ func (p *Process) Stop() {
 			}
 		}
 	}
+
 	p.state = StateStopped
 }

@@ -294,7 +295,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {

 	proxyTo := p.config.Proxy
 	client := &http.Client{}
-	req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
+	req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -67,7 +67,6 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
 	assert.Contains(t, w.Body.String(), "unable to start process")
 }

-// test that the process unloads after the TTL
 func TestProcess_UnloadAfterTTL(t *testing.T) {
 	if testing.Short() {
 		t.Skip("skipping long auto unload TTL test")
@@ -79,7 +78,7 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	config.UnloadAfter = 3 // seconds
 	assert.Equal(t, 3, config.UnloadAfter)

-	process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
+	process := NewProcess("ttl_test", 2, config, NewLogMonitorWriter(io.Discard))
 	defer process.Stop()

 	// this should take 4 seconds
@@ -111,6 +110,33 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	assert.Equal(t, StateStopped, process.CurrentState())
 }

+func TestProcess_LowTTLValue(t *testing.T) {
+	if true { // change this code to run this ...
+		t.Skip("skipping test, edit process_test.go to run it ")
+	}
+
+	config := getTestSimpleResponderConfig("fast_ttl")
+	assert.Equal(t, 0, config.UnloadAfter)
+	config.UnloadAfter = 1 // second
+	assert.Equal(t, 1, config.UnloadAfter)
+
+	process := NewProcess("ttl", 2, config, NewLogMonitorWriter(os.Stdout))
+	defer process.Stop()
+
+	for i := 0; i < 100; i++ {
+		t.Logf("Waiting before sending request %d", i)
+		time.Sleep(1500 * time.Millisecond)
+
+		expected := fmt.Sprintf("echo=test_%d", i)
+		req := httptest.NewRequest("GET", fmt.Sprintf("/slow-respond?echo=%s&delay=50ms", expected), nil)
+		w := httptest.NewRecorder()
+		process.ProxyRequest(w, req)
+		assert.Equal(t, http.StatusOK, w.Code)
+		assert.Contains(t, w.Body.String(), expected)
+	}
+
+}
+
 // issue #19
 func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
 	if testing.Short() {
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
 		})
 	}

+	// see: https://github.com/mostlygeek/llama-swap/issues/42
+	// respond with permissive OPTIONS for any endpoint
+	pm.ginEngine.Use(func(c *gin.Context) {
+		if c.Request.Method == "OPTIONS" {
+			c.Header("Access-Control-Allow-Origin", "*")
+			c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
+			c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
+			c.AbortWithStatus(204)
+			return
+		}
+		c.Next()
+	})
+
 	// Set up routes using the Gin engine
 	pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
 	// Support legacy /v1/completions api, see issue #12
Author	SHA1	Message	Date
Benson Wong	314d2f2212	remove cmd_stop configuration and functionality from PR #40 (#44 ) * remove cmd_stop functionality from #40	2025-01-31 12:42:44 -08:00
Benson Wong	fad25f3e11	Use client request context in proxy request (#43 ) Canceled or closed HTTP requests from clients will also stop the proxied HTTP requests to upstreamed servers.	2025-01-31 10:21:49 -08:00
Benson Wong	2c3e3e27f7	Support OPTIONS requests (#42 ) Add middleware that responds with permissive OPTIONS headers for all request paths.	2025-01-31 10:09:07 -08:00
Benson Wong	baeb0c4e7f	Add cmd_stop configuration to better support docker (#35 ) Add `cmd_stop` to model configuration to run a command instead of sending a SIGTERM to shutdown a process before swapping.	2025-01-30 16:59:57 -08:00
Benson Wong	2833517eef	Improve handling of processes that do not handle SIGTERM (#38 ) - Process TTL goroutine did not have a return after .Stop() - Improve logging - Add test TestProcess_LowTTLValue to measure SIGTERM error rate	2025-01-20 14:39:52 -08:00