remove cmd_stop configuration and functionality from PR #40 (#44 )

* remove cmd_stop functionality from #40
Use client request context in proxy request (#43 )
2025-01-31 12:42:44 -08:00 · 2025-01-31 10:21:49 -08:00 · 2025-01-31 10:09:07 -08:00 · 2025-01-30 16:59:57 -08:00 · 2025-01-20 14:39:52 -08:00
6 changed files with 80 additions and 10 deletions
@@ -30,6 +30,7 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
  - `v1/rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
 - ✅ Multiple GPU support
 - ✅ Docker and Podman support
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
@@ -89,6 +90,15 @@ models:
    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
    unlisted: true
  # Docker Support (v26.1.4+ required!)
  "docker-llama":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
 # profiles make it easy to manage multi-model (and GPU) configurations.
 #
 # Tips:
@@ -53,6 +53,14 @@ models:
      --ctx-size 8192
      --reranking
  # Docker Support (v26.1.4+ required!)
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
  "simple":
    # example of setting environment variables
@@ -4,6 +4,8 @@ import (
 	"flag"
 	"fmt"
 	"os"
 	"os/signal"
 	"syscall"
 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/proxy"
@@ -39,6 +41,16 @@ func main() {
 	}
 	proxyManager := proxy.New(config)
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-sigChan
 		fmt.Println("Shutting down llama-swap")
 		proxyManager.StopProcesses()
 		os.Exit(0)
 	}()
 	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := proxyManager.Run(*listenStr); err != nil {
 		fmt.Printf("Server error: %v\n", err)
@@ -135,6 +135,7 @@ func (p *Process) start() error {
 				if time.Since(p.lastRequestHandled) > maxDuration {
 					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
 					p.Stop()
 					return
 				}
 			}
 		}()
@@ -152,20 +153,17 @@ func (p *Process) Stop() {
 	defer p.stateMutex.Unlock()
 	if p.state != StateReady {
 		fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
 		return
 	}
 	if p.cmd == nil || p.cmd.Process == nil {
 		// this situation should never happen... but if it does just update the state
-		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
+		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
 		p.state = StateStopped
 		return
 	}
 	// Pretty sure this stopping code needs some work for windows and
 	// will be a source of pain in the future.
 	p.cmd.Process.Signal(syscall.SIGTERM)
 	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
@@ -174,9 +172,11 @@ func (p *Process) Stop() {
 		sigtermNormal <- p.cmd.Wait()
 	}()
 	p.cmd.Process.Signal(syscall.SIGTERM)
 	select {
 	case <-sigtermTimeout.Done():
-		fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
+		fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
 		p.cmd.Process.Kill()
 		p.cmd.Wait()
 	case err := <-sigtermNormal:
@@ -188,6 +188,7 @@ func (p *Process) Stop() {
 			}
 		}
 	}
 	p.state = StateStopped
 }
@@ -294,7 +295,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 	proxyTo := p.config.Proxy
 	client := &http.Client{}
-	req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
+	req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -67,7 +67,6 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
 	assert.Contains(t, w.Body.String(), "unable to start process")
 }
 // test that the process unloads after the TTL
 func TestProcess_UnloadAfterTTL(t *testing.T) {
 	if testing.Short() {
 		t.Skip("skipping long auto unload TTL test")
@@ -79,7 +78,7 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	config.UnloadAfter = 3 // seconds
 	assert.Equal(t, 3, config.UnloadAfter)
-	process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
+	process := NewProcess("ttl_test", 2, config, NewLogMonitorWriter(io.Discard))
 	defer process.Stop()
 	// this should take 4 seconds
@@ -111,6 +110,33 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	assert.Equal(t, StateStopped, process.CurrentState())
 }
 // TestProcess_LowTTLValue proxies a series of requests through a process
 // whose UnloadAfter is set to one second, pausing 1.5 seconds before each
 // request. Every request is expected to return 200 and echo its payload.
 //
 // The test is skipped unconditionally; flip runManually below to run it.
 func TestProcess_LowTTLValue(t *testing.T) {
 	const runManually = false // change this to true to run this test
 	if !runManually {
 		t.Skip("skipping test, edit process_test.go to run it ")
 	}

 	const (
 		attempts = 100
 		pause    = 1500 * time.Millisecond // longer than the UnloadAfter set below
 	)

 	cfg := getTestSimpleResponderConfig("fast_ttl")
 	assert.Equal(t, 0, cfg.UnloadAfter)
 	cfg.UnloadAfter = 1 // second
 	assert.Equal(t, 1, cfg.UnloadAfter)

 	proc := NewProcess("ttl", 2, cfg, NewLogMonitorWriter(os.Stdout))
 	defer proc.Stop()

 	for attempt := 0; attempt < attempts; attempt++ {
 		t.Logf("Waiting before sending request %d", attempt)
 		time.Sleep(pause)

 		want := fmt.Sprintf("echo=test_%d", attempt)
 		target := fmt.Sprintf("/slow-respond?echo=%s&delay=50ms", want)
 		recorder := httptest.NewRecorder()
 		proc.ProxyRequest(recorder, httptest.NewRequest("GET", target, nil))

 		assert.Equal(t, http.StatusOK, recorder.Code)
 		assert.Contains(t, recorder.Body.String(), want)
 	}
 }
 // issue #19
 func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
 	if testing.Short() {
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
 		})
 	}
 	// see: https://github.com/mostlygeek/llama-swap/issues/42
 	// respond with permissive OPTIONS for any endpoint
 	pm.ginEngine.Use(func(c *gin.Context) {
 		if c.Request.Method == "OPTIONS" {
 			c.Header("Access-Control-Allow-Origin", "*")
 			c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
 			c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
 			c.AbortWithStatus(204)
 			return
 		}
 		c.Next()
 	})
 	// Set up routes using the Gin engine
 	pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
 	// Support legacy /v1/completions api, see issue #12
Author	SHA1	Message	Date
Benson Wong	314d2f2212	remove cmd_stop configuration and functionality from PR #40 (#44 ) * remove cmd_stop functionality from #40	2025-01-31 12:42:44 -08:00
Benson Wong	fad25f3e11	Use client request context in proxy request (#43 ) Canceled or closed HTTP requests from clients will also stop the proxied HTTP requests to upstreamed servers.	2025-01-31 10:21:49 -08:00
Benson Wong	2c3e3e27f7	Support OPTIONS requests (#42 ) Add middleware that responds with permissive OPTIONS headers for all request paths.	2025-01-31 10:09:07 -08:00
Benson Wong	baeb0c4e7f	Add cmd_stop configuration to better support docker (#35 ) Add `cmd_stop` to model configuration to run a command instead of sending a SIGTERM to shutdown a process before swapping.	2025-01-30 16:59:57 -08:00
Benson Wong	2833517eef	Improve handling of process that do not handle SIGTERM (#38 ) - Process TTL goroutine did not have a return after .Stop() - Improve logging - Add test TestProcess_LowTTLValue to measure SIGTERM error rate	2025-01-20 14:39:52 -08:00