remove cmd_stop configuration and functionality from PR #40 (#44)

* remove cmd_stop functionality from #40
Use client request context in proxy request (#43)
2025-01-31 12:42:44 -08:00 · 2025-01-31 10:21:49 -08:00 · 2025-01-31 10:09:07 -08:00 · 2025-01-30 16:59:57 -08:00
5 changed files with 48 additions and 6 deletions
@@ -30,6 +30,7 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
  - `v1/rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
 - ✅ Multiple GPU support
 - ✅ Docker and Podman support
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
@@ -89,6 +90,15 @@ models:
    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
    unlisted: true
  # Docker Support (v26.1.4+ required!)
  "docker-llama":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
 # profiles make it easy to manage multi-model (and GPU) configurations.
 #
 # Tips:
@@ -53,6 +53,14 @@ models:
      --ctx-size 8192
      --reranking
  # Docker Support (v26.1.4+ required!)
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
  "simple":
    # example of setting environment variables
@@ -4,6 +4,8 @@ import (
 	"flag"
 	"fmt"
 	"os"
 	"os/signal"
 	"syscall"
 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/proxy"
@@ -39,6 +41,16 @@ func main() {
 	}
 	proxyManager := proxy.New(config)
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-sigChan
 		fmt.Println("Shutting down llama-swap")
 		proxyManager.StopProcesses()
 		os.Exit(0)
 	}()
 	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := proxyManager.Run(*listenStr); err != nil {
 		fmt.Printf("Server error: %v\n", err)
@@ -153,19 +153,17 @@ func (p *Process) Stop() {
 	defer p.stateMutex.Unlock()
 	if p.state != StateReady {
 		fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
 		return
 	}
 	if p.cmd == nil || p.cmd.Process == nil {
 		// this situation should never happen... but if it does just update the state
-		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
+		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
 		p.state = StateStopped
 		return
 	}
 	// Pretty sure this stopping code needs some work for windows and
 	// will be a source of pain in the future.
 	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
@@ -190,6 +188,7 @@ func (p *Process) Stop() {
 			}
 		}
 	}
 	p.state = StateStopped
 }
@@ -296,7 +295,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 	proxyTo := p.config.Proxy
 	client := &http.Client{}
-	req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
+	req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
 	if err != nil {
 		http.Error(w, err.Error(), http.StatusInternalServerError)
 		return
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
 		})
 	}
 	// see: https://github.com/mostlygeek/llama-swap/issues/42
 	// respond with permissive OPTIONS for any endpoint
 	pm.ginEngine.Use(func(c *gin.Context) {
 		if c.Request.Method == "OPTIONS" {
 			c.Header("Access-Control-Allow-Origin", "*")
 			c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
 			c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
 			c.AbortWithStatus(204)
 			return
 		}
 		c.Next()
 	})
 	// Set up routes using the Gin engine
 	pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
 	// Support legacy /v1/completions api, see issue #12
Author	SHA1	Message	Date
Benson Wong	314d2f2212	remove cmd_stop configuration and functionality from PR #40 (#44) * remove cmd_stop functionality from #40	2025-01-31 12:42:44 -08:00
Benson Wong	fad25f3e11	Use client request context in proxy request (#43) Canceled or closed HTTP requests from clients will also stop the proxied HTTP requests to upstreamed servers.	2025-01-31 10:21:49 -08:00
Benson Wong	2c3e3e27f7	Support OPTIONS requests (#42) Add middleware that responds with permissive OPTIONS headers for all request paths.	2025-01-31 10:09:07 -08:00
Benson Wong	baeb0c4e7f	Add cmd_stop configuration to better support docker (#35) Add `cmd_stop` to model configuration to run a command instead of sending a SIGTERM to shutdown a process before swapping.	2025-01-30 16:59:57 -08:00