create tag to release

Change versioning to use git commits counts instead of semver
- less work for me - more frequent releases
2024-12-14 10:07:20 -08:00 · 2024-12-14 09:53:13 -08:00 · 2024-12-09 19:14:49 -08:00 · 2024-12-09 19:08:03 -08:00 · 2024-12-08 21:34:16 -08:00 · 2024-12-08 21:26:22 -08:00
9 changed files with 174 additions and 52 deletions
@@ -2,8 +2,8 @@ name: goreleaser
 on:
  push:
-    tags:
+    branches:
-      - '*'
+      - main
 permissions:
  contents: write
@@ -20,14 +20,23 @@ jobs:
      -
        name: Set up Go
        uses: actions/setup-go@v5
      # Derive the release number from the number of commits reachable from HEAD.
      # NOTE(review): this count is only meaningful if the checkout step fetched
      # the full history (e.g. fetch-depth: 0) — confirm against the checkout step.
      - name: Get commit count
        id: get_commit_count
        run: echo "COMMIT_COUNT=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
      # Create an annotated tag v<COMMIT_COUNT> and push it to origin.
      # NOTE(review): presumably this step fails if the same tag already exists
      # (e.g. when the job is re-run for the same commit) — verify.
      - name: Create release tag
        run: |
          git config user.name github-actions
          git config user.email github-actions@github.com
          git tag -a v${{ steps.get_commit_count.outputs.COMMIT_COUNT }} -m "Release v${{ steps.get_commit_count.outputs.COMMIT_COUNT }}"
          git push origin v${{ steps.get_commit_count.outputs.COMMIT_COUNT }}
      -
        name: Run GoReleaser
        uses: goreleaser/goreleaser-action@v6
        with:
          # either 'goreleaser' (default) or 'goreleaser-pro'
          distribution: goreleaser
-          # 'latest', 'nightly', or a semver
+          version: latest
-          version: '~> v2'
+          args: release --clean --snapshot
          args: release --clean
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -2,6 +2,16 @@
 APP_NAME = llama-swap
 BUILD_DIR = build
 # Short hash of the commit currently checked out (HEAD)
 GIT_HASH := $(shell git rev-parse --short HEAD)
 ifneq ($(shell git status --porcelain),)
    # `git status --porcelain` printed something, so the working tree has
    # uncommitted or untracked changes; mark the hash with a trailing "+"
    GIT_HASH := $(GIT_HASH)+
 endif
 # Build number: the number of commits reachable from HEAD (whatever is
 # checked out, which is not necessarily the main branch)
 COMMIT_COUNT := $(shell git rev-list --count HEAD)
 # Default target: Builds binaries for both OSX and Linux
 all: mac linux simple-responder
@@ -18,12 +28,12 @@ test-all:
 # Build OSX binary
 mac:
 	@echo "Building Mac binary..."
-	GOOS=darwin GOARCH=arm64 go build -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
+	GOOS=darwin GOARCH=arm64 go build -ldflags="-X main.GIT_HASH=${GIT_HASH} -X main.COMMIT_COUNT=${COMMIT_COUNT}" -o $(BUILD_DIR)/$(APP_NAME)-darwin-arm64
 # Build Linux binary
 linux:
 	@echo "Building Linux binary..."
-	GOOS=linux GOARCH=amd64 go build -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
+	GOOS=linux GOARCH=amd64 go build -ldflags="-X main.GIT_HASH=${GIT_HASH} -X main.COMMIT_COUNT=${COMMIT_COUNT}" -o $(BUILD_DIR)/$(APP_NAME)-linux-amd64
 # for testing proxy.Process
 simple-responder:
@@ -2,19 +2,32 @@
 ![llama-swap header image](header.jpeg)
-llama-swap is a golang server that automatically swaps the llama.cpp server on demand. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
+# Introduction
 llama-swap is an OpenAI API compatible server that gives you complete control over how you use your hardware. It automatically swaps to the configuration of your choice for serving a model. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
 Features:
 - ✅ Easy to deploy: single binary with no dependencies
 - ✅ Single yaml configuration file
- ✅ Automatic switching between models
+- ✅ On-demand model switching
- ✅ Full control over llama.cpp server settings per model
+- ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
 - ✅ Multiple GPU support
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
 - ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabbyAPI, etc)
 ## Releases
 Builds for Linux and OSX are available on the [Releases](https://github.com/mostlygeek/llama-swap/releases) page.
 ### Building from source
 1. Install golang for your system
 1. `git clone git@github.com:mostlygeek/llama-swap.git`
 1. `make clean all`
 1. Binaries will be in `build/` subdirectory
 ## config.yaml
@@ -83,22 +96,22 @@ More [examples](examples/README.md) are available for different use cases.
 ## Monitoring Logs
-The `/logs` endpoint is available to monitor what llama-swap is doing. It will send the last 10KB of logs. Useful for monitoring the output of llama-server. It also supports streaming of logs.
+Open `http://<host>/logs` in your browser to get a web interface with streaming logs.
-Usage:
+Of course, CLI access is also supported:
 ```
 # sends up to the last 10KB of logs
 curl 'http://host/logs'
-# streams logs using chunk encoding
+# streams logs
 curl -Ns 'http://host/logs/stream'
 # stream and filter logs with linux pipes
 curl -Ns http://host/logs/stream | grep 'eval time'
 # skips history and just streams new log entries
 curl -Ns 'http://host/logs/stream?no-history'
 # streams logs using Server Sent Events
 curl -Ns 'http://host/logs/streamSSE'
 ```
 ## Systemd Unit Files
@@ -125,9 +138,3 @@ StartLimitInterval=30
 [Install]
 WantedBy=multi-user.target
 ```
 ## Building from Source
 1. Install golang for your system
 1. run `make clean all`
 1. binaries will be built into `build/` directory
@@ -22,22 +22,19 @@ for model in "$@"; do
    echo -n "$model,"
    for lang in "python" "typescript" "swift"; do
-        response=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"temperature\": 0.1, \"model\":\"$model\"}")
+        # expects a llama.cpp after PR https://github.com/ggerganov/llama.cpp/pull/10548
        # (Dec 3rd/2024)
        time=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"top_k\": 1, \"timings_per_token\":true, \"model\":\"$model\"}" | jq -r .timings.predicted_per_second)
        if [ $? -ne 0 ]; then
            time="error"
-        else
+            exit 1
            time=$(curl -s --url "$url/logs" | grep -oE '\d+(?:\.\d+)? tokens per second' | awk '{print $1}' | tail -n 1)
            if [ $? -ne 0 ]; then
                time="error"
            fi
        fi
        if [ "$lang" != "swift" ]; then
-            echo -n "$time,"
+            printf "%0.2f tps," $time
        else
-            echo -n "$time"
+            printf "%0.2f tps\n" $time
        fi
    done
-
+done
    echo ""
 done
@@ -9,13 +9,23 @@ import (
 	"github.com/mostlygeek/llama-swap/proxy"
 )
 // GIT_HASH and COMMIT_COUNT hold the build metadata printed by the -version
 // flag. The values below are development placeholders; see the Makefile,
 // which injects new values at build time through the linker flags
 // -X main.GIT_HASH=... and -X main.COMMIT_COUNT=... (the variable names
 // must keep matching those flags).
 var GIT_HASH string = "abcd1234"
 var COMMIT_COUNT string = "0-dev"
 func main() {
 	// Define a command-line flag for the port
 	configPath := flag.String("config", "config.yaml", "config file name")
 	listenStr := flag.String("listen", ":8080", "listen ip/port")
 	showVersion := flag.Bool("version", false, "show version of build")
 	flag.Parse() // Parse the command-line flags
 	if *showVersion {
 		fmt.Printf("version: v%s (%s)\n", COMMIT_COUNT, GIT_HASH)
 		os.Exit(0)
 	}
 	config, err := proxy.LoadConfig(*configPath)
 	if err != nil {
 		fmt.Printf("Error loading config: %v\n", err)
@@ -0,0 +1,53 @@
 <!DOCTYPE html>
 <!--
   Log viewer page: opens a Server-Sent Events connection to /logs/streamSSE
   and appends every message it receives to the #log-stream element,
   scrolling that element to the bottom after each message.
 -->
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Logs</title>
    <style>
        body {
            margin: 0;
            height: 100vh;
            display: flex;
            flex-direction: column;
            font-family: "Courier New", Courier, monospace;
        }
        #log-stream {
            flex: 1;
            margin: 1em;
            padding: 10px;
            background: #f4f4f4;
            overflow-y: auto;
            white-space: pre-wrap; /* Ensures line wrapping */
            word-wrap: break-word; /* Ensures long words wrap */
        }
    </style>
 </head>
 <body>
    <!-- The placeholder text is never cleared; incoming log data is appended after it. -->
    <pre id="log-stream">Waiting for logs...
 </pre>
    <!--
      Streaming client: errors (including a browser without EventSource support)
      are only reported to the developer console, not shown on the page.
    -->
    <script>
        // Establish an EventSource connection to the SSE endpoint
        if (typeof(EventSource) !== "undefined") {
            const eventSource = new EventSource("/logs/streamSSE");
            eventSource.onmessage = function(event) {
                // Append the new log message to the <pre> element
                const logStream = document.getElementById('log-stream');
                logStream.textContent += event.data;
                // Auto-scroll to the bottom
                logStream.scrollTop = logStream.scrollHeight;
            };
            eventSource.onerror = function(err) {
                console.error("EventSource failed:", err);
            };
        } else {
            console.error("SSE not supported by this browser.");
        }
    </script>
 </body>
 </html>
@@ -122,16 +122,15 @@ func (p *Process) start() error {
 		// start a goroutine to check every second if
 		// the process should be stopped
 		go func() {
 			ticker := time.NewTicker(time.Second)
 			defer ticker.Stop()
 			maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
-			for {
+			for range time.Tick(time.Second) {
-				<-ticker.C
+				// wait for all inflight requests to complete and ticker
 				p.inFlightRequests.Wait()
 				if time.Since(p.lastRequestHandled) > maxDuration {
 					fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter)
 					p.Stop()
 					return
 				}
 			}
 		}()
@@ -275,7 +274,11 @@ func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
 func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 	p.inFlightRequests.Add(1)
-	defer p.inFlightRequests.Done()
+
 	defer func() {
 		p.lastRequestHandled = time.Now()
 		p.inFlightRequests.Done()
 	}()
 	if p.CurrentState() != StateReady {
 		if err := p.start(); err != nil {
@@ -285,8 +288,6 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 	p.lastRequestHandled = time.Now()
 	proxyTo := p.config.Proxy
 	client := &http.Client{}
 	req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
@@ -82,18 +82,31 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
 	process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
 	defer process.Stop()
-	req := httptest.NewRequest("GET", "/test", nil)
+	// this should take 4 seconds
 	req1 := httptest.NewRequest("GET", "/slow-respond?echo=1234&delay=1000ms", nil)
 	req2 := httptest.NewRequest("GET", "/test", nil)
 	w := httptest.NewRecorder()
-	// Proxy the request (auto start)
+	// Proxy the request (auto start) with a slow response that takes longer than config.UnloadAfter
-	process.ProxyRequest(w, req)
+	process.ProxyRequest(w, req1)
 	t.Log("sending slow first request (4 seconds)")
 	assert.Equal(t, http.StatusOK, w.Code, "Expected status code %d, got %d", http.StatusOK, w.Code)
 	assert.Contains(t, w.Body.String(), "1234")
 	assert.Equal(t, StateReady, process.CurrentState())
 	// ensure the TTL timeout does not race slow requests (see issue #25)
 	t.Log("sending second request (1 second)")
 	time.Sleep(time.Second)
 	w = httptest.NewRecorder()
 	process.ProxyRequest(w, req2)
 	assert.Equal(t, http.StatusOK, w.Code, "Expected status code %d, got %d", http.StatusOK, w.Code)
 	assert.Contains(t, w.Body.String(), expectedMessage)
 	assert.Equal(t, StateReady, process.CurrentState())
 	// wait 5 seconds
 	t.Log("sleep 5 seconds and check if unloaded")
 	time.Sleep(5 * time.Second)
 	assert.Equal(t, StateStopped, process.CurrentState())
 }
@@ -1,19 +1,41 @@
 package proxy
 import (
 	"embed"
 	"fmt"
 	"net/http"
 	"strings"
 	"github.com/gin-gonic/gin"
 )
 //go:embed html/logs.html
 var logsHTML []byte
 // make sure embed is kept there by the IDE auto-package importer
 var _ = embed.FS{}
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
-	c.Header("Content-Type", "text/plain")
+
-	history := pm.logMonitor.GetHistory()
+	accept := c.GetHeader("Accept")
-	_, err := c.Writer.Write(history)
+	if strings.Contains(accept, "text/html") {
-	if err != nil {
+		// Set the Content-Type header to text/html
-		c.AbortWithError(http.StatusInternalServerError, err)
+		c.Header("Content-Type", "text/html")
-		return
+
 		// Write the embedded HTML content to the response
 		_, err := c.Writer.Write(logsHTML)
 		if err != nil {
 			c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("failed to write response: %v", err))
 			return
 		}
 	} else {
 		c.Header("Content-Type", "text/plain")
 		history := pm.logMonitor.GetHistory()
 		_, err := c.Writer.Write(history)
 		if err != nil {
 			c.AbortWithError(http.StatusInternalServerError, err)
 			return
 		}
 	}
 }
Author	SHA1	Message	Date
Benson Wong	a955a4a5c0	create tag to release	2024-12-14 10:07:20 -08:00
Benson Wong	22d3f1a4f9	Change versioning to use git commits counts instead of semver - less work for me - more frequent releases	2024-12-14 09:53:13 -08:00
Benson Wong	e2443251ad	update readme	2024-12-09 19:14:49 -08:00
Benson Wong	5fbd53c616	delay TTL check until after all requests are complete (#25 ) - fixes #25 where requests that last longer than the TTL will cause the process to be unloaded before the next request. - new behavior, TTL waits until all requests are complete before checking timeout	2024-12-09 19:08:03 -08:00
Benson Wong	97dae50dc4	update readme	2024-12-08 21:34:16 -08:00
Benson Wong	cb978f760f	add web interface to /logs	2024-12-08 21:26:22 -08:00
Benson Wong	387f0ef6c4	use new timings data in server response in run-benchmark.sh	2024-12-03 20:48:36 -08:00