update readme

add web interface to /logs
use new timings data in server response in run-benchmark.sh
2024-12-08 21:34:16 -08:00 · 2024-12-08 21:26:22 -08:00 · 2024-12-03 20:48:36 -08:00
4 changed files with 110 additions and 32 deletions
@@ -2,20 +2,32 @@
 ![llama-swap header image](header.jpeg)
-llama-swap is a golang server that automatically swaps the llama.cpp server on demand. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
+# Introduction
 llama-swap is an OpenAI API compatible server that gives you complete control over how you use your hardware. It automatically swaps to the configuration of your choice for serving a model. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
 Features:
 - ✅ Easy to deploy: single binary with no dependencies
 - ✅ Single yaml configuration file
- ✅ Automatic switching between models
+- ✅ On-demand model switching
- ✅ Full control over llama.cpp server settings per model
+- ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
 - ✅ Multiple GPU support
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/logs`
 - ✅ Automatic unloading of models from GPUs after timeout
 ## Releases
 Builds for Linux and OSX are available on the [Releases](https://github.com/mostlygeek/llama-swap/releases) page.
 ### Building from source
 1. Install golang for your system
 1. `git clone git@github.com:mostlygeek/llama-swap.git`
 1. `make clean all`
 1. Binaries will be in `build/` subdirectory
 ## config.yaml
 llama-swap's configuration is purposefully simple.
@@ -83,22 +95,22 @@ More [examples](examples/README.md) are available for different use cases.
 ## Monitoring Logs
-The `/logs` endpoint is available to monitor what llama-swap is doing. It will send the last 10KB of logs. Useful for monitoring the output of llama-server. It also supports streaming of logs.
+Open `http://<host>/logs` in your browser to get a web interface with streaming logs.
-Usage:
+Of course, CLI access is also supported:
 ```
 # sends up to the last 10KB of logs
 curl 'http://host/logs'
-# streams logs using chunk encoding
+# streams logs
 curl -Ns 'http://host/logs/stream'
 # stream and filter logs with linux pipes
 curl -Ns http://host/logs/stream | grep 'eval time'
 # skips history and just streams new log entries
 curl -Ns 'http://host/logs/stream?no-history'
 # streams logs using Server Sent Events
 curl -Ns 'http://host/logs/streamSSE'
 ```
 ## Systemd Unit Files
@@ -125,9 +137,3 @@ StartLimitInterval=30
 [Install]
 WantedBy=multi-user.target
 ```
 ## Building from Source
 1. Install golang for your system
 1. run `make clean all`
 1. binaries will be built into `build/` directory
@@ -22,22 +22,19 @@ for model in "$@"; do
    echo -n "$model,"
    for lang in "python" "typescript" "swift"; do
-        response=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"temperature\": 0.1, \"model\":\"$model\"}")
+        # expects a llama.cpp build that includes PR https://github.com/ggerganov/llama.cpp/pull/10548
        # (Dec 3rd/2024)
        time=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"top_k\": 1, \"timings_per_token\":true, \"model\":\"$model\"}" | jq -r .timings.predicted_per_second)
        if [ $? -ne 0 ]; then
            time="error"
-        else
+            exit 1
            time=$(curl -s --url "$url/logs" | grep -oE '\d+(?:\.\d+)? tokens per second' | awk '{print $1}' | tail -n 1)
            if [ $? -ne 0 ]; then
                time="error"
            fi
        fi
        if [ "$lang" != "swift" ]; then
-            echo -n "$time,"
+            printf "%0.2f tps," $time
        else
-            echo -n "$time"
+            printf "%0.2f tps\n" $time
        fi
    done
    echo ""
 done
@@ -0,0 +1,53 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Logs</title>
    <style>
        body {
            margin: 0;
            height: 100vh;
            display: flex;
            flex-direction: column;
            font-family: "Courier New", Courier, monospace;
        }
        #log-stream {
            flex: 1;
            margin: 1em;
            padding: 10px;
            background: #f4f4f4;
            overflow-y: auto;
            white-space: pre-wrap; /* Ensures line wrapping */
            word-wrap: break-word; /* Ensures long words wrap */
        }
    </style>
 </head>
 <body>
    <pre id="log-stream">Waiting for logs...
 </pre>
    <script>
        // Establish an EventSource connection to the SSE endpoint
        if (typeof(EventSource) !== "undefined") {
            const eventSource = new EventSource("/logs/streamSSE");
            eventSource.onmessage = function(event) {
                // Append the new log message to the <pre> element
                const logStream = document.getElementById('log-stream');
                logStream.textContent += event.data;
                // Auto-scroll to the bottom
                logStream.scrollTop = logStream.scrollHeight;
            };
            eventSource.onerror = function(err) {
                console.error("EventSource failed:", err);
            };
        } else {
            console.error("SSE not supported by this browser.");
        }
    </script>
 </body>
 </html>
@@ -1,19 +1,41 @@
 package proxy
 import (
 	"embed"
 	"fmt"
 	"net/http"
 	"strings"
 	"github.com/gin-gonic/gin"
 )
 //go:embed html/logs.html
 var logsHTML []byte
 // make sure embed is kept there by the IDE auto-package importer
 var _ = embed.FS{}
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
-	c.Header("Content-Type", "text/plain")
+
-	history := pm.logMonitor.GetHistory()
+	accept := c.GetHeader("Accept")
-	_, err := c.Writer.Write(history)
+	if strings.Contains(accept, "text/html") {
-	if err != nil {
+		// Set the Content-Type header to text/html
-		c.AbortWithError(http.StatusInternalServerError, err)
+		c.Header("Content-Type", "text/html")
-		return
+
 		// Write the embedded HTML content to the response
 		_, err := c.Writer.Write(logsHTML)
 		if err != nil {
 			c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("failed to write response: %v", err))
 			return
 		}
 	} else {
 		c.Header("Content-Type", "text/plain")
 		history := pm.logMonitor.GetHistory()
 		_, err := c.Writer.Write(history)
 		if err != nil {
 			c.AbortWithError(http.StatusInternalServerError, err)
 			return
 		}
 	}
 }
Author	SHA1	Message	Date
Benson Wong	97dae50dc4	update readme	2024-12-08 21:34:16 -08:00
Benson Wong	cb978f760f	add web interface to /logs	2024-12-08 21:26:22 -08:00
Benson Wong	387f0ef6c4	use new timings data in server response in run-benchmark.sh	2024-12-03 20:48:36 -08:00