Compare commits

...

4 Commits

Author SHA1 Message Date
Benson Wong 5fbd53c616 delay TTL check until after all requests are complete (#25)
- fixes #25 where requests that last longer than the TTL will cause the
  process to be unloaded before the next request.
- new behavior, TTL waits until all requests are complete before
  checking timeout
2024-12-09 19:08:03 -08:00
Benson Wong 97dae50dc4 update readme 2024-12-08 21:34:16 -08:00
Benson Wong cb978f760f add web interface to /logs 2024-12-08 21:26:22 -08:00
Benson Wong 387f0ef6c4 use new timings data in server response in run-benchmark.sh 2024-12-03 20:48:36 -08:00
6 changed files with 136 additions and 44 deletions
+21 -15
View File
@@ -2,20 +2,32 @@
![llama-swap header image](header.jpeg)
llama-swap is a golang server that automatically swaps the llama.cpp server on demand. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
# Introduction
llama-swap is an OpenAI API compatible server that gives you complete control over how you use your hardware. It automatically swaps to the configuration of your choice for serving a model. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
Features:
- ✅ Easy to deploy: single binary with no dependencies
- ✅ Single yaml configuration file
-Automatic switching between models
- ✅ Full control over llama.cpp server settings per model
-On-demand model switching
- ✅ Full control over server settings per model
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
- ✅ Multiple GPU support
- ✅ Run multiple models at once with `profiles`
- ✅ Remote log monitoring at `/log`
- ✅ Automatic unloading of models from GPUs after timeout
## Releases
Builds for Linux and OSX are available on the [Releases](https://github.com/mostlygeek/llama-swap/releases) page.
### Building from source
1. Install golang for your system
1. `git clone git@github.com:mostlygeek/llama-swap.git`
1. `make clean all`
1. Binaries will be in `build/` subdirectory
## config.yaml
llama-swap's configuration is purposefully simple.
@@ -83,22 +95,22 @@ More [examples](examples/README.md) are available for different use cases.
## Monitoring Logs
The `/logs` endpoint is available to monitor what llama-swap is doing. It will send the last 10KB of logs. Useful for monitoring the output of llama-server. It also supports streaming of logs.
Open the `http://<host>/logs` with your browser to get a web interface with streaming logs.
Usage:
Of course, CLI access is also supported:
```
# sends up to the last 10KB of logs
curl http://host/logs'
# streams logs using chunk encoding
# streams logs
curl -Ns 'http://host/logs/stream'
# stream and filter logs with linux pipes
curl -Ns http://host/logs/stream | grep 'eval time'
# skips history and just streams new log entries
curl -Ns 'http://host/logs/stream?no-history'
# streams logs using Server Sent Events
curl -Ns 'http://host/logs/streamSSE'
```
## Systemd Unit Files
@@ -125,9 +137,3 @@ StartLimitInterval=30
[Install]
WantedBy=multi-user.target
```
## Building from Source
1. Install golang for your system
1. run `make clean all`
1. binaries will be built into `build/` directory
+8 -11
View File
@@ -22,22 +22,19 @@ for model in "$@"; do
echo -n "$model,"
for lang in "python" "typescript" "swift"; do
response=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"temperature\": 0.1, \"model\":\"$model\"}")
# expects a llama.cpp after PR https://github.com/ggerganov/llama.cpp/pull/10548
# (Dec 3rd/2024)
time=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"top_k\": 1, \"timings_per_token\":true, \"model\":\"$model\"}" | jq -r .timings.predicted_per_second)
if [ $? -ne 0 ]; then
time="error"
else
time=$(curl -s --url "$url/logs" | grep -oE '\d+(?:\.\d+)? tokens per second' | awk '{print $1}' | tail -n 1)
if [ $? -ne 0 ]; then
time="error"
fi
exit 1
fi
if [ "$lang" != "swift" ]; then
echo -n "$time,"
printf "%0.2f tps," $time
else
echo -n "$time"
printf "%0.2f tps\n" $time
fi
done
echo ""
done
done
+53
View File
@@ -0,0 +1,53 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Logs</title>
<style>
body {
margin: 0;
height: 100vh;
display: flex;
flex-direction: column;
font-family: "Courier New", Courier, monospace;
}
#log-stream {
flex: 1;
margin: 1em;
padding: 10px;
background: #f4f4f4;
overflow-y: auto;
white-space: pre-wrap; /* Ensures line wrapping */
word-wrap: break-word; /* Ensures long words wrap */
}
</style>
</head>
<body>
<pre id="log-stream">Waiting for logs...
</pre>
<script>
// Establish an EventSource connection to the SSE endpoint
if (typeof(EventSource) !== "undefined") {
const eventSource = new EventSource("/logs/streamSSE");
eventSource.onmessage = function(event) {
// Append the new log message to the <pre> element
const logStream = document.getElementById('log-stream');
logStream.textContent += event.data;
// Auto-scroll to the bottom
logStream.scrollTop = logStream.scrollHeight;
};
eventSource.onerror = function(err) {
console.error("EventSource failed:", err);
};
} else {
console.error("SSE not supported by this browser.");
}
</script>
</body>
</html>
+9 -8
View File
@@ -122,16 +122,15 @@ func (p *Process) start() error {
// start a goroutine to check every second if
// the process should be stopped
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
maxDuration := time.Duration(p.config.UnloadAfter) * time.Second
for {
<-ticker.C
for range time.Tick(time.Second) {
// wait for all inflight requests to complete and ticker
p.inFlightRequests.Wait()
if time.Since(p.lastRequestHandled) > maxDuration {
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %d reached.\n", p.ID, p.config.UnloadAfter)
p.Stop()
return
}
}
}()
@@ -275,7 +274,11 @@ func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
p.inFlightRequests.Add(1)
defer p.inFlightRequests.Done()
defer func() {
p.lastRequestHandled = time.Now()
p.inFlightRequests.Done()
}()
if p.CurrentState() != StateReady {
if err := p.start(); err != nil {
@@ -285,8 +288,6 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
}
}
p.lastRequestHandled = time.Now()
proxyTo := p.config.Proxy
client := &http.Client{}
req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
+17 -4
View File
@@ -82,18 +82,31 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
defer process.Stop()
req := httptest.NewRequest("GET", "/test", nil)
// this should take 4 seconds
req1 := httptest.NewRequest("GET", "/slow-respond?echo=1234&delay=1000ms", nil)
req2 := httptest.NewRequest("GET", "/test", nil)
w := httptest.NewRecorder()
// Proxy the request (auto start)
process.ProxyRequest(w, req)
// Proxy the request (auto start) with a slow response that takes longer than config.UnloadAfter
process.ProxyRequest(w, req1)
t.Log("sending slow first request (4 seconds)")
assert.Equal(t, http.StatusOK, w.Code, "Expected status code %d, got %d", http.StatusOK, w.Code)
assert.Contains(t, w.Body.String(), "1234")
assert.Equal(t, StateReady, process.CurrentState())
// ensure the TTL timeout does not race slow requests (see issue #25)
t.Log("sending second request (1 second)")
time.Sleep(time.Second)
w = httptest.NewRecorder()
process.ProxyRequest(w, req2)
assert.Equal(t, http.StatusOK, w.Code, "Expected status code %d, got %d", http.StatusOK, w.Code)
assert.Contains(t, w.Body.String(), expectedMessage)
assert.Equal(t, StateReady, process.CurrentState())
// wait 5 seconds
t.Log("sleep 5 seconds and check if unloaded")
time.Sleep(5 * time.Second)
assert.Equal(t, StateStopped, process.CurrentState())
}
+28 -6
View File
@@ -1,19 +1,41 @@
package proxy
import (
"embed"
"fmt"
"net/http"
"strings"
"github.com/gin-gonic/gin"
)
//go:embed html/logs.html
var logsHTML []byte
// make sure embed is kept there by the IDE auto-package importer
var _ = embed.FS{}
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
c.Header("Content-Type", "text/plain")
history := pm.logMonitor.GetHistory()
_, err := c.Writer.Write(history)
if err != nil {
c.AbortWithError(http.StatusInternalServerError, err)
return
accept := c.GetHeader("Accept")
if strings.Contains(accept, "text/html") {
// Set the Content-Type header to text/html
c.Header("Content-Type", "text/html")
// Write the embedded HTML content to the response
_, err := c.Writer.Write(logsHTML)
if err != nil {
c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("failed to write response: %v", err))
return
}
} else {
c.Header("Content-Type", "text/plain")
history := pm.logMonitor.GetHistory()
_, err := c.Writer.Write(history)
if err != nil {
c.AbortWithError(http.StatusInternalServerError, err)
return
}
}
}