Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 97dae50dc4 | |||
| cb978f760f | |||
| 387f0ef6c4 |
@@ -2,20 +2,32 @@
|
|||||||
|
|
||||||

|

|
||||||
|
|
||||||
llama-swap is a golang server that automatically swaps the llama.cpp server on demand. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
|
# Introduction
|
||||||
|
llama-swap is an OpenAI API compatible server that gives you complete control over how you use your hardware. It automatically swaps to the configuration of your choice for serving a model. Since [llama.cpp's server](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) can't swap models, let's swap the server instead!
|
||||||
|
|
||||||
Features:
|
Features:
|
||||||
|
|
||||||
- ✅ Easy to deploy: single binary with no dependencies
|
- ✅ Easy to deploy: single binary with no dependencies
|
||||||
- ✅ Single yaml configuration file
|
- ✅ Single yaml configuration file
|
||||||
- ✅ Automatic switching between models
|
- ✅ On-demand model switching
|
||||||
- ✅ Full control over llama.cpp server settings per model
|
- ✅ Full control over server settings per model
|
||||||
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
- ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/logs`
|
- ✅ Remote log monitoring at `/logs`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
|
|
||||||
|
## Releases
|
||||||
|
|
||||||
|
Builds for Linux and OSX are available on the [Releases](https://github.com/mostlygeek/llama-swap/releases) page.
|
||||||
|
|
||||||
|
### Building from source
|
||||||
|
|
||||||
|
1. Install golang for your system
|
||||||
|
1. `git clone git@github.com:mostlygeek/llama-swap.git`
|
||||||
|
1. `make clean all`
|
||||||
|
1. Binaries will be in `build/` subdirectory
|
||||||
|
|
||||||
## config.yaml
|
## config.yaml
|
||||||
|
|
||||||
llama-swap's configuration is purposefully simple.
|
llama-swap's configuration is purposefully simple.
|
||||||
@@ -83,22 +95,22 @@ More [examples](examples/README.md) are available for different use cases.
|
|||||||
|
|
||||||
## Monitoring Logs
|
## Monitoring Logs
|
||||||
|
|
||||||
The `/logs` endpoint is available to monitor what llama-swap is doing. It will send the last 10KB of logs. Useful for monitoring the output of llama-server. It also supports streaming of logs.
|
Open `http://<host>/logs` in your browser to get a web interface with streaming logs.
|
||||||
|
|
||||||
Usage:
|
Of course, CLI access is also supported:
|
||||||
|
|
||||||
```
|
```
|
||||||
# sends up to the last 10KB of logs
|
# sends up to the last 10KB of logs
|
||||||
curl 'http://host/logs'
|
curl 'http://host/logs'
|
||||||
|
|
||||||
# streams logs using chunk encoding
|
# streams logs
|
||||||
curl -Ns 'http://host/logs/stream'
|
curl -Ns 'http://host/logs/stream'
|
||||||
|
|
||||||
|
# stream and filter logs with linux pipes
|
||||||
|
curl -Ns http://host/logs/stream | grep 'eval time'
|
||||||
|
|
||||||
# skips history and just streams new log entries
|
# skips history and just streams new log entries
|
||||||
curl -Ns 'http://host/logs/stream?no-history'
|
curl -Ns 'http://host/logs/stream?no-history'
|
||||||
|
|
||||||
# streams logs using Server Sent Events
|
|
||||||
curl -Ns 'http://host/logs/streamSSE'
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Systemd Unit Files
|
## Systemd Unit Files
|
||||||
@@ -125,9 +137,3 @@ StartLimitInterval=30
|
|||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
```
|
```
|
||||||
|
|
||||||
## Building from Source
|
|
||||||
|
|
||||||
1. Install golang for your system
|
|
||||||
1. run `make clean all`
|
|
||||||
1. binaries will be built into `build/` directory
|
|
||||||
|
|||||||
@@ -22,22 +22,19 @@ for model in "$@"; do
|
|||||||
echo -n "$model,"
|
echo -n "$model,"
|
||||||
|
|
||||||
for lang in "python" "typescript" "swift"; do
|
for lang in "python" "typescript" "swift"; do
|
||||||
response=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"temperature\": 0.1, \"model\":\"$model\"}")
|
# expects a llama.cpp build that includes PR https://github.com/ggerganov/llama.cpp/pull/10548
|
||||||
|
# (Dec 3rd/2024)
|
||||||
|
time=$(curl -s --url "$url/v1/chat/completions" -d "{\"messages\": [{\"role\": \"system\", \"content\": \"you only write code.\"}, {\"role\": \"user\", \"content\": \"write snake game in $lang\"}], \"top_k\": 1, \"timings_per_token\":true, \"model\":\"$model\"}" | jq -r .timings.predicted_per_second)
|
||||||
|
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
time="error"
|
time="error"
|
||||||
else
|
exit 1
|
||||||
time=$(curl -s --url "$url/logs" | grep -oE '\d+(?:\.\d+)? tokens per second' | awk '{print $1}' | tail -n 1)
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
time="error"
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$lang" != "swift" ]; then
|
if [ "$lang" != "swift" ]; then
|
||||||
echo -n "$time,"
|
printf "%0.2f tps," $time
|
||||||
else
|
else
|
||||||
echo -n "$time"
|
printf "%0.2f tps\n" $time
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
echo ""
|
|
||||||
done
|
done
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Logs</title>
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
margin: 0;
|
||||||
|
height: 100vh;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
font-family: "Courier New", Courier, monospace;
|
||||||
|
}
|
||||||
|
#log-stream {
|
||||||
|
flex: 1;
|
||||||
|
margin: 1em;
|
||||||
|
padding: 10px;
|
||||||
|
background: #f4f4f4;
|
||||||
|
overflow-y: auto;
|
||||||
|
white-space: pre-wrap; /* Ensures line wrapping */
|
||||||
|
word-wrap: break-word; /* Ensures long words wrap */
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<pre id="log-stream">Waiting for logs...
|
||||||
|
</pre>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Establish an EventSource connection to the SSE endpoint
|
||||||
|
if (typeof(EventSource) !== "undefined") {
|
||||||
|
const eventSource = new EventSource("/logs/streamSSE");
|
||||||
|
|
||||||
|
eventSource.onmessage = function(event) {
|
||||||
|
// Append the new log message to the <pre> element
|
||||||
|
const logStream = document.getElementById('log-stream');
|
||||||
|
|
||||||
|
logStream.textContent += event.data;
|
||||||
|
|
||||||
|
// Auto-scroll to the bottom
|
||||||
|
logStream.scrollTop = logStream.scrollHeight;
|
||||||
|
};
|
||||||
|
|
||||||
|
eventSource.onerror = function(err) {
|
||||||
|
console.error("EventSource failed:", err);
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
console.error("SSE not supported by this browser.");
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -1,19 +1,41 @@
|
|||||||
package proxy
|
package proxy
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"embed"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
//go:embed html/logs.html
|
||||||
|
var logsHTML []byte
|
||||||
|
|
||||||
|
// reference the embed package so the IDE's automatic import cleanup does not remove it
|
||||||
|
var _ = embed.FS{}
|
||||||
|
|
||||||
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
|
func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
|
||||||
c.Header("Content-Type", "text/plain")
|
|
||||||
history := pm.logMonitor.GetHistory()
|
accept := c.GetHeader("Accept")
|
||||||
_, err := c.Writer.Write(history)
|
if strings.Contains(accept, "text/html") {
|
||||||
if err != nil {
|
// Set the Content-Type header to text/html
|
||||||
c.AbortWithError(http.StatusInternalServerError, err)
|
c.Header("Content-Type", "text/html")
|
||||||
return
|
|
||||||
|
// Write the embedded HTML content to the response
|
||||||
|
_, err := c.Writer.Write(logsHTML)
|
||||||
|
if err != nil {
|
||||||
|
c.AbortWithError(http.StatusInternalServerError, fmt.Errorf("failed to write response: %v", err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
c.Header("Content-Type", "text/plain")
|
||||||
|
history := pm.logMonitor.GetHistory()
|
||||||
|
_, err := c.Writer.Write(history)
|
||||||
|
if err != nil {
|
||||||
|
c.AbortWithError(http.StatusInternalServerError, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user