Compare commits

..

4 Commits

Author SHA1 Message Date
Benson Wong 5dc6b3e6d9 Add barebones but working implementation of model preload (#209, #235)
Add barebones but working implementation of model preload

* add config test for Preload hook
* improve TestProxyManager_StartupHooks
* docs for new hook configuration
* add a .dev to .gitignore
2025-08-14 10:27:28 -07:00
Benson Wong 74c69f39ef Add prompt processing metrics (#250)
- capture prompt processing metrics
- display prompt processing metrics on UI Activity page
2025-08-14 10:02:16 -07:00
Benson Wong a186318892 Update Readme, Add screenshot for Activities page [skip ci] 2025-08-08 13:39:46 -07:00
Benson Wong c4e4d5e1e9 Update Readme UI Screenshot [skip ci] 2025-08-08 13:33:47 -07:00
14 changed files with 212 additions and 15 deletions
+1
View File
@@ -4,3 +4,4 @@ build/
dist/ dist/
.vscode .vscode
.DS_Store .DS_Store
.dev/
+8 -3
View File
@@ -31,8 +31,9 @@ Written in golang, it is very easy to install (single binary with no dependencie
- ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107)) - ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
- ✅ Automatic unloading of models after timeout by setting a `ttl` - ✅ Automatic unloading of models after timeout by setting a `ttl`
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc) - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
- ✅ Docker and Podman support - Reliable Docker and Podman support with `cmdStart` and `cmdStop`
- ✅ Full control over server settings per model - ✅ Full control over server settings per model
- ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
## How does llama-swap work? ## How does llama-swap work?
@@ -71,9 +72,13 @@ See the [configuration documentation](https://github.com/mostlygeek/llama-swap/w
## Web UI ## Web UI
llama-swap ships with a real time web interface to monitor logs and status of models: llama-swap includes a real time web interface for monitoring logs and models:
<img width="1786" height="1334" alt="image" src="https://github.com/user-attachments/assets/d6258cb9-1dad-40db-828f-2be860aec8fe" /> <img width="1360" height="963" alt="image" src="https://github.com/user-attachments/assets/adef4a8e-de0b-49db-885a-8f6dedae6799" />
The Activity Page shows recent requests:
<img width="1360" height="963" alt="image" src="https://github.com/user-attachments/assets/5f3edee6-d03a-4ae5-ae06-b20ac1f135bd" />
## Installation ## Installation
+23
View File
@@ -1,6 +1,13 @@
# llama-swap YAML configuration example # llama-swap YAML configuration example
# ------------------------------------- # -------------------------------------
# #
# 💡 Tip - Use an LLM with this file!
# ====================================
# This example configuration is written to be LLM friendly! Try
# copying this file into an LLM and asking it to explain or generate
# sections for you.
# ====================================
#
# - Below are all the available configuration options for llama-swap. # - Below are all the available configuration options for llama-swap.
# - Settings with a default value, or noted as optional can be omitted. # - Settings with a default value, or noted as optional can be omitted.
# - Settings that are marked required must be in your configuration file # - Settings that are marked required must be in your configuration file
@@ -207,3 +214,19 @@ groups:
- "forever-modelA" - "forever-modelA"
- "forever-modelB" - "forever-modelB"
- "forever-modelc" - "forever-modelc"
# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
# on_startup: a dictionary of actions to perform on startup
# - optional, default: empty dictionary
# - the only supported action is preload
on_startup:
# preload: a list of model ids to load on startup
# - optional, default: empty list
# - model names must match keys in the models section
# - when preloading multiple models at once, define a group
# otherwise models will be loaded and swapped out
preload:
- "llama"
+27
View File
@@ -138,6 +138,14 @@ func (c *GroupConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
return nil return nil
} }
// Hook configuration types, populated from the `hooks` section of the YAML
// configuration.
type (
	// HookOnStartup holds the actions configured under `hooks.on_startup`.
	HookOnStartup struct {
		// Preload is the list of model IDs given under `preload`.
		Preload []string `yaml:"preload"`
	}

	// HooksConfig is the top-level container for all configured hooks.
	HooksConfig struct {
		// OnStartup holds the actions configured under `on_startup`.
		OnStartup HookOnStartup `yaml:"on_startup"`
	}
)
type Config struct { type Config struct {
HealthCheckTimeout int `yaml:"healthCheckTimeout"` HealthCheckTimeout int `yaml:"healthCheckTimeout"`
LogRequests bool `yaml:"logRequests"` LogRequests bool `yaml:"logRequests"`
@@ -155,6 +163,9 @@ type Config struct {
// automatic port assignments // automatic port assignments
StartPort int `yaml:"startPort"` StartPort int `yaml:"startPort"`
// hooks, see: #209
Hooks HooksConfig `yaml:"hooks"`
} }
func (c *Config) RealModelName(search string) (string, bool) { func (c *Config) RealModelName(search string) (string, bool) {
@@ -330,6 +341,22 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
} }
} }
// clean up hooks preload
if len(config.Hooks.OnStartup.Preload) > 0 {
var toPreload []string
for _, modelID := range config.Hooks.OnStartup.Preload {
modelID = strings.TrimSpace(modelID)
if modelID == "" {
continue
}
if real, found := config.RealModelName(modelID); found {
toPreload = append(toPreload, real)
}
}
config.Hooks.OnStartup.Preload = toPreload
}
return config, nil return config, nil
} }
+8
View File
@@ -100,6 +100,9 @@ func TestConfig_LoadPosix(t *testing.T) {
content := ` content := `
macros: macros:
svr-path: "path/to/server" svr-path: "path/to/server"
hooks:
on_startup:
preload: ["model1", "model2"]
models: models:
model1: model1:
cmd: path/to/cmd --arg1 one cmd: path/to/cmd --arg1 one
@@ -163,6 +166,11 @@ groups:
Macros: map[string]string{ Macros: map[string]string{
"svr-path": "path/to/server", "svr-path": "path/to/server",
}, },
Hooks: HooksConfig{
OnStartup: HookOnStartup{
Preload: []string{"model1", "model2"},
},
},
Models: map[string]ModelConfig{ Models: map[string]ModelConfig{
"model1": { "model1": {
Cmd: "path/to/cmd --arg1 one", Cmd: "path/to/cmd --arg1 one",
+27
View File
@@ -0,0 +1,27 @@
package proxy
import "net/http"
// DiscardWriter is an http.ResponseWriter that discards everything written
// to it. Only the response headers and the most recently recorded status
// code are retained. The zero value is ready to use.
type DiscardWriter struct {
	header http.Header // allocated lazily by Header
	status int         // 0 until WriteHeader or Write is called
}

// Compile-time checks that DiscardWriter satisfies the interfaces it is
// meant to stand in for.
var (
	_ http.ResponseWriter = (*DiscardWriter)(nil)
	_ http.Flusher        = (*DiscardWriter)(nil)
)

// Header returns the response header map, allocating it on first use so
// callers can always set headers without a nil check.
func (w *DiscardWriter) Header() http.Header {
	if w.header == nil {
		w.header = make(http.Header)
	}
	return w.header
}

// Write discards data and reports it as fully written. It never returns an
// error. Per the http.ResponseWriter contract, a Write that happens before
// any WriteHeader call implies a status of http.StatusOK, so that status is
// recorded when none has been set yet.
func (w *DiscardWriter) Write(data []byte) (int, error) {
	if w.status == 0 {
		w.status = http.StatusOK
	}
	return len(data), nil
}

// WriteHeader records the status code; nothing is sent anywhere.
func (w *DiscardWriter) WriteHeader(code int) {
	w.status = code
}

// Flush is a no-op. It exists so DiscardWriter satisfies http.Flusher, which
// streaming handlers expect from their writer.
func (w *DiscardWriter) Flush() {}
+10
View File
@@ -7,6 +7,7 @@ const ChatCompletionStatsEventID = 0x02
const ConfigFileChangedEventID = 0x03 const ConfigFileChangedEventID = 0x03
const LogDataEventID = 0x04 const LogDataEventID = 0x04
const TokenMetricsEventID = 0x05 const TokenMetricsEventID = 0x05
const ModelPreloadedEventID = 0x06
type ProcessStateChangeEvent struct { type ProcessStateChangeEvent struct {
ProcessName string ProcessName string
@@ -48,3 +49,12 @@ type LogDataEvent struct {
func (e LogDataEvent) Type() uint32 { func (e LogDataEvent) Type() uint32 {
return LogDataEventID return LogDataEventID
} }
// ModelPreloadedEvent carries the outcome of a single model preload attempt.
type ModelPreloadedEvent struct {
// ModelName identifies the model the event refers to.
ModelName string
// Success is true when the preload attempt completed without error.
Success bool
}
// Type returns the event ID for ModelPreloadedEvent (ModelPreloadedEventID).
func (e ModelPreloadedEvent) Type() uint32 {
return ModelPreloadedEventID
}
+2 -3
View File
@@ -16,6 +16,7 @@ var (
nextTestPort int = 12000 nextTestPort int = 12000
portMutex sync.Mutex portMutex sync.Mutex
testLogger = NewLogMonitorWriter(os.Stdout) testLogger = NewLogMonitorWriter(os.Stdout)
simpleResponderPath = getSimpleResponderPath()
) )
// Check if the binary exists // Check if the binary exists
@@ -69,13 +70,11 @@ func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
} }
func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig { func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
binaryPath := getSimpleResponderPath()
// Create a YAML string with just the values we want to set // Create a YAML string with just the values we want to set
yamlStr := fmt.Sprintf(` yamlStr := fmt.Sprintf(`
cmd: '%s --port %d --silent --respond %s' cmd: '%s --port %d --silent --respond %s'
proxy: "http://127.0.0.1:%d" proxy: "http://127.0.0.1:%d"
`, binaryPath, port, expectedMessage, port) `, simpleResponderPath, port, expectedMessage, port)
var cfg ModelConfig var cfg ModelConfig
if err := yaml.Unmarshal([]byte(yamlStr), &cfg); err != nil { if err := yaml.Unmarshal([]byte(yamlStr), &cfg); err != nil {
+3
View File
@@ -79,10 +79,12 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
outputTokens := int(jsonData.Get("usage.completion_tokens").Int()) outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
inputTokens := int(jsonData.Get("usage.prompt_tokens").Int()) inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
tokensPerSecond := -1.0 tokensPerSecond := -1.0
promptPerSecond := -1.0
durationMs := int(time.Since(rec.startTime).Milliseconds()) durationMs := int(time.Since(rec.startTime).Milliseconds())
// use llama-server's timing data for tok/sec and duration as it is more accurate // use llama-server's timing data for tok/sec and duration as it is more accurate
if timings := jsonData.Get("timings"); timings.Exists() { if timings := jsonData.Get("timings"); timings.Exists() {
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float() tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float()) durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
} }
@@ -92,6 +94,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
Model: rec.realModelName, Model: rec.realModelName,
InputTokens: inputTokens, InputTokens: inputTokens,
OutputTokens: outputTokens, OutputTokens: outputTokens,
PromptPerSecond: promptPerSecond,
TokensPerSecond: tokensPerSecond, TokensPerSecond: tokensPerSecond,
DurationMs: durationMs, DurationMs: durationMs,
}) })
+1
View File
@@ -15,6 +15,7 @@ type TokenMetrics struct {
Model string `json:"model"` Model string `json:"model"`
InputTokens int `json:"input_tokens"` InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"` OutputTokens int `json:"output_tokens"`
PromptPerSecond float64 `json:"prompt_per_second"`
TokensPerSecond float64 `json:"tokens_per_second"` TokensPerSecond float64 `json:"tokens_per_second"`
DurationMs int `json:"duration_ms"` DurationMs int `json:"duration_ms"`
} }
+30
View File
@@ -15,6 +15,7 @@ import (
"time" "time"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/mostlygeek/llama-swap/event"
"github.com/tidwall/gjson" "github.com/tidwall/gjson"
"github.com/tidwall/sjson" "github.com/tidwall/sjson"
) )
@@ -96,6 +97,35 @@ func New(config Config) *ProxyManager {
} }
pm.setupGinEngine() pm.setupGinEngine()
// run any startup hooks
if len(config.Hooks.OnStartup.Preload) > 0 {
// do it in the background, don't block startup -- not sure if good idea yet
go func() {
discardWriter := &DiscardWriter{}
for _, realModelName := range config.Hooks.OnStartup.Preload {
proxyLogger.Infof("Preloading model: %s", realModelName)
processGroup, _, err := pm.swapProcessGroup(realModelName)
if err != nil {
event.Emit(ModelPreloadedEvent{
ModelName: realModelName,
Success: false,
})
proxyLogger.Errorf("Failed to preload model %s: %v", realModelName, err)
continue
} else {
req, _ := http.NewRequest("GET", "/", nil)
processGroup.ProxyRequest(realModelName, discardWriter, req)
event.Emit(ModelPreloadedEvent{
ModelName: realModelName,
Success: true,
})
}
}
}()
}
return pm return pm
} }
+60
View File
@@ -14,6 +14,7 @@ import (
"testing" "testing"
"time" "time"
"github.com/mostlygeek/llama-swap/event"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/tidwall/gjson" "github.com/tidwall/gjson"
) )
@@ -832,3 +833,62 @@ func TestProxyManager_HealthEndpoint(t *testing.T) {
assert.Equal(t, http.StatusOK, rec.Code) assert.Equal(t, http.StatusOK, rec.Code)
assert.Equal(t, "OK", rec.Body.String()) assert.Equal(t, "OK", rec.Body.String())
} }
// TestProxyManager_StartupHooks checks that models listed under
// hooks.on_startup.preload are loaded when the proxy is created: it waits for
// two ModelPreloadedEvent notifications and then asserts that model1 and
// model2 in preloadTestGroup are both in StateReady.
//
// NOTE(review): the YAML literal below appears without indentation in this
// view; confirm its nesting against the repository copy before editing it.
func TestProxyManager_StartupHooks(t *testing.T) {
// The configuration is built from real YAML and run through
// LoadConfigFromReader() because loading now does more than parse YAML.
// Eventually all tests should migrate to this approach.
configStr := strings.Replace(`
logLevel: error
hooks:
on_startup:
preload:
- model1
- model2
groups:
preloadTestGroup:
swap: false
members:
- model1
- model2
models:
model1:
cmd: ${simpleresponderpath} --port ${PORT} --silent --respond model1
model2:
cmd: ${simpleresponderpath} --port ${PORT} --silent --respond model2
`, "${simpleresponderpath}", simpleResponderPath, -1)
// Load the test configuration; abort the test if it does not load
config, err := LoadConfigFromReader(strings.NewReader(configStr))
if !assert.NoError(t, err, "Invalid configuration") {
return
}
// Forward every ModelPreloadedEvent into a channel the test can wait on
preloadChan := make(chan ModelPreloadedEvent, 2) // buffer for 2 expected events
unsub := event.On(func(e ModelPreloadedEvent) {
preloadChan <- e
})
defer unsub()
// Create the proxy which should trigger preloading
proxy := New(config)
defer proxy.StopProcesses(StopWaitForInflightRequest)
// Wait for one event per preloaded model, allowing up to 5 seconds for each
for i := 0; i < 2; i++ {
select {
case <-preloadChan:
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for models to preload")
}
}
// make sure they are both loaded
_, foundGroup := proxy.processGroups["preloadTestGroup"]
if !assert.True(t, foundGroup, "preloadTestGroup should exist") {
return
}
assert.Equal(t, StateReady, proxy.processGroups["preloadTestGroup"].processes["model1"].CurrentState())
assert.Equal(t, StateReady, proxy.processGroups["preloadTestGroup"].processes["model2"].CurrentState())
}
+1
View File
@@ -28,6 +28,7 @@ interface Metrics {
model: string; model: string;
input_tokens: number; input_tokens: number;
output_tokens: number; output_tokens: number;
prompt_per_second: number;
tokens_per_second: number; tokens_per_second: number;
duration_ms: number; duration_ms: number;
} }
+2
View File
@@ -51,6 +51,7 @@ const ActivityPage = () => {
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
</tr> </tr>
@@ -62,6 +63,7 @@ const ActivityPage = () => {
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td> <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td> <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td> <td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.tokens_per_second)}</td> <td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.tokens_per_second)}</td>
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatDuration(metric.duration_ms)}</td> <td className="px-6 py-4 whitespace-nowrap text-sm">{formatDuration(metric.duration_ms)}</td>
</tr> </tr>