Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 5fad24c16f | |||
| 8404244fab | |||
| 712cd01081 | |||
| 1f7aa359b1 | |||
| b138d6cf25 | |||
| fb7c808082 | |||
| a7e640b0f7 |
@@ -1,4 +1,4 @@
|
|||||||

|

|
||||||

|

|
||||||

|

|
||||||

|

|
||||||
@@ -269,8 +269,4 @@ WantedBy=multi-user.target
|
|||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
<picture>
|
[](https://www.star-history.com/#mostlygeek/llama-swap&Date)
|
||||||
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date&theme=dark" />
|
|
||||||
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date" />
|
|
||||||
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=mostlygeek/llama-swap&type=Date" />
|
|
||||||
</picture>
|
|
||||||
|
|||||||
+1
-1
@@ -3,7 +3,7 @@
|
|||||||
healthCheckTimeout: 90
|
healthCheckTimeout: 90
|
||||||
|
|
||||||
# valid log levels: debug, info (default), warn, error
|
# valid log levels: debug, info (default), warn, error
|
||||||
logLevel: info
|
logLevel: debug
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
|
|||||||
@@ -0,0 +1,153 @@
|
|||||||
|
# aider, QwQ, Qwen-Coder 2.5 and llama-swap
|
||||||
|
|
||||||
|
This guide show how to use aider and llama-swap to get a 100% local coding co-pilot setup. The focus is on the trickest part which is configuring aider, llama-swap and llama-server to work together.
|
||||||
|
|
||||||
|
## Here's what you you need:
|
||||||
|
|
||||||
|
- aider - [installation docs](https://aider.chat/docs/install.html)
|
||||||
|
- llama-server - [download latest release](https://github.com/ggml-org/llama.cpp/releases)
|
||||||
|
- llama-swap - [download latest release](https://github.com/mostlygeek/llama-swap/releases)
|
||||||
|
- [QwQ 32B](https://huggingface.co/bartowski/Qwen_QwQ-32B-GGUF) and [Qwen Coder 2.5 32B](https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF) models
|
||||||
|
- 24GB VRAM video card
|
||||||
|
|
||||||
|
## Running aider
|
||||||
|
|
||||||
|
The goal is getting this command line to work:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
aider --architect \
|
||||||
|
--no-show-model-warnings \
|
||||||
|
--model openai/QwQ \
|
||||||
|
--editor-model openai/qwen-coder-32B \
|
||||||
|
--model-settings-file aider.model.settings.yml \
|
||||||
|
--openai-api-key "sk-na" \
|
||||||
|
--openai-api-base "http://10.0.1.24:8080/v1" \
|
||||||
|
```
|
||||||
|
|
||||||
|
Set `--openai-api-base` to the IP and port where your llama-swap is running.
|
||||||
|
|
||||||
|
## Create an aider model settings file
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# aider.model.settings.yml
|
||||||
|
|
||||||
|
#
|
||||||
|
# !!! important: model names must match llama-swap configuration names !!!
|
||||||
|
#
|
||||||
|
|
||||||
|
- name: "openai/QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
```
|
||||||
|
|
||||||
|
## llama-swap configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
|
||||||
|
# The parameters are tweaked to fit model+context into 24GB VRAM GPUs
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 8999 --flash-attn --slots
|
||||||
|
--ctx-size 16000
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
-ngl 99
|
||||||
|
--model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
|
||||||
|
|
||||||
|
"QwQ":
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 9503 --flash-attn --metrics--slots
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
--ctx-size 32000
|
||||||
|
--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
|
||||||
|
--temp 0.6 --repeat-penalty 1.1 --dry-multiplier 0.5
|
||||||
|
--min-p 0.01 --top-k 40 --top-p 0.95
|
||||||
|
-ngl 99
|
||||||
|
--model /mnt/nvme/models/bartowski/Qwen_QwQ-32B-Q4_K_M.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced, Dual GPU Configuration
|
||||||
|
|
||||||
|
If you have _dual 24GB GPUs_ you can use llama-swap profiles to avoid swapping between QwQ and Qwen Coder.
|
||||||
|
|
||||||
|
In llama-swap's configuration file:
|
||||||
|
|
||||||
|
1. add a `profiles` section with `aider` as the profile name
|
||||||
|
2. using the `env` field to specify the GPU IDs for each model
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
|
||||||
|
# Add a profile for aider
|
||||||
|
profiles:
|
||||||
|
aider:
|
||||||
|
- qwen-coder-32B
|
||||||
|
- QwQ
|
||||||
|
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
# manually set the GPU to run on
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=0"
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
cmd: /path/to/llama-server ...
|
||||||
|
|
||||||
|
"QwQ":
|
||||||
|
# manually set the GPU to run on
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=1"
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
cmd: /path/to/llama-server ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Append the profile tag, `aider:`, to the model names in the model settings file
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# aider.model.settings.yml
|
||||||
|
- name: "openai/aider:QwQ"
|
||||||
|
weak_model_name: "openai/aider:qwen-coder-32B-aider"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B-aider"
|
||||||
|
|
||||||
|
- name: "openai/aider:qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B-aider"
|
||||||
|
```
|
||||||
|
|
||||||
|
Run aider with:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ aider --architect \
|
||||||
|
--no-show-model-warnings \
|
||||||
|
--model openai/aider:QwQ \
|
||||||
|
--editor-model openai/aider:qwen-coder-32B \
|
||||||
|
--config aider.conf.yml \
|
||||||
|
--model-settings-file aider.model.settings.yml
|
||||||
|
--openai-api-key "sk-na" \
|
||||||
|
--openai-api-base "http://10.0.1.24:8080/v1"
|
||||||
|
```
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
# this makes use of llama-swap's profile feature to
|
||||||
|
# keep the architect and editor models in VRAM on different GPUs
|
||||||
|
|
||||||
|
- name: "openai/aider:QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/aider:qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
- name: "openai/QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
healthCheckTimeout: 300
|
||||||
|
logLevel: debug
|
||||||
|
|
||||||
|
profiles:
|
||||||
|
aider:
|
||||||
|
- qwen-coder-32B
|
||||||
|
- QwQ
|
||||||
|
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=0"
|
||||||
|
aliases:
|
||||||
|
- coder
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
|
||||||
|
# set appropriate paths for your environment
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 8999 --flash-attn --slots
|
||||||
|
--ctx-size 16000
|
||||||
|
--ctx-size-draft 16000
|
||||||
|
--model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
|
||||||
|
--model-draft /path/to/Qwen2.5-Coder-1.5B-Instruct-Q8_0.gguf
|
||||||
|
-ngl 99 -ngld 99
|
||||||
|
--draft-max 16 --draft-min 4 --draft-p-min 0.4
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
"QwQ":
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=1"
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
|
||||||
|
# set appropriate paths for your environment
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 9503
|
||||||
|
--flash-attn --metrics
|
||||||
|
--slots
|
||||||
|
--model /path/to/Qwen_QwQ-32B-Q4_K_M.gguf
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
--ctx-size 32000
|
||||||
|
--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
|
||||||
|
--temp 0.6
|
||||||
|
--repeat-penalty 1.1
|
||||||
|
--dry-multiplier 0.5
|
||||||
|
--min-p 0.01
|
||||||
|
--top-k 40
|
||||||
|
--top-p 0.95
|
||||||
|
-ngl 99 -ngld 99
|
||||||
@@ -37,7 +37,7 @@ require (
|
|||||||
github.com/ugorji/go/codec v1.2.12 // indirect
|
github.com/ugorji/go/codec v1.2.12 // indirect
|
||||||
golang.org/x/arch v0.8.0 // indirect
|
golang.org/x/arch v0.8.0 // indirect
|
||||||
golang.org/x/crypto v0.36.0 // indirect
|
golang.org/x/crypto v0.36.0 // indirect
|
||||||
golang.org/x/net v0.37.0 // indirect
|
golang.org/x/net v0.38.0 // indirect
|
||||||
golang.org/x/sys v0.31.0 // indirect
|
golang.org/x/sys v0.31.0 // indirect
|
||||||
golang.org/x/text v0.23.0 // indirect
|
golang.org/x/text v0.23.0 // indirect
|
||||||
google.golang.org/protobuf v1.34.1 // indirect
|
google.golang.org/protobuf v1.34.1 // indirect
|
||||||
|
|||||||
@@ -86,6 +86,8 @@ golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
|
|||||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||||
golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
|
golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c=
|
||||||
golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
||||||
|
golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8=
|
||||||
|
golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8=
|
||||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
|
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
|
||||||
|
|||||||
BIN
Binary file not shown.
|
After Width: | Height: | Size: 351 KiB |
+50
-10
@@ -34,6 +34,9 @@ type Process struct {
|
|||||||
config ModelConfig
|
config ModelConfig
|
||||||
cmd *exec.Cmd
|
cmd *exec.Cmd
|
||||||
|
|
||||||
|
// for p.cmd.Wait() select { ... }
|
||||||
|
cmdWaitChan chan error
|
||||||
|
|
||||||
processLogger *LogMonitor
|
processLogger *LogMonitor
|
||||||
proxyLogger *LogMonitor
|
proxyLogger *LogMonitor
|
||||||
|
|
||||||
@@ -61,6 +64,7 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
|||||||
ID: ID,
|
ID: ID,
|
||||||
config: config,
|
config: config,
|
||||||
cmd: nil,
|
cmd: nil,
|
||||||
|
cmdWaitChan: make(chan error, 1),
|
||||||
processLogger: processLogger,
|
processLogger: processLogger,
|
||||||
proxyLogger: proxyLogger,
|
proxyLogger: proxyLogger,
|
||||||
healthCheckTimeout: healthCheckTimeout,
|
healthCheckTimeout: healthCheckTimeout,
|
||||||
@@ -89,16 +93,17 @@ func (p *Process) swapState(expectedState, newState ProcessState) (ProcessState,
|
|||||||
defer p.stateMutex.Unlock()
|
defer p.stateMutex.Unlock()
|
||||||
|
|
||||||
if p.state != expectedState {
|
if p.state != expectedState {
|
||||||
|
p.proxyLogger.Warnf("swapState() Unexpected current state %s, expected %s", p.state, expectedState)
|
||||||
return p.state, ErrExpectedStateMismatch
|
return p.state, ErrExpectedStateMismatch
|
||||||
}
|
}
|
||||||
|
|
||||||
if !isValidTransition(p.state, newState) {
|
if !isValidTransition(p.state, newState) {
|
||||||
p.proxyLogger.Warnf("Invalid state transition from %s to %s", p.state, newState)
|
p.proxyLogger.Warnf("swapState() Invalid state transition from %s to %s", p.state, newState)
|
||||||
return p.state, ErrInvalidStateTransition
|
return p.state, ErrInvalidStateTransition
|
||||||
}
|
}
|
||||||
|
|
||||||
p.proxyLogger.Debugf("State transition from %s to %s", expectedState, newState)
|
|
||||||
p.state = newState
|
p.state = newState
|
||||||
|
p.proxyLogger.Debugf("swapState() State transitioned from %s to %s", expectedState, newState)
|
||||||
return p.state, nil
|
return p.state, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,6 +184,13 @@ func (p *Process) start() error {
|
|||||||
return fmt.Errorf("start() failed: %v", err)
|
return fmt.Errorf("start() failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Capture the exit error for later signaling
|
||||||
|
go func() {
|
||||||
|
exitErr := p.cmd.Wait()
|
||||||
|
p.proxyLogger.Debugf("cmd.Wait() returned for [%s] error: %v", p.ID, exitErr)
|
||||||
|
p.cmdWaitChan <- exitErr
|
||||||
|
}()
|
||||||
|
|
||||||
// One of three things can happen at this stage:
|
// One of three things can happen at this stage:
|
||||||
// 1. The command exits unexpectedly
|
// 1. The command exits unexpectedly
|
||||||
// 2. The health check fails
|
// 2. The health check fails
|
||||||
@@ -222,6 +234,22 @@ func (p *Process) start() error {
|
|||||||
}
|
}
|
||||||
case <-p.shutdownCtx.Done():
|
case <-p.shutdownCtx.Done():
|
||||||
return errors.New("health check interrupted due to shutdown")
|
return errors.New("health check interrupted due to shutdown")
|
||||||
|
case exitErr := <-p.cmdWaitChan:
|
||||||
|
if exitErr != nil {
|
||||||
|
p.proxyLogger.Warnf("upstream command exited prematurely with error: %v", exitErr)
|
||||||
|
if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
|
||||||
|
return fmt.Errorf("upstream command exited unexpectedly: %s AND state swap failed: %v, current state: %v", exitErr.Error(), err, curState)
|
||||||
|
} else {
|
||||||
|
return fmt.Errorf("upstream command exited unexpectedly: %s", exitErr.Error())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
p.proxyLogger.Warnf("upstream command exited prematurely with no error")
|
||||||
|
if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
|
||||||
|
return fmt.Errorf("upstream command exited prematurely with no error AND state swap failed: %v, current state: %v", err, curState)
|
||||||
|
} else {
|
||||||
|
return fmt.Errorf("upstream command exited prematurely with no error")
|
||||||
|
}
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
if err := p.checkHealthEndpoint(healthURL); err == nil {
|
if err := p.checkHealthEndpoint(healthURL); err == nil {
|
||||||
p.proxyLogger.Infof("Health check passed on %s", healthURL)
|
p.proxyLogger.Infof("Health check passed on %s", healthURL)
|
||||||
@@ -231,7 +259,7 @@ func (p *Process) start() error {
|
|||||||
if strings.Contains(err.Error(), "connection refused") {
|
if strings.Contains(err.Error(), "connection refused") {
|
||||||
endTime, _ := checkDeadline.Deadline()
|
endTime, _ := checkDeadline.Deadline()
|
||||||
ttl := time.Until(endTime)
|
ttl := time.Until(endTime)
|
||||||
p.proxyLogger.Infof("Connection refused on %s, retrying in %.0fs", healthURL, ttl.Seconds())
|
p.proxyLogger.Infof("Connection refused on %s, giving up in %.0fs", healthURL, ttl.Seconds())
|
||||||
} else {
|
} else {
|
||||||
p.proxyLogger.Infof("Health check error on %s, %v", healthURL, err)
|
p.proxyLogger.Infof("Health check error on %s, %v", healthURL, err)
|
||||||
}
|
}
|
||||||
@@ -257,7 +285,6 @@ func (p *Process) start() error {
|
|||||||
p.inFlightRequests.Wait()
|
p.inFlightRequests.Wait()
|
||||||
|
|
||||||
if time.Since(p.lastRequestHandled) > maxDuration {
|
if time.Since(p.lastRequestHandled) > maxDuration {
|
||||||
|
|
||||||
p.proxyLogger.Infof("Unloading model %s, TTL of %ds reached.", p.ID, p.config.UnloadAfter)
|
p.proxyLogger.Infof("Unloading model %s, TTL of %ds reached.", p.ID, p.config.UnloadAfter)
|
||||||
p.Stop()
|
p.Stop()
|
||||||
return
|
return
|
||||||
@@ -276,6 +303,7 @@ func (p *Process) start() error {
|
|||||||
func (p *Process) Stop() {
|
func (p *Process) Stop() {
|
||||||
// wait for any inflight requests before proceeding
|
// wait for any inflight requests before proceeding
|
||||||
p.inFlightRequests.Wait()
|
p.inFlightRequests.Wait()
|
||||||
|
p.proxyLogger.Debugf("Stopping process [%s]", p.ID)
|
||||||
|
|
||||||
// calling Stop() when state is invalid is a no-op
|
// calling Stop() when state is invalid is a no-op
|
||||||
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
|
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
|
||||||
@@ -303,14 +331,14 @@ func (p *Process) Shutdown() {
|
|||||||
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
||||||
// If it does not exit within 5 seconds, it will send a SIGKILL.
|
// If it does not exit within 5 seconds, it will send a SIGKILL.
|
||||||
func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
||||||
|
stopStartTime := time.Now()
|
||||||
|
defer func() {
|
||||||
|
p.proxyLogger.Debugf("Process [%s] stopCommand took %v", p.ID, time.Since(stopStartTime))
|
||||||
|
}()
|
||||||
|
|
||||||
sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
|
sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
|
||||||
defer cancelTimeout()
|
defer cancelTimeout()
|
||||||
|
|
||||||
sigtermNormal := make(chan error, 1)
|
|
||||||
go func() {
|
|
||||||
sigtermNormal <- p.cmd.Wait()
|
|
||||||
}()
|
|
||||||
|
|
||||||
if p.cmd == nil || p.cmd.Process == nil {
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
p.proxyLogger.Warnf("Process [%s] cmd or cmd.Process is nil", p.ID)
|
p.proxyLogger.Warnf("Process [%s] cmd or cmd.Process is nil", p.ID)
|
||||||
return
|
return
|
||||||
@@ -324,7 +352,11 @@ func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
|||||||
case <-sigtermTimeout.Done():
|
case <-sigtermTimeout.Done():
|
||||||
p.proxyLogger.Infof("Process [%s] timed out waiting to stop, sending KILL signal", p.ID)
|
p.proxyLogger.Infof("Process [%s] timed out waiting to stop, sending KILL signal", p.ID)
|
||||||
p.cmd.Process.Kill()
|
p.cmd.Process.Kill()
|
||||||
case err := <-sigtermNormal:
|
case err := <-p.cmdWaitChan:
|
||||||
|
// Note: in start(), p.cmdWaitChan also has a select { ... }. That should be OK
|
||||||
|
// because if we make it here then the cmd has been successfully running and made it
|
||||||
|
// through the health check. There is a possibility that ithe cmd crashed after the health check
|
||||||
|
// succeeded but that's not a case llama-swap is handling for now.
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errno, ok := err.(syscall.Errno); ok {
|
if errno, ok := err.(syscall.Errno); ok {
|
||||||
p.proxyLogger.Errorf("Process [%s] errno >> %v", p.ID, errno)
|
p.proxyLogger.Errorf("Process [%s] errno >> %v", p.ID, errno)
|
||||||
@@ -369,6 +401,8 @@ func (p *Process) checkHealthEndpoint(healthURL string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
||||||
|
requestBeginTime := time.Now()
|
||||||
|
var startDuration time.Duration
|
||||||
|
|
||||||
// prevent new requests from being made while stopping or irrecoverable
|
// prevent new requests from being made while stopping or irrecoverable
|
||||||
currentState := p.CurrentState()
|
currentState := p.CurrentState()
|
||||||
@@ -385,11 +419,13 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
// start the process on demand
|
// start the process on demand
|
||||||
if p.CurrentState() != StateReady {
|
if p.CurrentState() != StateReady {
|
||||||
|
beginStartTime := time.Now()
|
||||||
if err := p.start(); err != nil {
|
if err := p.start(); err != nil {
|
||||||
errstr := fmt.Sprintf("unable to start process: %s", err)
|
errstr := fmt.Sprintf("unable to start process: %s", err)
|
||||||
http.Error(w, errstr, http.StatusBadGateway)
|
http.Error(w, errstr, http.StatusBadGateway)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
startDuration = time.Since(beginStartTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
proxyTo := p.config.Proxy
|
proxyTo := p.config.Proxy
|
||||||
@@ -433,4 +469,8 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
totalTime := time.Since(requestBeginTime)
|
||||||
|
p.proxyLogger.Debugf("Process [%s] request %s - start: %v, total: %v",
|
||||||
|
p.ID, r.RequestURI, startDuration, totalTime)
|
||||||
}
|
}
|
||||||
|
|||||||
+39
-10
@@ -2,9 +2,9 @@ package proxy
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
@@ -13,16 +13,25 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
discardLogger = NewLogMonitorWriter(io.Discard)
|
debugLogger = NewLogMonitorWriter(os.Stdout)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// flip to help with debugging tests
|
||||||
|
if false {
|
||||||
|
debugLogger.SetLogLevel(LevelDebug)
|
||||||
|
} else {
|
||||||
|
debugLogger.SetLogLevel(LevelError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestProcess_AutomaticallyStartsUpstream(t *testing.T) {
|
func TestProcess_AutomaticallyStartsUpstream(t *testing.T) {
|
||||||
|
|
||||||
expectedMessage := "testing91931"
|
expectedMessage := "testing91931"
|
||||||
config := getTestSimpleResponderConfig(expectedMessage)
|
config := getTestSimpleResponderConfig(expectedMessage)
|
||||||
|
|
||||||
// Create a process
|
// Create a process
|
||||||
process := NewProcess("test-process", 5, config, discardLogger, discardLogger)
|
process := NewProcess("test-process", 5, config, debugLogger, debugLogger)
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
req := httptest.NewRequest("GET", "/test", nil)
|
req := httptest.NewRequest("GET", "/test", nil)
|
||||||
@@ -58,7 +67,7 @@ func TestProcess_WaitOnMultipleStarts(t *testing.T) {
|
|||||||
expectedMessage := "testing91931"
|
expectedMessage := "testing91931"
|
||||||
config := getTestSimpleResponderConfig(expectedMessage)
|
config := getTestSimpleResponderConfig(expectedMessage)
|
||||||
|
|
||||||
process := NewProcess("test-process", 5, config, discardLogger, discardLogger)
|
process := NewProcess("test-process", 5, config, debugLogger, debugLogger)
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
@@ -86,7 +95,7 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
|
|||||||
CheckEndpoint: "/health",
|
CheckEndpoint: "/health",
|
||||||
}
|
}
|
||||||
|
|
||||||
process := NewProcess("broken", 1, config, discardLogger, discardLogger)
|
process := NewProcess("broken", 1, config, debugLogger, debugLogger)
|
||||||
|
|
||||||
req := httptest.NewRequest("GET", "/", nil)
|
req := httptest.NewRequest("GET", "/", nil)
|
||||||
w := httptest.NewRecorder()
|
w := httptest.NewRecorder()
|
||||||
@@ -111,7 +120,7 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
|
|||||||
config.UnloadAfter = 3 // seconds
|
config.UnloadAfter = 3 // seconds
|
||||||
assert.Equal(t, 3, config.UnloadAfter)
|
assert.Equal(t, 3, config.UnloadAfter)
|
||||||
|
|
||||||
process := NewProcess("ttl_test", 2, config, discardLogger, discardLogger)
|
process := NewProcess("ttl_test", 2, config, debugLogger, debugLogger)
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
// this should take 4 seconds
|
// this should take 4 seconds
|
||||||
@@ -153,7 +162,7 @@ func TestProcess_LowTTLValue(t *testing.T) {
|
|||||||
config.UnloadAfter = 1 // second
|
config.UnloadAfter = 1 // second
|
||||||
assert.Equal(t, 1, config.UnloadAfter)
|
assert.Equal(t, 1, config.UnloadAfter)
|
||||||
|
|
||||||
process := NewProcess("ttl", 2, config, discardLogger, discardLogger)
|
process := NewProcess("ttl", 2, config, debugLogger, debugLogger)
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
for i := 0; i < 100; i++ {
|
for i := 0; i < 100; i++ {
|
||||||
@@ -180,7 +189,7 @@ func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
|
|||||||
|
|
||||||
expectedMessage := "12345"
|
expectedMessage := "12345"
|
||||||
config := getTestSimpleResponderConfig(expectedMessage)
|
config := getTestSimpleResponderConfig(expectedMessage)
|
||||||
process := NewProcess("t", 10, config, discardLogger, discardLogger)
|
process := NewProcess("t", 10, config, debugLogger, debugLogger)
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
results := map[string]string{
|
results := map[string]string{
|
||||||
@@ -257,7 +266,7 @@ func TestProcess_SwapState(t *testing.T) {
|
|||||||
|
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
t.Run(test.name, func(t *testing.T) {
|
t.Run(test.name, func(t *testing.T) {
|
||||||
p := NewProcess("test", 10, getTestSimpleResponderConfig("test"), discardLogger, discardLogger)
|
p := NewProcess("test", 10, getTestSimpleResponderConfig("test"), debugLogger, debugLogger)
|
||||||
p.state = test.currentState
|
p.state = test.currentState
|
||||||
|
|
||||||
resultState, err := p.swapState(test.expectedState, test.newState)
|
resultState, err := p.swapState(test.expectedState, test.newState)
|
||||||
@@ -290,7 +299,7 @@ func TestProcess_ShutdownInterruptsHealthCheck(t *testing.T) {
|
|||||||
config.Proxy = "http://localhost:9998/test"
|
config.Proxy = "http://localhost:9998/test"
|
||||||
|
|
||||||
healthCheckTTLSeconds := 30
|
healthCheckTTLSeconds := 30
|
||||||
process := NewProcess("test-process", healthCheckTTLSeconds, config, discardLogger, discardLogger)
|
process := NewProcess("test-process", healthCheckTTLSeconds, config, debugLogger, debugLogger)
|
||||||
|
|
||||||
// make it a lot faster
|
// make it a lot faster
|
||||||
process.healthCheckLoopInterval = time.Second
|
process.healthCheckLoopInterval = time.Second
|
||||||
@@ -311,3 +320,23 @@ func TestProcess_ShutdownInterruptsHealthCheck(t *testing.T) {
|
|||||||
assert.ErrorContains(t, err, "health check interrupted due to shutdown")
|
assert.ErrorContains(t, err, "health check interrupted due to shutdown")
|
||||||
assert.Equal(t, StateShutdown, process.CurrentState())
|
assert.Equal(t, StateShutdown, process.CurrentState())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProcess_ExitInterruptsHealthCheck(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("skipping Exit Interrupts Health Check test")
|
||||||
|
}
|
||||||
|
|
||||||
|
// should run and exit but interrupt the long checkHealthTimeout
|
||||||
|
checkHealthTimeout := 5
|
||||||
|
config := ModelConfig{
|
||||||
|
Cmd: "sleep 1",
|
||||||
|
Proxy: "http://127.0.0.1:9913",
|
||||||
|
CheckEndpoint: "/health",
|
||||||
|
}
|
||||||
|
|
||||||
|
process := NewProcess("sleepy", checkHealthTimeout, config, debugLogger, debugLogger)
|
||||||
|
process.healthCheckLoopInterval = time.Second // make it faster
|
||||||
|
err := process.start()
|
||||||
|
assert.Equal(t, "upstream command exited prematurely with no error", err.Error())
|
||||||
|
assert.Equal(t, process.CurrentState(), StateFailed)
|
||||||
|
}
|
||||||
|
|||||||
@@ -49,14 +49,19 @@ func New(config *Config) *ProxyManager {
|
|||||||
switch strings.ToLower(strings.TrimSpace(config.LogLevel)) {
|
switch strings.ToLower(strings.TrimSpace(config.LogLevel)) {
|
||||||
case "debug":
|
case "debug":
|
||||||
proxyLogger.SetLogLevel(LevelDebug)
|
proxyLogger.SetLogLevel(LevelDebug)
|
||||||
|
upstreamLogger.SetLogLevel(LevelDebug)
|
||||||
case "info":
|
case "info":
|
||||||
proxyLogger.SetLogLevel(LevelInfo)
|
proxyLogger.SetLogLevel(LevelInfo)
|
||||||
|
upstreamLogger.SetLogLevel(LevelInfo)
|
||||||
case "warn":
|
case "warn":
|
||||||
proxyLogger.SetLogLevel(LevelWarn)
|
proxyLogger.SetLogLevel(LevelWarn)
|
||||||
|
upstreamLogger.SetLogLevel(LevelWarn)
|
||||||
case "error":
|
case "error":
|
||||||
proxyLogger.SetLogLevel(LevelError)
|
proxyLogger.SetLogLevel(LevelError)
|
||||||
|
upstreamLogger.SetLogLevel(LevelError)
|
||||||
default:
|
default:
|
||||||
proxyLogger.SetLogLevel(LevelInfo)
|
proxyLogger.SetLogLevel(LevelInfo)
|
||||||
|
upstreamLogger.SetLogLevel(LevelInfo)
|
||||||
}
|
}
|
||||||
|
|
||||||
pm := &ProxyManager{
|
pm := &ProxyManager{
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
|
|||||||
"model1": getTestSimpleResponderConfig("model1"),
|
"model1": getTestSimpleResponderConfig("model1"),
|
||||||
"model2": getTestSimpleResponderConfig("model2"),
|
"model2": getTestSimpleResponderConfig("model2"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -62,6 +63,7 @@ func TestProxyManager_SwapMultiProcess(t *testing.T) {
|
|||||||
Profiles: map[string][]string{
|
Profiles: map[string][]string{
|
||||||
"test": {model1, model2},
|
"test": {model1, model2},
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -103,6 +105,7 @@ func TestProxyManager_SwapMultiProcessParallelRequests(t *testing.T) {
|
|||||||
"model2": getTestSimpleResponderConfig("model2"),
|
"model2": getTestSimpleResponderConfig("model2"),
|
||||||
"model3": getTestSimpleResponderConfig("model3"),
|
"model3": getTestSimpleResponderConfig("model3"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -153,6 +156,7 @@ func TestProxyManager_ListModelsHandler(t *testing.T) {
|
|||||||
"model2": getTestSimpleResponderConfig("model2"),
|
"model2": getTestSimpleResponderConfig("model2"),
|
||||||
"model3": getTestSimpleResponderConfig("model3"),
|
"model3": getTestSimpleResponderConfig("model3"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -230,6 +234,7 @@ func TestProxyManager_ProfileNonMember(t *testing.T) {
|
|||||||
Profiles: map[string][]string{
|
Profiles: map[string][]string{
|
||||||
"test": {model1},
|
"test": {model1},
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -278,6 +283,7 @@ func TestProxyManager_Shutdown(t *testing.T) {
|
|||||||
"model2": model2Config,
|
"model2": model2Config,
|
||||||
"model3": model3Config,
|
"model3": model3Config,
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -313,6 +319,7 @@ func TestProxyManager_Unload(t *testing.T) {
|
|||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"model1": getTestSimpleResponderConfig("model1"),
|
"model1": getTestSimpleResponderConfig("model1"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -339,6 +346,7 @@ func TestProxyManager_StripProfileSlug(t *testing.T) {
|
|||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"TheExpectedModel": getTestSimpleResponderConfig("TheExpectedModel"),
|
"TheExpectedModel": getTestSimpleResponderConfig("TheExpectedModel"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -365,6 +373,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
|
|||||||
Profiles: map[string][]string{
|
Profiles: map[string][]string{
|
||||||
"test": {"model1", "model2"},
|
"test": {"model1", "model2"},
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define a helper struct to parse the JSON response.
|
// Define a helper struct to parse the JSON response.
|
||||||
@@ -472,6 +481,7 @@ func TestProxyManager_AudioTranscriptionHandler(t *testing.T) {
|
|||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"TheExpectedModel": getTestSimpleResponderConfig("TheExpectedModel"),
|
"TheExpectedModel": getTestSimpleResponderConfig("TheExpectedModel"),
|
||||||
},
|
},
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -580,6 +590,8 @@ func TestProxyManager_UseModelName(t *testing.T) {
|
|||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"model1": modelConfig,
|
"model1": modelConfig,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy := New(config)
|
proxy := New(config)
|
||||||
@@ -647,7 +659,7 @@ func TestProxyManager_CORSOptionsHandler(t *testing.T) {
|
|||||||
Models: map[string]ModelConfig{
|
Models: map[string]ModelConfig{
|
||||||
"model1": getTestSimpleResponderConfig("model1"),
|
"model1": getTestSimpleResponderConfig("model1"),
|
||||||
},
|
},
|
||||||
LogRequests: true,
|
LogLevel: "error",
|
||||||
}
|
}
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user