Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fb7c808082 | |||
| a7e640b0f7 |
+1
-1
@@ -3,7 +3,7 @@
|
|||||||
healthCheckTimeout: 90
|
healthCheckTimeout: 90
|
||||||
|
|
||||||
# valid log levels: debug, info (default), warn, error
|
# valid log levels: debug, info (default), warn, error
|
||||||
logLevel: info
|
logLevel: debug
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"llama":
|
"llama":
|
||||||
|
|||||||
@@ -0,0 +1,153 @@
|
|||||||
|
# aider, QwQ, Qwen-Coder 2.5 and llama-swap
|
||||||
|
|
||||||
|
This guide shows how to use aider and llama-swap to get a 100% local coding co-pilot setup. The focus is on the trickiest part, which is configuring aider, llama-swap and llama-server to work together.
|
||||||
|
|
||||||
|
## Here's what you need:
|
||||||
|
|
||||||
|
- aider - [installation docs](https://aider.chat/docs/install.html)
|
||||||
|
- llama-server - [download latest release](https://github.com/ggml-org/llama.cpp/releases)
|
||||||
|
- llama-swap - [download latest release](https://github.com/mostlygeek/llama-swap/releases)
|
||||||
|
- [QwQ 32B](https://huggingface.co/bartowski/Qwen_QwQ-32B-GGUF) and [Qwen Coder 2.5 32B](https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF) models
|
||||||
|
- 24GB VRAM video card
|
||||||
|
|
||||||
|
## Running aider
|
||||||
|
|
||||||
|
The goal is getting this command line to work:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
aider --architect \
|
||||||
|
--no-show-model-warnings \
|
||||||
|
--model openai/QwQ \
|
||||||
|
--editor-model openai/qwen-coder-32B \
|
||||||
|
--model-settings-file aider.model.settings.yml \
|
||||||
|
--openai-api-key "sk-na" \
|
||||||
|
--openai-api-base "http://10.0.1.24:8080/v1"
|
||||||
|
```
|
||||||
|
|
||||||
|
Set `--openai-api-base` to the IP and port where your llama-swap is running.
|
||||||
|
|
||||||
|
## Create an aider model settings file
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# aider.model.settings.yml
|
||||||
|
|
||||||
|
#
|
||||||
|
# !!! important: model names must match llama-swap configuration names !!!
|
||||||
|
#
|
||||||
|
|
||||||
|
- name: "openai/QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
```
|
||||||
|
|
||||||
|
## llama-swap configuration
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
|
||||||
|
# The parameters are tweaked to fit model+context into 24GB VRAM GPUs
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 8999 --flash-attn --slots
|
||||||
|
--ctx-size 16000
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
-ngl 99
|
||||||
|
--model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
|
||||||
|
|
||||||
|
"QwQ":
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 9503 --flash-attn --metrics --slots
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
--ctx-size 32000
|
||||||
|
--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
|
||||||
|
--temp 0.6 --repeat-penalty 1.1 --dry-multiplier 0.5
|
||||||
|
--min-p 0.01 --top-k 40 --top-p 0.95
|
||||||
|
-ngl 99
|
||||||
|
--model /path/to/Qwen_QwQ-32B-Q4_K_M.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced, Dual GPU Configuration
|
||||||
|
|
||||||
|
If you have _dual 24GB GPUs_ you can use llama-swap profiles to avoid swapping between QwQ and Qwen Coder.
|
||||||
|
|
||||||
|
In llama-swap's configuration file:
|
||||||
|
|
||||||
|
1. add a `profiles` section with `aider` as the profile name
|
||||||
|
2. use the `env` field to specify the GPU ID for each model
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# config.yaml
|
||||||
|
|
||||||
|
# Add a profile for aider
|
||||||
|
profiles:
|
||||||
|
aider:
|
||||||
|
- qwen-coder-32B
|
||||||
|
- QwQ
|
||||||
|
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
# manually set the GPU to run on
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=0"
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
cmd: /path/to/llama-server ...
|
||||||
|
|
||||||
|
"QwQ":
|
||||||
|
# manually set the GPU to run on
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=1"
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
cmd: /path/to/llama-server ...
|
||||||
|
```
|
||||||
|
|
||||||
|
Prepend the profile tag, `aider:`, to the model names in the model settings file (for example, `openai/QwQ` becomes `openai/aider:QwQ`):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# aider.model.settings.yml
|
||||||
|
- name: "openai/aider:QwQ"
|
||||||
|
weak_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/aider:qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
```
|
||||||
|
|
||||||
|
Run aider with:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ aider --architect \
|
||||||
|
--no-show-model-warnings \
|
||||||
|
--model openai/aider:QwQ \
|
||||||
|
--editor-model openai/aider:qwen-coder-32B \
|
||||||
|
--config aider.conf.yml \
|
||||||
|
--model-settings-file aider.model.settings.yml \
|
||||||
|
--openai-api-key "sk-na" \
|
||||||
|
--openai-api-base "http://10.0.1.24:8080/v1"
|
||||||
|
```
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
# this makes use of llama-swap's profile feature to
|
||||||
|
# keep the architect and editor models in VRAM on different GPUs
|
||||||
|
|
||||||
|
- name: "openai/aider:QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/aider:qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/aider:qwen-coder-32B"
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
- name: "openai/QwQ"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.95
|
||||||
|
top_k: 40
|
||||||
|
presence_penalty: 0.1
|
||||||
|
repetition_penalty: 1
|
||||||
|
num_ctx: 16384
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
weak_model_name: "openai/qwen-coder-32B"
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
|
- name: "openai/qwen-coder-32B"
|
||||||
|
edit_format: diff
|
||||||
|
extra_params:
|
||||||
|
max_tokens: 16384
|
||||||
|
top_p: 0.8
|
||||||
|
top_k: 20
|
||||||
|
repetition_penalty: 1.05
|
||||||
|
use_temperature: 0.6
|
||||||
|
reasoning_tag: think
|
||||||
|
editor_edit_format: editor-diff
|
||||||
|
editor_model_name: "openai/qwen-coder-32B"
|
||||||
|
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
healthCheckTimeout: 300
|
||||||
|
logLevel: debug
|
||||||
|
|
||||||
|
profiles:
|
||||||
|
aider:
|
||||||
|
- qwen-coder-32B
|
||||||
|
- QwQ
|
||||||
|
|
||||||
|
models:
|
||||||
|
"qwen-coder-32B":
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=0"
|
||||||
|
aliases:
|
||||||
|
- coder
|
||||||
|
proxy: "http://127.0.0.1:8999"
|
||||||
|
|
||||||
|
# set appropriate paths for your environment
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 8999 --flash-attn --slots
|
||||||
|
--ctx-size 16000
|
||||||
|
--ctx-size-draft 16000
|
||||||
|
--model /path/to/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf
|
||||||
|
--model-draft /path/to/Qwen2.5-Coder-1.5B-Instruct-Q8_0.gguf
|
||||||
|
-ngl 99 -ngld 99
|
||||||
|
--draft-max 16 --draft-min 4 --draft-p-min 0.4
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
"QwQ":
|
||||||
|
env:
|
||||||
|
- "CUDA_VISIBLE_DEVICES=1"
|
||||||
|
proxy: "http://127.0.0.1:9503"
|
||||||
|
|
||||||
|
# set appropriate paths for your environment
|
||||||
|
cmd: >
|
||||||
|
/path/to/llama-server
|
||||||
|
--host 127.0.0.1 --port 9503
|
||||||
|
--flash-attn --metrics
|
||||||
|
--slots
|
||||||
|
--model /path/to/Qwen_QwQ-32B-Q4_K_M.gguf
|
||||||
|
--cache-type-k q8_0 --cache-type-v q8_0
|
||||||
|
--ctx-size 32000
|
||||||
|
--samplers "top_k;top_p;min_p;temperature;dry;typ_p;xtc"
|
||||||
|
--temp 0.6
|
||||||
|
--repeat-penalty 1.1
|
||||||
|
--dry-multiplier 0.5
|
||||||
|
--min-p 0.01
|
||||||
|
--top-k 40
|
||||||
|
--top-p 0.95
|
||||||
|
-ngl 99 -ngld 99
|
||||||
@@ -303,6 +303,11 @@ func (p *Process) Shutdown() {
|
|||||||
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
||||||
// If it does not exit within 5 seconds, it will send a SIGKILL.
|
// If it does not exit within 5 seconds, it will send a SIGKILL.
|
||||||
func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
||||||
|
stopStartTime := time.Now()
|
||||||
|
defer func() {
|
||||||
|
p.proxyLogger.Debugf("Process [%s] stopCommand took %v", p.ID, time.Since(stopStartTime))
|
||||||
|
}()
|
||||||
|
|
||||||
sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
|
sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
|
||||||
defer cancelTimeout()
|
defer cancelTimeout()
|
||||||
|
|
||||||
@@ -369,6 +374,8 @@ func (p *Process) checkHealthEndpoint(healthURL string) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
||||||
|
requestBeginTime := time.Now()
|
||||||
|
var startDuration time.Duration
|
||||||
|
|
||||||
// prevent new requests from being made while stopping or irrecoverable
|
// prevent new requests from being made while stopping or irrecoverable
|
||||||
currentState := p.CurrentState()
|
currentState := p.CurrentState()
|
||||||
@@ -385,11 +392,13 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
// start the process on demand
|
// start the process on demand
|
||||||
if p.CurrentState() != StateReady {
|
if p.CurrentState() != StateReady {
|
||||||
|
beginStartTime := time.Now()
|
||||||
if err := p.start(); err != nil {
|
if err := p.start(); err != nil {
|
||||||
errstr := fmt.Sprintf("unable to start process: %s", err)
|
errstr := fmt.Sprintf("unable to start process: %s", err)
|
||||||
http.Error(w, errstr, http.StatusBadGateway)
|
http.Error(w, errstr, http.StatusBadGateway)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
startDuration = time.Since(beginStartTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
proxyTo := p.config.Proxy
|
proxyTo := p.config.Proxy
|
||||||
@@ -433,4 +442,8 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
totalTime := time.Since(requestBeginTime)
|
||||||
|
p.proxyLogger.Debugf("Process [%s] request %s - start: %v, total: %v",
|
||||||
|
p.ID, r.RequestURI, startDuration, totalTime)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user