Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f58c8c8ec5 | |||
| 954e2dee73 | |||
| a533aec736 | |||
| 97b17fc47d | |||
| 2457840698 | |||
| 7f55494151 | |||
| 831a90d3b0 | |||
| 977f1856bb | |||
| 52b329f7bc | |||
| 57803fd3aa | |||
| c55d0cc842 | |||
| 7acbaf4712 |
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||||
|
|
||||||
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary or use the provided docker images.
|
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file). To get started, download a pre-built binary, a provided docker images or Homebrew.
|
||||||
|
|
||||||
## Features:
|
## Features:
|
||||||
|
|
||||||
@@ -18,9 +18,12 @@ Written in golang, it is very easy to install (single binary with no dependencie
|
|||||||
- `v1/completions`
|
- `v1/completions`
|
||||||
- `v1/chat/completions`
|
- `v1/chat/completions`
|
||||||
- `v1/embeddings`
|
- `v1/embeddings`
|
||||||
- `v1/rerank`, `v1/reranking`, `rerank`
|
|
||||||
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
||||||
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
|
- `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
|
||||||
|
- ✅ llama-server (llama.cpp) supported endpoints:
|
||||||
|
- `v1/rerank`, `v1/reranking`, `/rerank`
|
||||||
|
- `/infill` - for code infilling
|
||||||
|
- `/completion` - for completion endpoint
|
||||||
- ✅ llama-swap custom API endpoints
|
- ✅ llama-swap custom API endpoints
|
||||||
- `/ui` - web UI
|
- `/ui` - web UI
|
||||||
- `/log` - remote log monitoring
|
- `/log` - remote log monitoring
|
||||||
@@ -31,7 +34,7 @@ Written in golang, it is very easy to install (single binary with no dependencie
|
|||||||
- ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
|
- ✅ Run multiple models at once with `Groups` ([#107](https://github.com/mostlygeek/llama-swap/issues/107))
|
||||||
- ✅ Automatic unloading of models after timeout by setting a `ttl`
|
- ✅ Automatic unloading of models after timeout by setting a `ttl`
|
||||||
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
|
- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
|
||||||
- ✅ Reliable Docker and Podman support with `cmdStart` and `cmdStop`
|
- ✅ Reliable Docker and Podman support using `cmd` and `cmdStop` together
|
||||||
- ✅ Full control over server settings per model
|
- ✅ Full control over server settings per model
|
||||||
- ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
|
- ✅ Preload models on startup with `hooks` ([#235](https://github.com/mostlygeek/llama-swap/pull/235))
|
||||||
|
|
||||||
@@ -204,4 +207,7 @@ For Python based inference servers like vllm or tabbyAPI it is recommended to ru
|
|||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> ⭐️ Star this project to help others discover it!
|
||||||
|
|
||||||
[](https://www.star-history.com/#mostlygeek/llama-swap&Date)
|
[](https://www.star-history.com/#mostlygeek/llama-swap&Date)
|
||||||
|
|||||||
+15
-6
@@ -49,8 +49,8 @@ macros:
|
|||||||
# - required
|
# - required
|
||||||
# - each key is the model's ID, used in API requests
|
# - each key is the model's ID, used in API requests
|
||||||
# - model settings have default values that are used if they are not defined here
|
# - model settings have default values that are used if they are not defined here
|
||||||
# - below are examples of the various settings a model can have:
|
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
|
||||||
# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
|
# - below are examples of the all the settings a model can have
|
||||||
models:
|
models:
|
||||||
|
|
||||||
# keys are the model names used in API requests
|
# keys are the model names used in API requests
|
||||||
@@ -129,6 +129,15 @@ models:
|
|||||||
# - recommended to stick to sampling parameters
|
# - recommended to stick to sampling parameters
|
||||||
strip_params: "temperature, top_p, top_k"
|
strip_params: "temperature, top_p, top_k"
|
||||||
|
|
||||||
|
# concurrencyLimit: overrides the allowed number of active parallel requests to a model
|
||||||
|
# - optional, default: 0
|
||||||
|
# - useful for limiting the number of active parallel requests a model can process
|
||||||
|
# - must be set per model
|
||||||
|
# - any number greater than 0 will override the internal default value of 10
|
||||||
|
# - any requests that exceeds the limit will receive an HTTP 429 Too Many Requests response
|
||||||
|
# - recommended to be omitted and the default used
|
||||||
|
concurrencyLimit: 0
|
||||||
|
|
||||||
# Unlisted model example:
|
# Unlisted model example:
|
||||||
"qwen-unlisted":
|
"qwen-unlisted":
|
||||||
# unlisted: boolean, true or false
|
# unlisted: boolean, true or false
|
||||||
@@ -139,12 +148,12 @@ models:
|
|||||||
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
|
|
||||||
# Docker example:
|
# Docker example:
|
||||||
# container run times like Docker and Podman can be used reliably with a
|
# container runtimes like Docker and Podman can be used reliably with
|
||||||
# a combination of cmd and cmdStop.
|
# a combination of cmd, cmdStop, and ${MODEL_ID}
|
||||||
"docker-llama":
|
"docker-llama":
|
||||||
proxy: "http://127.0.0.1:${PORT}"
|
proxy: "http://127.0.0.1:${PORT}"
|
||||||
cmd: |
|
cmd: |
|
||||||
docker run --name dockertest
|
docker run --name ${MODEL_ID}
|
||||||
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
|
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
|
||||||
ghcr.io/ggml-org/llama.cpp:server
|
ghcr.io/ggml-org/llama.cpp:server
|
||||||
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
@@ -158,7 +167,7 @@ models:
|
|||||||
# - on POSIX systems: a SIGTERM signal is sent
|
# - on POSIX systems: a SIGTERM signal is sent
|
||||||
# - on Windows, calls taskkill to stop the process
|
# - on Windows, calls taskkill to stop the process
|
||||||
# - processes have 5 seconds to shutdown until forceful termination is attempted
|
# - processes have 5 seconds to shutdown until forceful termination is attempted
|
||||||
cmdStop: docker stop dockertest
|
cmdStop: docker stop ${MODEL_ID}
|
||||||
|
|
||||||
# groups: a dictionary of group settings
|
# groups: a dictionary of group settings
|
||||||
# - optional, default: empty dictionary
|
# - optional, default: empty dictionary
|
||||||
|
|||||||
@@ -153,6 +153,19 @@ func main() {
|
|||||||
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// llama-server compatibility: /completion
|
||||||
|
r.POST("/completion", func(c *gin.Context) {
|
||||||
|
c.Header("Content-Type", "application/json")
|
||||||
|
c.JSON(http.StatusOK, gin.H{
|
||||||
|
"responseMessage": *responseMessage,
|
||||||
|
"usage": gin.H{
|
||||||
|
"completion_tokens": 10,
|
||||||
|
"prompt_tokens": 25,
|
||||||
|
"total_tokens": 35,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
// issue #41
|
// issue #41
|
||||||
r.POST("/v1/audio/transcriptions", func(c *gin.Context) {
|
r.POST("/v1/audio/transcriptions", func(c *gin.Context) {
|
||||||
// Parse the multipart form
|
// Parse the multipart form
|
||||||
|
|||||||
+7
-1
@@ -237,7 +237,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
|
|
||||||
- name must fit the regex ^[a-zA-Z0-9_-]+$
|
- name must fit the regex ^[a-zA-Z0-9_-]+$
|
||||||
- names must be less than 64 characters (no reason, just cause)
|
- names must be less than 64 characters (no reason, just cause)
|
||||||
- name can not be any reserved macros: PORT
|
- name can not be any reserved macros: PORT, MODEL_ID
|
||||||
- macro values must be less than 1024 characters
|
- macro values must be less than 1024 characters
|
||||||
*/
|
*/
|
||||||
macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
|
macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
|
||||||
@@ -253,6 +253,7 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
}
|
}
|
||||||
switch macroName {
|
switch macroName {
|
||||||
case "PORT":
|
case "PORT":
|
||||||
|
case "MODEL_ID":
|
||||||
return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
|
return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -296,6 +297,11 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
|||||||
nextPort++
|
nextPort++
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if strings.Contains(modelConfig.Cmd, "${MODEL_ID}") || strings.Contains(modelConfig.CmdStop, "${MODEL_ID}") {
|
||||||
|
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${MODEL_ID}", modelId)
|
||||||
|
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${MODEL_ID}", modelId)
|
||||||
|
}
|
||||||
|
|
||||||
// make sure there are no unknown macros that have not been replaced
|
// make sure there are no unknown macros that have not been replaced
|
||||||
macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
|
macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
|
||||||
fieldMap := map[string]string{
|
fieldMap := map[string]string{
|
||||||
|
|||||||
@@ -440,3 +440,44 @@ models:
|
|||||||
expectedCmd := "/user/llama.cpp/build/bin/llama-server --port 9990 --model /path/to/model.gguf -ngl 99"
|
expectedCmd := "/user/llama.cpp/build/bin/llama-server --port 9990 --model /path/to/model.gguf -ngl 99"
|
||||||
assert.Equal(t, expectedCmd, cmdStr, "Final command does not match expected structure")
|
assert.Equal(t, expectedCmd, cmdStr, "Final command does not match expected structure")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConfig_MacroModelId(t *testing.T) {
|
||||||
|
content := `
|
||||||
|
startPort: 9000
|
||||||
|
macros:
|
||||||
|
"docker-llama": docker run --name ${MODEL_ID} -p ${PORT}:8080 docker_img
|
||||||
|
"docker-stop": docker stop ${MODEL_ID}
|
||||||
|
|
||||||
|
models:
|
||||||
|
model1:
|
||||||
|
cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
|
||||||
|
|
||||||
|
model2:
|
||||||
|
cmd: ${docker-llama}
|
||||||
|
cmdStop: ${docker-stop}
|
||||||
|
|
||||||
|
author/model:F16:
|
||||||
|
cmd: /path/to/server -p ${PORT} -hf ${MODEL_ID}
|
||||||
|
cmdStop: stop
|
||||||
|
`
|
||||||
|
|
||||||
|
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||||
|
assert.NoError(t, err)
|
||||||
|
sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, "/path/to/server -p 9001 -hf model1", strings.Join(sanitizedCmd, " "))
|
||||||
|
|
||||||
|
assert.Equal(t, "docker stop ${MODEL_ID}", config.Macros["docker-stop"])
|
||||||
|
|
||||||
|
sanitizedCmd2, err := SanitizeCommand(config.Models["model2"].Cmd)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, "docker run --name model2 -p 9002:8080 docker_img", strings.Join(sanitizedCmd2, " "))
|
||||||
|
|
||||||
|
sanitizedCmdStop, err := SanitizeCommand(config.Models["model2"].CmdStop)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, "docker stop model2", strings.Join(sanitizedCmdStop, " "))
|
||||||
|
|
||||||
|
sanitizedCmd3, err := SanitizeCommand(config.Models["author/model:F16"].Cmd)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
assert.Equal(t, "/path/to/server -p 9000 -hf author/model:F16", strings.Join(sanitizedCmd3, " "))
|
||||||
|
}
|
||||||
|
|||||||
+33
-22
@@ -5,12 +5,20 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/tidwall/gjson"
|
"github.com/tidwall/gjson"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type MetricsRecorder struct {
|
||||||
|
metricsMonitor *MetricsMonitor
|
||||||
|
realModelName string
|
||||||
|
// isStreaming bool
|
||||||
|
startTime time.Time
|
||||||
|
}
|
||||||
|
|
||||||
// MetricsMiddleware sets up the MetricsResponseWriter for capturing upstream requests
|
// MetricsMiddleware sets up the MetricsResponseWriter for capturing upstream requests
|
||||||
func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
||||||
return func(c *gin.Context) {
|
return func(c *gin.Context) {
|
||||||
@@ -41,57 +49,60 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
|||||||
metricsRecorder: &MetricsRecorder{
|
metricsRecorder: &MetricsRecorder{
|
||||||
metricsMonitor: pm.metricsMonitor,
|
metricsMonitor: pm.metricsMonitor,
|
||||||
realModelName: realModelName,
|
realModelName: realModelName,
|
||||||
isStreaming: gjson.GetBytes(bodyBytes, "stream").Bool(),
|
|
||||||
startTime: time.Now(),
|
startTime: time.Now(),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
c.Writer = writer
|
c.Writer = writer
|
||||||
c.Next()
|
c.Next()
|
||||||
|
|
||||||
rec := writer.metricsRecorder
|
// check for streaming response
|
||||||
rec.processBody(writer.body)
|
if strings.Contains(c.Writer.Header().Get("Content-Type"), "text/event-stream") {
|
||||||
}
|
writer.metricsRecorder.processStreamingResponse(writer.body)
|
||||||
}
|
|
||||||
|
|
||||||
type MetricsRecorder struct {
|
|
||||||
metricsMonitor *MetricsMonitor
|
|
||||||
realModelName string
|
|
||||||
isStreaming bool
|
|
||||||
startTime time.Time
|
|
||||||
}
|
|
||||||
|
|
||||||
// processBody handles response processing after request completes
|
|
||||||
func (rec *MetricsRecorder) processBody(body []byte) {
|
|
||||||
if rec.isStreaming {
|
|
||||||
rec.processStreamingResponse(body)
|
|
||||||
} else {
|
} else {
|
||||||
rec.processNonStreamingResponse(body)
|
writer.metricsRecorder.processNonStreamingResponse(writer.body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
|
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
|
||||||
usage := jsonData.Get("usage")
|
usage := jsonData.Get("usage")
|
||||||
if !usage.Exists() {
|
timings := jsonData.Get("timings")
|
||||||
|
if !usage.Exists() && !timings.Exists() {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// default values
|
// default values
|
||||||
outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
|
cachedTokens := -1 // unknown or missing data
|
||||||
inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
|
outputTokens := 0
|
||||||
|
inputTokens := 0
|
||||||
|
|
||||||
|
// timings data
|
||||||
tokensPerSecond := -1.0
|
tokensPerSecond := -1.0
|
||||||
promptPerSecond := -1.0
|
promptPerSecond := -1.0
|
||||||
durationMs := int(time.Since(rec.startTime).Milliseconds())
|
durationMs := int(time.Since(rec.startTime).Milliseconds())
|
||||||
|
|
||||||
|
if usage.Exists() {
|
||||||
|
outputTokens = int(jsonData.Get("usage.completion_tokens").Int())
|
||||||
|
inputTokens = int(jsonData.Get("usage.prompt_tokens").Int())
|
||||||
|
}
|
||||||
|
|
||||||
// use llama-server's timing data for tok/sec and duration as it is more accurate
|
// use llama-server's timing data for tok/sec and duration as it is more accurate
|
||||||
if timings := jsonData.Get("timings"); timings.Exists() {
|
if timings.Exists() {
|
||||||
|
inputTokens = int(jsonData.Get("timings.prompt_n").Int())
|
||||||
|
outputTokens = int(jsonData.Get("timings.predicted_n").Int())
|
||||||
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
|
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
|
||||||
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
|
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
|
||||||
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
|
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
|
||||||
|
|
||||||
|
if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
|
||||||
|
cachedTokens = int(cachedValue.Int())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rec.metricsMonitor.addMetrics(TokenMetrics{
|
rec.metricsMonitor.addMetrics(TokenMetrics{
|
||||||
Timestamp: time.Now(),
|
Timestamp: time.Now(),
|
||||||
Model: rec.realModelName,
|
Model: rec.realModelName,
|
||||||
|
CachedTokens: cachedTokens,
|
||||||
InputTokens: inputTokens,
|
InputTokens: inputTokens,
|
||||||
OutputTokens: outputTokens,
|
OutputTokens: outputTokens,
|
||||||
PromptPerSecond: promptPerSecond,
|
PromptPerSecond: promptPerSecond,
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ type TokenMetrics struct {
|
|||||||
ID int `json:"id"`
|
ID int `json:"id"`
|
||||||
Timestamp time.Time `json:"timestamp"`
|
Timestamp time.Time `json:"timestamp"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
|
CachedTokens int `json:"cache_tokens"`
|
||||||
InputTokens int `json:"input_tokens"`
|
InputTokens int `json:"input_tokens"`
|
||||||
OutputTokens int `json:"output_tokens"`
|
OutputTokens int `json:"output_tokens"`
|
||||||
PromptPerSecond float64 `json:"prompt_per_second"`
|
PromptPerSecond float64 `json:"prompt_per_second"`
|
||||||
@@ -61,7 +62,6 @@ func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
|
|||||||
if len(mp.metrics) > mp.maxMetrics {
|
if len(mp.metrics) > mp.maxMetrics {
|
||||||
mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
|
mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
|
||||||
}
|
}
|
||||||
|
|
||||||
event.Emit(TokenMetricsEvent{Metrics: metric})
|
event.Emit(TokenMetricsEvent{Metrics: metric})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
@@ -363,8 +364,18 @@ func (p *Process) stopCommand() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) checkHealthEndpoint(healthURL string) error {
|
func (p *Process) checkHealthEndpoint(healthURL string) error {
|
||||||
|
|
||||||
client := &http.Client{
|
client := &http.Client{
|
||||||
|
// wait a short time for a tcp connection to be established
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: (&net.Dialer{
|
||||||
Timeout: 500 * time.Millisecond,
|
Timeout: 500 * time.Millisecond,
|
||||||
|
}).DialContext,
|
||||||
|
},
|
||||||
|
|
||||||
|
// give a long time to respond to the health check endpoint
|
||||||
|
// after the connection is established. See issue: 276
|
||||||
|
Timeout: 5000 * time.Millisecond,
|
||||||
}
|
}
|
||||||
|
|
||||||
req, err := http.NewRequest("GET", healthURL, nil)
|
req, err := http.NewRequest("GET", healthURL, nil)
|
||||||
|
|||||||
@@ -60,10 +60,20 @@ func (pg *ProcessGroup) ProxyRequest(modelID string, writer http.ResponseWriter,
|
|||||||
if pg.swap {
|
if pg.swap {
|
||||||
pg.Lock()
|
pg.Lock()
|
||||||
if pg.lastUsedProcess != modelID {
|
if pg.lastUsedProcess != modelID {
|
||||||
|
|
||||||
|
// is there something already running?
|
||||||
if pg.lastUsedProcess != "" {
|
if pg.lastUsedProcess != "" {
|
||||||
pg.processes[pg.lastUsedProcess].Stop()
|
pg.processes[pg.lastUsedProcess].Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// wait for the request to the new model to be fully handled
|
||||||
|
// and prevent race conditions see issue #277
|
||||||
|
pg.processes[modelID].ProxyRequest(writer, request)
|
||||||
pg.lastUsedProcess = modelID
|
pg.lastUsedProcess = modelID
|
||||||
|
|
||||||
|
// short circuit and exit
|
||||||
|
pg.Unlock()
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
pg.Unlock()
|
pg.Unlock()
|
||||||
}
|
}
|
||||||
|
|||||||
+34
-16
@@ -4,6 +4,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
@@ -44,32 +45,49 @@ func TestProcessGroup_HasMember(t *testing.T) {
|
|||||||
assert.False(t, pg.HasMember("model3"))
|
assert.False(t, pg.HasMember("model3"))
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcessGroup_ProxyRequestSwapIsTrue(t *testing.T) {
|
// TestProcessGroup_ProxyRequestSwapIsTrueParallel tests that when swap is true
|
||||||
|
// and multiple requests are made in parallel, only one process is running at a time.
|
||||||
|
func TestProcessGroup_ProxyRequestSwapIsTrueParallel(t *testing.T) {
|
||||||
|
var processGroupTestConfig = AddDefaultGroupToConfig(Config{
|
||||||
|
HealthCheckTimeout: 15,
|
||||||
|
Models: map[string]ModelConfig{
|
||||||
|
// use the same listening so if a model is already running, it will fail
|
||||||
|
// this is a way to test that swap isolation is working
|
||||||
|
// properly when there are parallel requests made at the
|
||||||
|
// same time.
|
||||||
|
"model1": getTestSimpleResponderConfigPort("model1", 9832),
|
||||||
|
"model2": getTestSimpleResponderConfigPort("model2", 9832),
|
||||||
|
"model3": getTestSimpleResponderConfigPort("model3", 9832),
|
||||||
|
"model4": getTestSimpleResponderConfigPort("model4", 9832),
|
||||||
|
"model5": getTestSimpleResponderConfigPort("model5", 9832),
|
||||||
|
},
|
||||||
|
Groups: map[string]GroupConfig{
|
||||||
|
"G1": {
|
||||||
|
Swap: true,
|
||||||
|
Members: []string{"model1", "model2", "model3", "model4", "model5"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
pg := NewProcessGroup("G1", processGroupTestConfig, testLogger, testLogger)
|
pg := NewProcessGroup("G1", processGroupTestConfig, testLogger, testLogger)
|
||||||
defer pg.StopProcesses(StopWaitForInflightRequest)
|
defer pg.StopProcesses(StopWaitForInflightRequest)
|
||||||
|
|
||||||
tests := []string{"model1", "model2"}
|
tests := []string{"model1", "model2", "model3", "model4", "model5"}
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
wg.Add(len(tests))
|
||||||
for _, modelName := range tests {
|
for _, modelName := range tests {
|
||||||
t.Run(modelName, func(t *testing.T) {
|
go func(modelName string) {
|
||||||
reqBody := `{"x", "y"}`
|
defer wg.Done()
|
||||||
req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
|
req := httptest.NewRequest("POST", "/v1/chat/completions", nil)
|
||||||
w := httptest.NewRecorder()
|
w := httptest.NewRecorder()
|
||||||
|
|
||||||
assert.NoError(t, pg.ProxyRequest(modelName, w, req))
|
assert.NoError(t, pg.ProxyRequest(modelName, w, req))
|
||||||
assert.Equal(t, http.StatusOK, w.Code)
|
assert.Equal(t, http.StatusOK, w.Code)
|
||||||
assert.Contains(t, w.Body.String(), modelName)
|
assert.Contains(t, w.Body.String(), modelName)
|
||||||
|
}(modelName)
|
||||||
// make sure only one process is in the running state
|
|
||||||
count := 0
|
|
||||||
for _, process := range pg.processes {
|
|
||||||
if process.CurrentState() == StateReady {
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert.Equal(t, 1, count)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcessGroup_ProxyRequestSwapIsFalse(t *testing.T) {
|
func TestProcessGroup_ProxyRequestSwapIsFalse(t *testing.T) {
|
||||||
|
|||||||
+11
-2
@@ -191,11 +191,20 @@ func (pm *ProxyManager) setupGinEngine() {
|
|||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
// Support embeddings
|
// Support embeddings and reranking
|
||||||
pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
|
// llama-server's /reranking endpoint + aliases
|
||||||
|
pm.ginEngine.POST("/reranking", mm, pm.proxyOAIHandler)
|
||||||
|
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
|
||||||
pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
|
||||||
pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
|
||||||
pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
|
|
||||||
|
// llama-server's /infill endpoint for code infilling
|
||||||
|
pm.ginEngine.POST("/infill", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
|
// llama-server's /completion endpoint
|
||||||
|
pm.ginEngine.POST("/completion", mm, pm.proxyOAIHandler)
|
||||||
|
|
||||||
// Support audio/speech endpoint
|
// Support audio/speech endpoint
|
||||||
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
|
||||||
|
|||||||
@@ -42,7 +42,6 @@ func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
|
|||||||
assert.Contains(t, w.Body.String(), modelName)
|
assert.Contains(t, w.Body.String(), modelName)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProxyManager_SwapMultiProcess(t *testing.T) {
|
func TestProxyManager_SwapMultiProcess(t *testing.T) {
|
||||||
config := AddDefaultGroupToConfig(Config{
|
config := AddDefaultGroupToConfig(Config{
|
||||||
HealthCheckTimeout: 15,
|
HealthCheckTimeout: 15,
|
||||||
@@ -834,6 +833,28 @@ func TestProxyManager_HealthEndpoint(t *testing.T) {
|
|||||||
assert.Equal(t, "OK", rec.Body.String())
|
assert.Equal(t, "OK", rec.Body.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ensure the custom llama-server /completion endpoint proxies correctly
|
||||||
|
func TestProxyManager_CompletionEndpoint(t *testing.T) {
|
||||||
|
config := AddDefaultGroupToConfig(Config{
|
||||||
|
HealthCheckTimeout: 15,
|
||||||
|
Models: map[string]ModelConfig{
|
||||||
|
"model1": getTestSimpleResponderConfig("model1"),
|
||||||
|
},
|
||||||
|
LogLevel: "error",
|
||||||
|
})
|
||||||
|
|
||||||
|
proxy := New(config)
|
||||||
|
defer proxy.StopProcesses(StopWaitForInflightRequest)
|
||||||
|
|
||||||
|
reqBody := `{"model":"model1"}`
|
||||||
|
req := httptest.NewRequest("POST", "/completion", bytes.NewBufferString(reqBody))
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
|
||||||
|
proxy.ServeHTTP(w, req)
|
||||||
|
assert.Equal(t, http.StatusOK, w.Code)
|
||||||
|
assert.Contains(t, w.Body.String(), "model1")
|
||||||
|
}
|
||||||
|
|
||||||
func TestProxyManager_StartupHooks(t *testing.T) {
|
func TestProxyManager_StartupHooks(t *testing.T) {
|
||||||
|
|
||||||
// using real YAML as the configuration has gotten more complex
|
// using real YAML as the configuration has gotten more complex
|
||||||
|
|||||||
+12
-22
@@ -1,33 +1,31 @@
|
|||||||
import { useEffect, useCallback } from "react";
|
import { useEffect, useCallback } from "react";
|
||||||
import { BrowserRouter as Router, Routes, Route, Navigate, NavLink } from "react-router-dom";
|
import { BrowserRouter as Router, Routes, Route, Navigate, NavLink } from "react-router-dom";
|
||||||
import { useTheme } from "./contexts/ThemeProvider";
|
import { useTheme } from "./contexts/ThemeProvider";
|
||||||
import { APIProvider } from "./contexts/APIProvider";
|
import { useAPI } from "./contexts/APIProvider";
|
||||||
import LogViewerPage from "./pages/LogViewer";
|
import LogViewerPage from "./pages/LogViewer";
|
||||||
import ModelPage from "./pages/Models";
|
import ModelPage from "./pages/Models";
|
||||||
import ActivityPage from "./pages/Activity";
|
import ActivityPage from "./pages/Activity";
|
||||||
import ConnectionStatus from "./components/ConnectionStatus";
|
import ConnectionStatusIcon from "./components/ConnectionStatus";
|
||||||
import { RiSunFill, RiMoonFill } from "react-icons/ri";
|
import { RiSunFill, RiMoonFill } from "react-icons/ri";
|
||||||
import { usePersistentState } from "./hooks/usePersistentState";
|
|
||||||
|
|
||||||
function App() {
|
function App() {
|
||||||
const { isNarrow, toggleTheme, isDarkMode } = useTheme();
|
const { isNarrow, toggleTheme, isDarkMode, appTitle, setAppTitle, setConnectionState } = useTheme();
|
||||||
const [appTitle, setAppTitle] = usePersistentState("app-title", "llama-swap");
|
|
||||||
|
|
||||||
const handleTitleChange = useCallback(
|
const handleTitleChange = useCallback(
|
||||||
(newTitle: string) => {
|
(newTitle: string) => {
|
||||||
setAppTitle(newTitle);
|
setAppTitle(newTitle.replace(/\n/g, "").trim().substring(0, 64) || "llama-swap");
|
||||||
document.title = newTitle;
|
|
||||||
},
|
},
|
||||||
[setAppTitle]
|
[setAppTitle]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const { connectionStatus } = useAPI();
|
||||||
|
|
||||||
|
// Synchronize the window.title connections state with the actual connection state
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
document.title = appTitle; // Set initial title
|
setConnectionState(connectionStatus);
|
||||||
}, [appTitle]);
|
}, [connectionStatus]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<Router basename="/ui/">
|
<Router basename="/ui/">
|
||||||
<APIProvider>
|
|
||||||
<div className="flex flex-col h-screen">
|
<div className="flex flex-col h-screen">
|
||||||
<nav className="bg-surface border-b border-border p-2 h-[75px]">
|
<nav className="bg-surface border-b border-border p-2 h-[75px]">
|
||||||
<div className="flex items-center justify-between mx-auto px-4 h-full">
|
<div className="flex items-center justify-between mx-auto px-4 h-full">
|
||||||
@@ -36,16 +34,11 @@ function App() {
|
|||||||
contentEditable
|
contentEditable
|
||||||
suppressContentEditableWarning
|
suppressContentEditableWarning
|
||||||
className="flex items-center p-0 outline-none hover:bg-gray-100 dark:hover:bg-gray-700 rounded px-1"
|
className="flex items-center p-0 outline-none hover:bg-gray-100 dark:hover:bg-gray-700 rounded px-1"
|
||||||
onBlur={(e) =>
|
onBlur={(e) => handleTitleChange(e.currentTarget.textContent || "(set title)")}
|
||||||
handleTitleChange(e.currentTarget.textContent?.replace(/\n/g, "").trim() || "llama-swap")
|
|
||||||
}
|
|
||||||
onKeyDown={(e) => {
|
onKeyDown={(e) => {
|
||||||
if (e.key === "Enter") {
|
if (e.key === "Enter") {
|
||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
const sanitizedText =
|
handleTitleChange(e.currentTarget.textContent || "(set title)");
|
||||||
e.currentTarget.textContent?.replace(/\n/g, "").trim().substring(0, 25) || "llama-swap";
|
|
||||||
handleTitleChange(sanitizedText);
|
|
||||||
e.currentTarget.textContent = sanitizedText;
|
|
||||||
e.currentTarget.blur();
|
e.currentTarget.blur();
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
@@ -57,18 +50,16 @@ function App() {
|
|||||||
<NavLink to="/" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
<NavLink to="/" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
||||||
Logs
|
Logs
|
||||||
</NavLink>
|
</NavLink>
|
||||||
|
|
||||||
<NavLink to="/models" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
<NavLink to="/models" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
||||||
Models
|
Models
|
||||||
</NavLink>
|
</NavLink>
|
||||||
|
|
||||||
<NavLink to="/activity" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
<NavLink to="/activity" className={({ isActive }) => (isActive ? "navlink active" : "navlink")}>
|
||||||
Activity
|
Activity
|
||||||
</NavLink>
|
</NavLink>
|
||||||
<button className="" onClick={toggleTheme}>
|
<button className="" onClick={toggleTheme}>
|
||||||
{isDarkMode ? <RiMoonFill /> : <RiSunFill />}
|
{isDarkMode ? <RiMoonFill /> : <RiSunFill />}
|
||||||
</button>
|
</button>
|
||||||
<ConnectionStatus />
|
<ConnectionStatusIcon />
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</nav>
|
</nav>
|
||||||
@@ -82,7 +73,6 @@ function App() {
|
|||||||
</Routes>
|
</Routes>
|
||||||
</main>
|
</main>
|
||||||
</div>
|
</div>
|
||||||
</APIProvider>
|
|
||||||
</Router>
|
</Router>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,21 +1,11 @@
|
|||||||
import { useAPI } from "../contexts/APIProvider";
|
import { useAPI } from "../contexts/APIProvider";
|
||||||
import { useEffect, useState, useMemo } from "react";
|
import { useMemo } from "react";
|
||||||
|
|
||||||
type ConnectionStatus = "disconnected" | "connecting" | "connected";
|
const ConnectionStatusIcon = () => {
|
||||||
|
const { connectionStatus } = useAPI();
|
||||||
const ConnectionStatus = () => {
|
|
||||||
const { getConnectionStatus } = useAPI();
|
|
||||||
const [eventStreamStatus, setEventStreamStatus] = useState<ConnectionStatus>("disconnected");
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
const interval = setInterval(() => {
|
|
||||||
setEventStreamStatus(getConnectionStatus());
|
|
||||||
}, 1000);
|
|
||||||
return () => clearInterval(interval);
|
|
||||||
});
|
|
||||||
|
|
||||||
const eventStatusColor = useMemo(() => {
|
const eventStatusColor = useMemo(() => {
|
||||||
switch (eventStreamStatus) {
|
switch (connectionStatus) {
|
||||||
case "connected":
|
case "connected":
|
||||||
return "bg-green-500";
|
return "bg-green-500";
|
||||||
case "connecting":
|
case "connecting":
|
||||||
@@ -24,13 +14,13 @@ const ConnectionStatus = () => {
|
|||||||
default:
|
default:
|
||||||
return "bg-red-500";
|
return "bg-red-500";
|
||||||
}
|
}
|
||||||
}, [eventStreamStatus]);
|
}, [connectionStatus]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="flex items-center" title={`event stream: ${eventStreamStatus}`}>
|
<div className="flex items-center" title={`event stream: ${connectionStatus}`}>
|
||||||
<span className={`inline-block w-3 h-3 rounded-full ${eventStatusColor} mr-2`}></span>
|
<span className={`inline-block w-3 h-3 rounded-full ${eventStatusColor} mr-2`}></span>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
export default ConnectionStatus;
|
export default ConnectionStatusIcon;
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import { useRef, createContext, useState, useContext, useEffect, useCallback, useMemo, type ReactNode } from "react";
|
import { useRef, createContext, useState, useContext, useEffect, useCallback, useMemo, type ReactNode } from "react";
|
||||||
|
import type { ConnectionState } from "../lib/types";
|
||||||
|
|
||||||
type ModelStatus = "ready" | "starting" | "stopping" | "stopped" | "shutdown" | "unknown";
|
type ModelStatus = "ready" | "starting" | "stopping" | "stopped" | "shutdown" | "unknown";
|
||||||
const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
|
const LOG_LENGTH_LIMIT = 1024 * 100; /* 100KB of log data */
|
||||||
@@ -20,13 +21,14 @@ interface APIProviderType {
|
|||||||
proxyLogs: string;
|
proxyLogs: string;
|
||||||
upstreamLogs: string;
|
upstreamLogs: string;
|
||||||
metrics: Metrics[];
|
metrics: Metrics[];
|
||||||
getConnectionStatus: () => "connected" | "connecting" | "disconnected";
|
connectionStatus: ConnectionState;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface Metrics {
|
interface Metrics {
|
||||||
id: number;
|
id: number;
|
||||||
timestamp: string;
|
timestamp: string;
|
||||||
model: string;
|
model: string;
|
||||||
|
cache_tokens: number;
|
||||||
input_tokens: number;
|
input_tokens: number;
|
||||||
output_tokens: number;
|
output_tokens: number;
|
||||||
prompt_per_second: number;
|
prompt_per_second: number;
|
||||||
@@ -53,6 +55,7 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
const [proxyLogs, setProxyLogs] = useState("");
|
const [proxyLogs, setProxyLogs] = useState("");
|
||||||
const [upstreamLogs, setUpstreamLogs] = useState("");
|
const [upstreamLogs, setUpstreamLogs] = useState("");
|
||||||
const [metrics, setMetrics] = useState<Metrics[]>([]);
|
const [metrics, setMetrics] = useState<Metrics[]>([]);
|
||||||
|
const [connectionStatus, setConnectionState] = useState<ConnectionState>("disconnected");
|
||||||
const apiEventSource = useRef<EventSource | null>(null);
|
const apiEventSource = useRef<EventSource | null>(null);
|
||||||
|
|
||||||
const [models, setModels] = useState<Model[]>([]);
|
const [models, setModels] = useState<Model[]>([]);
|
||||||
@@ -64,16 +67,6 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
});
|
});
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const getConnectionStatus = useCallback(() => {
|
|
||||||
if (apiEventSource.current?.readyState === EventSource.OPEN) {
|
|
||||||
return "connected";
|
|
||||||
} else if (apiEventSource.current?.readyState === EventSource.CONNECTING) {
|
|
||||||
return "connecting";
|
|
||||||
} else {
|
|
||||||
return "disconnected";
|
|
||||||
}
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
const enableAPIEvents = useCallback((enabled: boolean) => {
|
const enableAPIEvents = useCallback((enabled: boolean) => {
|
||||||
if (!enabled) {
|
if (!enabled) {
|
||||||
apiEventSource.current?.close();
|
apiEventSource.current?.close();
|
||||||
@@ -86,7 +79,9 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
const initialDelay = 1000; // 1 second
|
const initialDelay = 1000; // 1 second
|
||||||
|
|
||||||
const connect = () => {
|
const connect = () => {
|
||||||
|
apiEventSource.current = null;
|
||||||
const eventSource = new EventSource("/api/events");
|
const eventSource = new EventSource("/api/events");
|
||||||
|
setConnectionState("connecting");
|
||||||
|
|
||||||
eventSource.onopen = () => {
|
eventSource.onopen = () => {
|
||||||
// clear everything out on connect to keep things in sync
|
// clear everything out on connect to keep things in sync
|
||||||
@@ -94,6 +89,9 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
setUpstreamLogs("");
|
setUpstreamLogs("");
|
||||||
setMetrics([]); // clear metrics on reconnect
|
setMetrics([]); // clear metrics on reconnect
|
||||||
setModels([]); // clear models on reconnect
|
setModels([]); // clear models on reconnect
|
||||||
|
apiEventSource.current = eventSource;
|
||||||
|
retryCount = 0;
|
||||||
|
setConnectionState("connected");
|
||||||
};
|
};
|
||||||
|
|
||||||
eventSource.onmessage = (e: MessageEvent) => {
|
eventSource.onmessage = (e: MessageEvent) => {
|
||||||
@@ -138,14 +136,14 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
console.error(e.data, err);
|
console.error(e.data, err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
eventSource.onerror = () => {
|
eventSource.onerror = () => {
|
||||||
eventSource.close();
|
eventSource.close();
|
||||||
retryCount++;
|
retryCount++;
|
||||||
const delay = Math.min(initialDelay * Math.pow(2, retryCount - 1), 5000);
|
const delay = Math.min(initialDelay * Math.pow(2, retryCount - 1), 5000);
|
||||||
|
setConnectionState("disconnected");
|
||||||
setTimeout(connect, delay);
|
setTimeout(connect, delay);
|
||||||
};
|
};
|
||||||
|
|
||||||
apiEventSource.current = eventSource;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
connect();
|
connect();
|
||||||
@@ -213,7 +211,7 @@ export function APIProvider({ children, autoStartAPIEvents = true }: APIProvider
|
|||||||
proxyLogs,
|
proxyLogs,
|
||||||
upstreamLogs,
|
upstreamLogs,
|
||||||
metrics,
|
metrics,
|
||||||
getConnectionStatus,
|
connectionStatus,
|
||||||
}),
|
}),
|
||||||
[models, listModels, unloadAllModels, loadModel, enableAPIEvents, proxyLogs, upstreamLogs, metrics]
|
[models, listModels, unloadAllModels, loadModel, enableAPIEvents, proxyLogs, upstreamLogs, metrics]
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import { createContext, useContext, useEffect, type ReactNode, useMemo, useState } from "react";
|
import { createContext, useContext, useEffect, type ReactNode, useMemo, useState } from "react";
|
||||||
import { usePersistentState } from "../hooks/usePersistentState";
|
import { usePersistentState } from "../hooks/usePersistentState";
|
||||||
|
import type { ConnectionState } from "../lib/types";
|
||||||
|
|
||||||
type ScreenWidth = "xs" | "sm" | "md" | "lg" | "xl" | "2xl";
|
type ScreenWidth = "xs" | "sm" | "md" | "lg" | "xl" | "2xl";
|
||||||
type ThemeContextType = {
|
type ThemeContextType = {
|
||||||
@@ -7,6 +8,11 @@ type ThemeContextType = {
|
|||||||
screenWidth: ScreenWidth;
|
screenWidth: ScreenWidth;
|
||||||
isNarrow: boolean;
|
isNarrow: boolean;
|
||||||
toggleTheme: () => void;
|
toggleTheme: () => void;
|
||||||
|
|
||||||
|
// for managing the window title and connection state information
|
||||||
|
appTitle: string;
|
||||||
|
setAppTitle: (title: string) => void;
|
||||||
|
setConnectionState: (state: ConnectionState) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
const ThemeContext = createContext<ThemeContextType | undefined>(undefined);
|
const ThemeContext = createContext<ThemeContextType | undefined>(undefined);
|
||||||
@@ -16,6 +22,17 @@ type ThemeProviderProps = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export function ThemeProvider({ children }: ThemeProviderProps) {
|
export function ThemeProvider({ children }: ThemeProviderProps) {
|
||||||
|
const [appTitle, setAppTitle] = usePersistentState("app-title", "llama-swap");
|
||||||
|
const [connectionState, setConnectionState] = useState<ConnectionState>("disconnected");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the document.title with informative information
|
||||||
|
*/
|
||||||
|
useEffect(() => {
|
||||||
|
const connectionIcon = connectionState === "connecting" ? "🟡" : connectionState === "connected" ? "🟢" : "🔴";
|
||||||
|
document.title = connectionIcon + " " + appTitle; // Set initial title
|
||||||
|
}, [appTitle, connectionState]);
|
||||||
|
|
||||||
const [isDarkMode, setIsDarkMode] = usePersistentState<boolean>("theme", false);
|
const [isDarkMode, setIsDarkMode] = usePersistentState<boolean>("theme", false);
|
||||||
const [screenWidth, setScreenWidth] = useState<ScreenWidth>("md"); // Default to md
|
const [screenWidth, setScreenWidth] = useState<ScreenWidth>("md"); // Default to md
|
||||||
|
|
||||||
@@ -55,7 +72,19 @@ export function ThemeProvider({ children }: ThemeProviderProps) {
|
|||||||
}, [screenWidth]);
|
}, [screenWidth]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<ThemeContext.Provider value={{ isDarkMode, toggleTheme, screenWidth, isNarrow }}>{children}</ThemeContext.Provider>
|
<ThemeContext.Provider
|
||||||
|
value={{
|
||||||
|
isDarkMode,
|
||||||
|
toggleTheme,
|
||||||
|
screenWidth,
|
||||||
|
isNarrow,
|
||||||
|
appTitle,
|
||||||
|
setAppTitle,
|
||||||
|
setConnectionState,
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{children}
|
||||||
|
</ThemeContext.Provider>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
export type ConnectionState = "connected" | "connecting" | "disconnected";
|
||||||
@@ -3,11 +3,14 @@ import { createRoot } from "react-dom/client";
|
|||||||
import "./index.css";
|
import "./index.css";
|
||||||
import App from "./App.tsx";
|
import App from "./App.tsx";
|
||||||
import { ThemeProvider } from "./contexts/ThemeProvider";
|
import { ThemeProvider } from "./contexts/ThemeProvider";
|
||||||
|
import { APIProvider } from "./contexts/APIProvider";
|
||||||
|
|
||||||
createRoot(document.getElementById("root")!).render(
|
createRoot(document.getElementById("root")!).render(
|
||||||
<StrictMode>
|
<StrictMode>
|
||||||
<ThemeProvider>
|
<ThemeProvider>
|
||||||
|
<APIProvider>
|
||||||
<App />
|
<App />
|
||||||
|
</APIProvider>
|
||||||
</ThemeProvider>
|
</ThemeProvider>
|
||||||
</StrictMode>
|
</StrictMode>
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,10 +1,6 @@
|
|||||||
import { useMemo } from "react";
|
import { useMemo } from "react";
|
||||||
import { useAPI } from "../contexts/APIProvider";
|
import { useAPI } from "../contexts/APIProvider";
|
||||||
|
|
||||||
const formatTimestamp = (timestamp: string): string => {
|
|
||||||
return new Date(timestamp).toLocaleString();
|
|
||||||
};
|
|
||||||
|
|
||||||
const formatSpeed = (speed: number): string => {
|
const formatSpeed = (speed: number): string => {
|
||||||
return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
|
return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
|
||||||
};
|
};
|
||||||
@@ -13,6 +9,33 @@ const formatDuration = (ms: number): string => {
|
|||||||
return (ms / 1000).toFixed(2) + "s";
|
return (ms / 1000).toFixed(2) + "s";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const formatRelativeTime = (timestamp: string): string => {
|
||||||
|
const now = new Date();
|
||||||
|
const date = new Date(timestamp);
|
||||||
|
const diffInSeconds = Math.floor((now.getTime() - date.getTime()) / 1000);
|
||||||
|
|
||||||
|
// Handle future dates by returning "just now"
|
||||||
|
if (diffInSeconds < 5) {
|
||||||
|
return "now";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (diffInSeconds < 60) {
|
||||||
|
return `${diffInSeconds}s ago`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const diffInMinutes = Math.floor(diffInSeconds / 60);
|
||||||
|
if (diffInMinutes < 60) {
|
||||||
|
return `${diffInMinutes}m ago`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const diffInHours = Math.floor(diffInMinutes / 60);
|
||||||
|
if (diffInHours < 24) {
|
||||||
|
return `${diffInHours}h ago`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return "a while ago";
|
||||||
|
};
|
||||||
|
|
||||||
const ActivityPage = () => {
|
const ActivityPage = () => {
|
||||||
const { metrics } = useAPI();
|
const { metrics } = useAPI();
|
||||||
const sortedMetrics = useMemo(() => {
|
const sortedMetrics = useMemo(() => {
|
||||||
@@ -32,11 +55,16 @@ const ActivityPage = () => {
|
|||||||
<table className="min-w-full divide-y">
|
<table className="min-w-full divide-y">
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">Id</th>
|
<th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">ID</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Timestamp</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Time</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
|
Cached <Tooltip content="prompt tokens from cache" />
|
||||||
|
</th>
|
||||||
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
|
||||||
|
Prompt <Tooltip content="new prompt tokens processed" />
|
||||||
|
</th>
|
||||||
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generated</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
|
||||||
@@ -46,8 +74,11 @@ const ActivityPage = () => {
|
|||||||
{sortedMetrics.map((metric) => (
|
{sortedMetrics.map((metric) => (
|
||||||
<tr key={`metric_${metric.id}`}>
|
<tr key={`metric_${metric.id}`}>
|
||||||
<td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
|
<td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatTimestamp(metric.timestamp)}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatRelativeTime(metric.timestamp)}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
|
||||||
|
<td className="px-6 py-4 whitespace-nowrap text-sm">
|
||||||
|
{metric.cache_tokens > 0 ? metric.cache_tokens.toLocaleString() : "-"}
|
||||||
|
</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
|
||||||
@@ -63,4 +94,28 @@ const ActivityPage = () => {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
interface TooltipProps {
|
||||||
|
content: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const Tooltip: React.FC<TooltipProps> = ({ content }) => {
|
||||||
|
return (
|
||||||
|
<div className="relative group inline-block">
|
||||||
|
ⓘ
|
||||||
|
<div
|
||||||
|
className="absolute top-full left-1/2 transform -translate-x-1/2 mt-2
|
||||||
|
px-3 py-2 bg-gray-900 text-white text-sm rounded-md
|
||||||
|
opacity-0 group-hover:opacity-100 transition-opacity
|
||||||
|
duration-200 pointer-events-none whitespace-nowrap z-50 normal-case"
|
||||||
|
>
|
||||||
|
{content}
|
||||||
|
<div
|
||||||
|
className="absolute bottom-full left-1/2 transform -translate-x-1/2
|
||||||
|
border-4 border-transparent border-b-gray-900"
|
||||||
|
></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
export default ActivityPage;
|
export default ActivityPage;
|
||||||
|
|||||||
Reference in New Issue
Block a user