Compare commits

..

5 Commits

Author SHA1 Message Date
Benson Wong 5672cb03fd Update github actions for notifying homebrew build (#212)
Combine homebrew-llama-swap event with the release action
2025-07-30 11:29:03 -07:00
Benson Wong 0f583163f7 add /health (#211) 2025-07-30 10:37:10 -07:00
Benson Wong 7905fa9ea3 Update trigger-homebrew-update.yml [skip ci] 2025-07-30 10:13:49 -07:00
Ian Sebastian Mathew bbaf172956 add trigger to rebuild homebrew formula (#210) 2025-07-30 10:12:21 -07:00
Benson Wong fd50932dbc Decouple MetricsMiddleware from downstream handlers (#206)
* Decouple MetricsMiddleware from downstream handlers

Remove ls-real-model-name optimization. Within proxyOAIHandler the
request body's bytes are required for various rewriting features
anyways. This negated any benefits from trying not to parse it twice.
2025-07-27 10:36:06 -07:00
5 changed files with 81 additions and 7 deletions
+33 -3
View File
@@ -7,6 +7,10 @@ on:
# Allows manual triggering of the workflow # Allows manual triggering of the workflow
workflow_dispatch: workflow_dispatch:
inputs:
tag:
description: 'Tag version to release (e.g. v144)'
required: true
permissions: permissions:
contents: write contents: write
@@ -20,15 +24,15 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
fetch-depth: 0 fetch-depth: 0
ref: ${{ github.event.inputs.tag || github.ref }}
- -
name: Set up Go name: Set up Go
uses: actions/setup-go@v5 uses: actions/setup-go@v5
- -
name: Set up Node.js name: Set up Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '23' # or your preferred version node-version: '23'
- -
name: Install dependencies and build UI name: Install dependencies and build UI
run: | run: |
@@ -46,4 +50,30 @@ jobs:
version: '~> v2' version: '~> v2'
args: release --clean args: release --clean
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
trigger-tap-update:
runs-on: ubuntu-latest
needs: goreleaser
steps:
- name: "Resolve tag to dispatch"
id: tag
run: |
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
echo "tag=${{ github.event.inputs.tag }}" >> "$GITHUB_OUTPUT"
else
echo "tag=${{ github.ref_name }}" >> "$GITHUB_OUTPUT"
fi
- name: "Trigger tap repository update"
uses: peter-evans/repository-dispatch@v2
with:
token: ${{ secrets.TAP_REPO_PAT }}
repository: mostlygeek/homebrew-llama-swap
event-type: new-release
client-payload: |
{
"release": {
"tag_name": "${{ steps.tag.outputs.tag }}"
}
}
+14
View File
@@ -122,6 +122,20 @@ $ docker run -it --rm --runtime nvidia -p 9292:8080 \
</details> </details>
## Homebrew Install (macOS/Linux)
For macOS & Linux users, `llama-swap` can be installed via [Homebrew](https://brew.sh):
```shell
# Set up tap and install formula
brew tap mostlygeek/llama-swap
brew install llama-swap
# Run llama-swap
llama-swap --config path/to/config.yaml --listen localhost:8080
```
This will install the `llama-swap` binary and make it available in your path. See the [configuration documentation](https://github.com/mostlygeek/llama-swap/wiki/Configuration)
## Bare metal Install ([download](https://github.com/mostlygeek/llama-swap/releases)) ## Bare metal Install ([download](https://github.com/mostlygeek/llama-swap/releases))
Pre-built binaries are available for Linux, Mac, Windows and FreeBSD. These are automatically published and are likely a few hours ahead of the docker releases. The baremetal install works with any OpenAI compatible server, not just llama-server. Pre-built binaries are available for Linux, Mac, Windows and FreeBSD. These are automatically published and are likely a few hours ahead of the docker releases. The baremetal install works with any OpenAI compatible server, not just llama-server.
+3 -1
View File
@@ -17,6 +17,7 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
bodyBytes, err := io.ReadAll(c.Request.Body) bodyBytes, err := io.ReadAll(c.Request.Body)
if err != nil { if err != nil {
pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body") pm.sendErrorResponse(c, http.StatusBadRequest, "could not ready request body")
c.Abort()
return return
} }
c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes)) c.Request.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
@@ -24,15 +25,16 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
requestedModel := gjson.GetBytes(bodyBytes, "model").String() requestedModel := gjson.GetBytes(bodyBytes, "model").String()
if requestedModel == "" { if requestedModel == "" {
pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key") pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
c.Abort()
return return
} }
realModelName, found := pm.config.RealModelName(requestedModel) realModelName, found := pm.config.RealModelName(requestedModel)
if !found { if !found {
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel)) pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
c.Abort()
return return
} }
c.Set("ls-real-model-name", realModelName)
writer := &MetricsResponseWriter{ writer := &MetricsResponseWriter{
ResponseWriter: c.Writer, ResponseWriter: c.Writer,
+13 -3
View File
@@ -14,6 +14,7 @@ import (
"time" "time"
"github.com/gin-gonic/gin" "github.com/gin-gonic/gin"
"github.com/tidwall/gjson"
"github.com/tidwall/sjson" "github.com/tidwall/sjson"
) )
@@ -190,6 +191,9 @@ func (pm *ProxyManager) setupGinEngine() {
pm.ginEngine.GET("/unload", pm.unloadAllModelsHandler) pm.ginEngine.GET("/unload", pm.unloadAllModelsHandler)
pm.ginEngine.GET("/running", pm.listRunningProcessesHandler) pm.ginEngine.GET("/running", pm.listRunningProcessesHandler)
pm.ginEngine.GET("/health", func(c *gin.Context) {
c.String(http.StatusOK, "OK")
})
pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) { pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
if data, err := reactStaticFS.ReadFile("ui_dist/favicon.ico"); err == nil { if data, err := reactStaticFS.ReadFile("ui_dist/favicon.ico"); err == nil {
@@ -367,9 +371,15 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
return return
} }
realModelName := c.GetString("ls-real-model-name") // Should be set in MetricsMiddleware requestedModel := gjson.GetBytes(bodyBytes, "model").String()
if realModelName == "" { if requestedModel == "" {
pm.sendErrorResponse(c, http.StatusInternalServerError, "ls-real-model-name not set") pm.sendErrorResponse(c, http.StatusBadRequest, "missing or invalid 'model' key")
return
}
realModelName, found := pm.config.RealModelName(requestedModel)
if !found {
pm.sendErrorResponse(c, http.StatusBadRequest, fmt.Sprintf("could not find real modelID for %s", requestedModel))
return return
} }
+18
View File
@@ -755,3 +755,21 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
assert.Greater(t, lastMetric.TokensPerSecond, 0.0, "tokens per second should be greater than 0") assert.Greater(t, lastMetric.TokensPerSecond, 0.0, "tokens per second should be greater than 0")
assert.Greater(t, lastMetric.DurationMs, 0, "duration should be greater than 0") assert.Greater(t, lastMetric.DurationMs, 0, "duration should be greater than 0")
} }
func TestProxyManager_HealthEndpoint(t *testing.T) {
config := AddDefaultGroupToConfig(Config{
HealthCheckTimeout: 15,
Models: map[string]ModelConfig{
"model1": getTestSimpleResponderConfig("model1"),
},
LogLevel: "error",
})
proxy := New(config)
defer proxy.StopProcesses(StopWaitForInflightRequest)
req := httptest.NewRequest("GET", "/health", nil)
rec := httptest.NewRecorder()
proxy.ServeHTTP(rec, req)
assert.Equal(t, http.StatusOK, rec.Code)
assert.Equal(t, "OK", rec.Body.String())
}