Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 85cd74a51c | |||
| 314d2f2212 | |||
| fad25f3e11 | |||
| 2c3e3e27f7 | |||
| baeb0c4e7f | |||
| 2833517eef | |||
| abdc2bfdb3 | |||
| c3b834737f | |||
| 3c8e727b73 |
@@ -5,10 +5,12 @@
|
|||||||
# Introduction
|
# Introduction
|
||||||
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
|
||||||
|
|
||||||
Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or built it yourself from source with `make clean all`.
|
Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file).
|
||||||
|
|
||||||
|
Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or build it yourself from source with `make clean all`.
|
||||||
|
|
||||||
## How does it work?
|
## How does it work?
|
||||||
When a request is made to an OpenAI compatible endpoints, lama-swap will extract the `model` value load the appropriate server configuration to serve it. If a server is already running it will stop it and start a new one. This is where the "swap" part comes in. The upstream server is automatically swapped to the correct one to serve the request.
|
When a request is made to an OpenAI compatible endpoint, llama-swap will extract the `model` value and load the appropriate server configuration to serve it. If a server is already running it will stop it and start the correct one. This is where the "swap" part comes in. The upstream server is automatically swapped to the correct one to serve the request.
|
||||||
|
|
||||||
In the most basic configuration llama-swap handles one model at a time. For more advanced use cases, the `profiles` feature can load multiple models at the same time. You have complete control over how your system resources are used.
|
In the most basic configuration llama-swap handles one model at a time. For more advanced use cases, the `profiles` feature can load multiple models at the same time. You have complete control over how your system resources are used.
|
||||||
|
|
||||||
@@ -26,8 +28,9 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
|
|||||||
- `v1/chat/completions`
|
- `v1/chat/completions`
|
||||||
- `v1/embeddings`
|
- `v1/embeddings`
|
||||||
- `v1/rerank`
|
- `v1/rerank`
|
||||||
- `v1/audio/speech`
|
- `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
|
||||||
- ✅ Multiple GPU support
|
- ✅ Multiple GPU support
|
||||||
|
- ✅ Docker and Podman support
|
||||||
- ✅ Run multiple models at once with `profiles`
|
- ✅ Run multiple models at once with `profiles`
|
||||||
- ✅ Remote log monitoring at `/log`
|
- ✅ Remote log monitoring at `/log`
|
||||||
- ✅ Automatic unloading of models from GPUs after timeout
|
- ✅ Automatic unloading of models from GPUs after timeout
|
||||||
@@ -87,6 +90,15 @@ models:
|
|||||||
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||||
unlisted: true
|
unlisted: true
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"docker-llama":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
# profiles make it easy to manage multi model (and gpu) configurations.
|
# profiles make it easy to manage multi model (and gpu) configurations.
|
||||||
#
|
#
|
||||||
# Tips:
|
# Tips:
|
||||||
|
|||||||
@@ -53,6 +53,14 @@ models:
|
|||||||
--ctx-size 8192
|
--ctx-size 8192
|
||||||
--reranking
|
--reranking
|
||||||
|
|
||||||
|
# Docker Support (v26.1.4+ required!)
|
||||||
|
"dockertest":
|
||||||
|
proxy: "http://127.0.0.1:9790"
|
||||||
|
cmd: >
|
||||||
|
docker run --name dockertest
|
||||||
|
--init --rm -p 9790:8080 -v /mnt/nvme/models:/models
|
||||||
|
ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||||
|
|
||||||
"simple":
|
"simple":
|
||||||
# example of setting environment variables
|
# example of setting environment variables
|
||||||
|
|||||||
@@ -4,6 +4,8 @@ import (
|
|||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/mostlygeek/llama-swap/proxy"
|
"github.com/mostlygeek/llama-swap/proxy"
|
||||||
@@ -39,6 +41,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
proxyManager := proxy.New(config)
|
proxyManager := proxy.New(config)
|
||||||
|
|
||||||
|
sigChan := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
go func() {
|
||||||
|
<-sigChan
|
||||||
|
fmt.Println("Shutting down llama-swap")
|
||||||
|
proxyManager.StopProcesses()
|
||||||
|
os.Exit(0)
|
||||||
|
}()
|
||||||
|
|
||||||
fmt.Println("llama-swap listening on " + *listenStr)
|
fmt.Println("llama-swap listening on " + *listenStr)
|
||||||
if err := proxyManager.Run(*listenStr); err != nil {
|
if err := proxyManager.Run(*listenStr); err != nil {
|
||||||
fmt.Printf("Server error: %v\n", err)
|
fmt.Printf("Server error: %v\n", err)
|
||||||
|
|||||||
+24
-54
@@ -2,7 +2,6 @@ package proxy
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -86,36 +85,11 @@ func (p *Process) start() error {
|
|||||||
// 3. The health check passes
|
// 3. The health check passes
|
||||||
//
|
//
|
||||||
// only in the third case will the process be considered Ready to accept
|
// only in the third case will the process be considered Ready to accept
|
||||||
healthCheckContext, cancelHealthCheck := context.WithCancelCause(context.Background())
|
<-time.After(250 * time.Millisecond) // give process a bit of time to start
|
||||||
defer cancelHealthCheck(nil) // clean up
|
|
||||||
cmdWaitChan := make(chan error, 1)
|
|
||||||
healthCheckChan := make(chan error, 1)
|
|
||||||
|
|
||||||
go func() {
|
if err := p.checkHealthEndpoint(); err != nil {
|
||||||
// possible cmd exits early
|
|
||||||
cmdWaitChan <- p.cmd.Wait()
|
|
||||||
}()
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
<-time.After(250 * time.Millisecond) // give process a bit of time to start
|
|
||||||
healthCheckChan <- p.checkHealthEndpoint(healthCheckContext)
|
|
||||||
}()
|
|
||||||
|
|
||||||
select {
|
|
||||||
case err := <-cmdWaitChan:
|
|
||||||
p.state = StateFailed
|
p.state = StateFailed
|
||||||
if err != nil {
|
|
||||||
err = fmt.Errorf("command [%s] %s", strings.Join(p.cmd.Args, " "), err.Error())
|
|
||||||
} else {
|
|
||||||
err = fmt.Errorf("command [%s] exited unexpected", strings.Join(p.cmd.Args, " "))
|
|
||||||
}
|
|
||||||
cancelHealthCheck(err)
|
|
||||||
return err
|
return err
|
||||||
case err := <-healthCheckChan:
|
|
||||||
if err != nil {
|
|
||||||
p.state = StateFailed
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.config.UnloadAfter > 0 {
|
if p.config.UnloadAfter > 0 {
|
||||||
@@ -135,6 +109,7 @@ func (p *Process) start() error {
|
|||||||
if time.Since(p.lastRequestHandled) > maxDuration {
|
if time.Since(p.lastRequestHandled) > maxDuration {
|
||||||
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
|
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
|
||||||
p.Stop()
|
p.Stop()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
@@ -152,20 +127,17 @@ func (p *Process) Stop() {
|
|||||||
defer p.stateMutex.Unlock()
|
defer p.stateMutex.Unlock()
|
||||||
|
|
||||||
if p.state != StateReady {
|
if p.state != StateReady {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! Info - Stop() called but Process State is not READY\n")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.cmd == nil || p.cmd.Process == nil {
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
// this situation should never happen... but if it does just update the state
|
// this situation should never happen... but if it does just update the state
|
||||||
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
|
fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pretty sure this stopping code needs some work for windows and
|
|
||||||
// will be a source of pain in the future.
|
|
||||||
|
|
||||||
p.cmd.Process.Signal(syscall.SIGTERM)
|
|
||||||
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
@@ -174,20 +146,31 @@ func (p *Process) Stop() {
|
|||||||
sigtermNormal <- p.cmd.Wait()
|
sigtermNormal <- p.cmd.Wait()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
p.cmd.Process.Signal(syscall.SIGTERM)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-sigtermTimeout.Done():
|
case <-sigtermTimeout.Done():
|
||||||
fmt.Fprintf(p.logMonitor, "!!! process for %s timed out waiting to stop\n", p.ID)
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] timed out waiting to stop, sending KILL signal\n", p.ID)
|
||||||
p.cmd.Process.Kill()
|
p.cmd.Process.Kill()
|
||||||
p.cmd.Wait()
|
|
||||||
case err := <-sigtermNormal:
|
case err := <-sigtermNormal:
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err.Error() != "wait: no child processes" {
|
if errno, ok := err.(syscall.Errno); ok {
|
||||||
// possible that simple-responder for testing is just not
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] errno >> %v\n", p.ID, errno)
|
||||||
// existing right, so suppress those errors.
|
} else if exitError, ok := err.(*exec.ExitError); ok {
|
||||||
fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
|
if strings.Contains(exitError.String(), "signal: terminated") {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] stopped OK\n", p.ID)
|
||||||
|
} else if strings.Contains(exitError.String(), "signal: interrupt") {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] interrupted OK\n", p.ID)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] ExitError >> %v, exit code: %d\n", p.ID, exitError, exitError.ExitCode())
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] exited >> %v\n", p.ID, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
p.state = StateStopped
|
p.state = StateStopped
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,7 +180,7 @@ func (p *Process) CurrentState() ProcessState {
|
|||||||
return p.state
|
return p.state
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
|
func (p *Process) checkHealthEndpoint() error {
|
||||||
if p.config.Proxy == "" {
|
if p.config.Proxy == "" {
|
||||||
return fmt.Errorf("no upstream available to check /health")
|
return fmt.Errorf("no upstream available to check /health")
|
||||||
}
|
}
|
||||||
@@ -229,24 +212,11 @@ func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctxFromStart, time.Second)
|
|
||||||
defer cancel()
|
|
||||||
req = req.WithContext(ctx)
|
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
|
|
||||||
ttl := (maxDuration - time.Since(startTime)).Seconds()
|
ttl := (maxDuration - time.Since(startTime)).Seconds()
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// check if the context was cancelled
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
err := context.Cause(ctx)
|
|
||||||
if !errors.Is(err, context.DeadlineExceeded) {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
}
|
|
||||||
|
|
||||||
// wait a bit longer for TCP connection issues
|
// wait a bit longer for TCP connection issues
|
||||||
if strings.Contains(err.Error(), "connection refused") {
|
if strings.Contains(err.Error(), "connection refused") {
|
||||||
fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl)
|
fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl)
|
||||||
@@ -294,7 +264,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
proxyTo := p.config.Proxy
|
proxyTo := p.config.Proxy
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
|
req, err := http.NewRequestWithContext(r.Context(), r.Method, proxyTo+r.URL.String(), r.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
return
|
return
|
||||||
|
|||||||
+28
-2
@@ -67,7 +67,6 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
|
|||||||
assert.Contains(t, w.Body.String(), "unable to start process")
|
assert.Contains(t, w.Body.String(), "unable to start process")
|
||||||
}
|
}
|
||||||
|
|
||||||
// test that the process unloads after the TTL
|
|
||||||
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
t.Skip("skipping long auto unload TTL test")
|
t.Skip("skipping long auto unload TTL test")
|
||||||
@@ -79,7 +78,7 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
|
|||||||
config.UnloadAfter = 3 // seconds
|
config.UnloadAfter = 3 // seconds
|
||||||
assert.Equal(t, 3, config.UnloadAfter)
|
assert.Equal(t, 3, config.UnloadAfter)
|
||||||
|
|
||||||
process := NewProcess("ttl", 2, config, NewLogMonitorWriter(io.Discard))
|
process := NewProcess("ttl_test", 2, config, NewLogMonitorWriter(io.Discard))
|
||||||
defer process.Stop()
|
defer process.Stop()
|
||||||
|
|
||||||
// this should take 4 seconds
|
// this should take 4 seconds
|
||||||
@@ -111,6 +110,33 @@ func TestProcess_UnloadAfterTTL(t *testing.T) {
|
|||||||
assert.Equal(t, StateStopped, process.CurrentState())
|
assert.Equal(t, StateStopped, process.CurrentState())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProcess_LowTTLValue(t *testing.T) {
|
||||||
|
if true { // change this code to run this ...
|
||||||
|
t.Skip("skipping test, edit process_test.go to run it ")
|
||||||
|
}
|
||||||
|
|
||||||
|
config := getTestSimpleResponderConfig("fast_ttl")
|
||||||
|
assert.Equal(t, 0, config.UnloadAfter)
|
||||||
|
config.UnloadAfter = 1 // second
|
||||||
|
assert.Equal(t, 1, config.UnloadAfter)
|
||||||
|
|
||||||
|
process := NewProcess("ttl", 2, config, NewLogMonitorWriter(os.Stdout))
|
||||||
|
defer process.Stop()
|
||||||
|
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
t.Logf("Waiting before sending request %d", i)
|
||||||
|
time.Sleep(1500 * time.Millisecond)
|
||||||
|
|
||||||
|
expected := fmt.Sprintf("echo=test_%d", i)
|
||||||
|
req := httptest.NewRequest("GET", fmt.Sprintf("/slow-respond?echo=%s&delay=50ms", expected), nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
process.ProxyRequest(w, req)
|
||||||
|
assert.Equal(t, http.StatusOK, w.Code)
|
||||||
|
assert.Contains(t, w.Body.String(), expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// issue #19
|
// issue #19
|
||||||
func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
|
func TestProcess_HTTPRequestsHaveTimeToFinish(t *testing.T) {
|
||||||
if testing.Short() {
|
if testing.Short() {
|
||||||
|
|||||||
@@ -69,6 +69,19 @@ func New(config *Config) *ProxyManager {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// see: https://github.com/mostlygeek/llama-swap/issues/42
|
||||||
|
// respond with permissive OPTIONS for any endpoint
|
||||||
|
pm.ginEngine.Use(func(c *gin.Context) {
|
||||||
|
if c.Request.Method == "OPTIONS" {
|
||||||
|
c.Header("Access-Control-Allow-Origin", "*")
|
||||||
|
c.Header("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
|
||||||
|
c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")
|
||||||
|
c.AbortWithStatus(204)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Next()
|
||||||
|
})
|
||||||
|
|
||||||
// Set up routes using the Gin engine
|
// Set up routes using the Gin engine
|
||||||
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
pm.ginEngine.POST("/v1/chat/completions", pm.proxyOAIHandler)
|
||||||
// Support legacy /v1/completions api, see issue #12
|
// Support legacy /v1/completions api, see issue #12
|
||||||
@@ -202,6 +215,21 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
|
|||||||
return nil, fmt.Errorf("could not find modelID for %s", requestedModel)
|
return nil, fmt.Errorf("could not find modelID for %s", requestedModel)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check if model is part of the profile
|
||||||
|
if profileName != "" {
|
||||||
|
found := false
|
||||||
|
for _, item := range pm.config.Profiles[profileName] {
|
||||||
|
if item == realModelName {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !found {
|
||||||
|
return nil, fmt.Errorf("model %s part of profile %s", realModelName, profileName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// exit early when already running, otherwise stop everything and swap
|
// exit early when already running, otherwise stop everything and swap
|
||||||
requestedProcessKey := ProcessKeyName(profileName, realModelName)
|
requestedProcessKey := ProcessKeyName(profileName, realModelName)
|
||||||
|
|
||||||
|
|||||||
@@ -210,3 +210,47 @@ func TestProxyManager_ListModelsHandler(t *testing.T) {
|
|||||||
// Ensure all expected models were returned
|
// Ensure all expected models were returned
|
||||||
assert.Empty(t, expectedModels, "not all expected models were returned")
|
assert.Empty(t, expectedModels, "not all expected models were returned")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestProxyManager_ProfileNonMember(t *testing.T) {
|
||||||
|
|
||||||
|
model1 := "path1/model1"
|
||||||
|
model2 := "path2/model2"
|
||||||
|
|
||||||
|
profileMemberName := ProcessKeyName("test", model1)
|
||||||
|
profileNonMemberName := ProcessKeyName("test", model2)
|
||||||
|
|
||||||
|
config := &Config{
|
||||||
|
HealthCheckTimeout: 15,
|
||||||
|
Models: map[string]ModelConfig{
|
||||||
|
model1: getTestSimpleResponderConfig("model1"),
|
||||||
|
model2: getTestSimpleResponderConfig("model2"),
|
||||||
|
},
|
||||||
|
Profiles: map[string][]string{
|
||||||
|
"test": {model1},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
proxy := New(config)
|
||||||
|
defer proxy.StopProcesses()
|
||||||
|
|
||||||
|
// actual member of profile
|
||||||
|
{
|
||||||
|
reqBody := fmt.Sprintf(`{"model":"%s"}`, profileMemberName)
|
||||||
|
req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
|
||||||
|
proxy.HandlerFunc(w, req)
|
||||||
|
assert.Equal(t, http.StatusOK, w.Code)
|
||||||
|
assert.Contains(t, w.Body.String(), "model1")
|
||||||
|
}
|
||||||
|
|
||||||
|
// actual model, but non-member will 404
|
||||||
|
{
|
||||||
|
reqBody := fmt.Sprintf(`{"model":"%s"}`, profileNonMemberName)
|
||||||
|
req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
|
||||||
|
proxy.HandlerFunc(w, req)
|
||||||
|
assert.Equal(t, http.StatusNotFound, w.Code)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user