Compare commits

...

4 Commits

Author SHA1 Message Date
Benson Wong 25f251699c Prevent StateFailed after SIGKILL (#129)
Closes #125
2025-05-14 10:47:35 -07:00
Benson Wong 7f37bcc6eb Improve testing around using SIGKILL (#127)
* Add test for SIGKILL of process
* silent TestProxyManager_RunningEndpoint debug output
* Ref #125
2025-05-13 21:21:52 -07:00
Benson Wong 519c3a4d22 Change /unload to not wait for inflight requests (#125)
Sometimes upstreams can accept HTTP but never respond causing requests
to build up waiting for a response. This can block Process.Stop() as
that waits for inflight requests to finish. This change refactors the
code to not wait when attempting to shutdown the process.
2025-05-13 11:39:19 -07:00
Benson Wong 9dc4bcb46c Add a concurrency limit to Process.ProxyRequest (#123) 2025-05-12 18:12:52 -07:00
11 changed files with 228 additions and 34 deletions
+2 -2
View File
@@ -20,10 +20,10 @@ clean:
rm -rf $(BUILD_DIR)
test:
go test -short -v ./proxy
go test -short -v -count=1 ./proxy
test-all:
go test -v ./proxy
go test -v -count=1 ./proxy
# Build OSX binary
mac:
+1 -1
View File
@@ -84,7 +84,7 @@ func main() {
case newManager := <-reloadChan:
log.Println("Config change detected, waiting for in-flight requests to complete...")
// Stop old manager processes gracefully (this waits for in-flight requests)
currentManager.StopProcesses()
currentManager.StopProcesses(proxy.StopWaitForInflightRequest)
// Now do a full shutdown to clear the process map
currentManager.Shutdown()
currentManager = newManager
+34 -3
View File
@@ -26,6 +26,8 @@ func main() {
silent := flag.Bool("silent", false, "disable all logging")
ignoreSigTerm := flag.Bool("ignore-sig-term", false, "ignore SIGTERM signal")
flag.Parse() // Parse the command-line flags
// Create a new Gin router
@@ -190,6 +192,10 @@ func main() {
log.SetOutput(io.Discard)
}
if !*silent {
fmt.Printf("My PID: %d\n", os.Getpid())
}
go func() {
log.Printf("simple-responder listening on %s\n", address)
// service connections
@@ -200,11 +206,36 @@ func main() {
// Wait for interrupt signal to gracefully shutdown the server with
// a timeout of 5 seconds.
quit := make(chan os.Signal, 1)
sigChan := make(chan os.Signal, 1)
// kill (no param) default send syscall.SIGTERM
// kill -2 is syscall.SIGINT
// kill -9 is syscall.SIGKILL but can't be catch, so don't need add it
signal.Notify(quit, syscall.SIGINT, syscall.SIGTERM)
<-quit
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
countSigInt := 0
runloop:
for {
signal := <-sigChan
switch signal {
case syscall.SIGINT:
countSigInt++
if countSigInt > 1 {
break runloop
} else {
log.Println("Recieved SIGINT, send another SIGINT to shutdown")
}
case syscall.SIGTERM:
if *ignoreSigTerm {
log.Println("Ignoring SIGTERM")
} else {
log.Println("Recieved SIGTERM, shutting down")
break runloop
}
default:
break runloop
}
}
log.Println("simple-responder shutting down")
}
+3
View File
@@ -23,6 +23,9 @@ type ModelConfig struct {
UnloadAfter int `yaml:"ttl"`
Unlisted bool `yaml:"unlisted"`
UseModelName string `yaml:"useModelName"`
// Limit concurrency of HTTP requests to process
ConcurrencyLimit int `yaml:"concurrencyLimit"`
}
func (m *ModelConfig) SanitizedCommand() ([]string, error) {
+6 -2
View File
@@ -48,14 +48,18 @@ func getSimpleResponderPath() string {
return filepath.Join("..", "build", fmt.Sprintf("simple-responder_%s_%s", goos, goarch))
}
func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
func getTestPort() int {
portMutex.Lock()
defer portMutex.Unlock()
port := nextTestPort
nextTestPort++
return getTestSimpleResponderConfigPort(expectedMessage, port)
return port
}
func getTestSimpleResponderConfig(expectedMessage string) ModelConfig {
return getTestSimpleResponderConfigPort(expectedMessage, getTestPort())
}
func getTestSimpleResponderConfigPort(expectedMessage string, port int) ModelConfig {
+55 -3
View File
@@ -30,6 +30,13 @@ const (
StateShutdown ProcessState = ProcessState("shutdown")
)
type StopStrategy int
const (
StopImmediately StopStrategy = iota
StopWaitForInflightRequest
)
type Process struct {
ID string
config ModelConfig
@@ -57,10 +64,22 @@ type Process struct {
// for managing shutdown state
shutdownCtx context.Context
shutdownCancel context.CancelFunc
// for managing concurrency limits
concurrencyLimitSemaphore chan struct{}
// stop timeout waiting for graceful shutdown
gracefulStopTimeout time.Duration
}
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
ctx, cancel := context.WithCancel(context.Background())
concurrentLimit := 10
if config.ConcurrencyLimit > 0 {
concurrentLimit = config.ConcurrencyLimit
} else {
proxyLogger.Debugf("Concurrency limit for model %s not set, defaulting to 10", ID)
}
return &Process{
ID: ID,
config: config,
@@ -73,6 +92,12 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
state: StateStopped,
shutdownCtx: ctx,
shutdownCancel: cancel,
// concurrency limit
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
// stop timeout
gracefulStopTimeout: 5 * time.Second,
}
}
@@ -189,6 +214,12 @@ func (p *Process) start() error {
go func() {
exitErr := p.cmd.Wait()
p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
if exitErr != nil && exitErr.Error() == "signal: killed" {
return
}
p.cmdWaitChan <- exitErr
}()
@@ -301,13 +332,25 @@ func (p *Process) start() error {
}
}
// Stop will wait for inflight requests to complete before stopping the process.
func (p *Process) Stop() {
if !isValidTransition(p.CurrentState(), StateStopping) {
return
}
// wait for any inflight requests before proceeding
p.proxyLogger.Debugf("<%s> Stop(): Waiting for inflight requests to complete", p.ID)
p.inFlightRequests.Wait()
p.StopImmediately()
}
// StopImmediately will transition the process to the stopping state and stop the process with a SIGTERM.
// If the process does not stop within the specified timeout, it will be forcefully stopped with a SIGKILL.
func (p *Process) StopImmediately() {
if !isValidTransition(p.CurrentState(), StateStopping) {
return
}
p.proxyLogger.Debugf("<%s> Stopping process", p.ID)
// calling Stop() when state is invalid is a no-op
@@ -317,7 +360,7 @@ func (p *Process) Stop() {
}
// stop the process with a graceful exit timeout
p.stopCommand(5 * time.Second)
p.stopCommand(p.gracefulStopTimeout)
if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
@@ -326,10 +369,11 @@ func (p *Process) Stop() {
// Shutdown is called when llama-swap is shutting down. It will give a little bit
// of time for any inflight requests to complete before shutting down. If the Process
// is in the state of starting, it will cancel it and shut it down
// is in the state of starting, it will cancel it and shut it down. Once a process is in
// the StateShutdown state, it can not be started again.
func (p *Process) Shutdown() {
p.shutdownCancel()
p.stopCommand(5 * time.Second)
p.stopCommand(p.gracefulStopTimeout)
p.state = StateShutdown
}
@@ -417,6 +461,14 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
return
}
select {
case p.concurrencyLimitSemaphore <- struct{}{}:
defer func() { <-p.concurrencyLimitSemaphore }()
default:
http.Error(w, "Too many requests", http.StatusTooManyRequests)
return
}
p.inFlightRequests.Add(1)
defer func() {
p.lastRequestHandled = time.Now()
+103
View File
@@ -340,3 +340,106 @@ func TestProcess_ExitInterruptsHealthCheck(t *testing.T) {
assert.Equal(t, "upstream command exited prematurely but successfully", err.Error())
assert.Equal(t, process.CurrentState(), StateFailed)
}
func TestProcess_ConcurrencyLimit(t *testing.T) {
if testing.Short() {
t.Skip("skipping long concurrency limit test")
}
expectedMessage := "concurrency_limit_test"
config := getTestSimpleResponderConfig(expectedMessage)
// only allow 1 concurrent request at a time
config.ConcurrencyLimit = 1
process := NewProcess("ttl_test", 2, config, debugLogger, debugLogger)
assert.Equal(t, 1, cap(process.concurrencyLimitSemaphore))
defer process.Stop()
// launch a goroutine first to take up the semaphore
go func() {
req1 := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=75ms", nil)
w := httptest.NewRecorder()
process.ProxyRequest(w, req1)
assert.Equal(t, http.StatusOK, w.Code)
}()
// let the goroutine start
<-time.After(time.Millisecond * 25)
denied := httptest.NewRequest("GET", "/test", nil)
w := httptest.NewRecorder()
process.ProxyRequest(w, denied)
assert.Equal(t, http.StatusTooManyRequests, w.Code)
}
func TestProcess_StopImmediately(t *testing.T) {
expectedMessage := "test_stop_immediate"
config := getTestSimpleResponderConfig(expectedMessage)
process := NewProcess("stop_immediate", 2, config, debugLogger, debugLogger)
defer process.Stop()
err := process.start()
assert.Nil(t, err)
assert.Equal(t, process.CurrentState(), StateReady)
go func() {
// slow, but will get killed by StopImmediate
req := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=1s", nil)
w := httptest.NewRecorder()
process.ProxyRequest(w, req)
}()
<-time.After(time.Millisecond)
process.StopImmediately()
assert.Equal(t, process.CurrentState(), StateStopped)
}
// Test that SIGKILL is sent when gracefulStopTimeout is reached and properly terminates
// the upstream command
func TestProcess_ForceStopWithKill(t *testing.T) {
expectedMessage := "test_sigkill"
binaryPath := getSimpleResponderPath()
port := getTestPort()
config := ModelConfig{
// note --ignore-sig-term which ignores the SIGTERM signal so a SIGKILL must be sent
// to force the process to exit
Cmd: fmt.Sprintf("%s --port %d --respond %s --silent --ignore-sig-term", binaryPath, port, expectedMessage),
Proxy: fmt.Sprintf("http://127.0.0.1:%d", port),
CheckEndpoint: "/health",
}
process := NewProcess("stop_immediate", 2, config, debugLogger, debugLogger)
defer process.Stop()
// reduce to make testing go faster
process.gracefulStopTimeout = time.Second
err := process.start()
assert.Nil(t, err)
assert.Equal(t, process.CurrentState(), StateReady)
waitChan := make(chan struct{})
go func() {
// slow, but will get killed by StopImmediate
req := httptest.NewRequest("GET", "/slow-respond?echo=12345&delay=2s", nil)
w := httptest.NewRecorder()
process.ProxyRequest(w, req)
// StatusOK because that was already sent before the kill
assert.Equal(t, http.StatusOK, w.Code)
// unexpected EOF because the kill happened, the "1" is sent before the kill
// then the unexpected EOF is sent after the kill
assert.Equal(t, "1unexpected EOF\n", w.Body.String())
close(waitChan)
}()
<-time.After(time.Millisecond)
process.StopImmediately()
assert.Equal(t, process.CurrentState(), StateStopped)
// the request should have been interrupted by SIGKILL
<-waitChan
}
+7 -6
View File
@@ -76,14 +76,10 @@ func (pg *ProcessGroup) HasMember(modelName string) bool {
return slices.Contains(pg.config.Groups[pg.id].Members, modelName)
}
func (pg *ProcessGroup) StopProcesses() {
func (pg *ProcessGroup) StopProcesses(strategy StopStrategy) {
pg.Lock()
defer pg.Unlock()
pg.stopProcesses()
}
// stopProcesses stops all processes in the group
func (pg *ProcessGroup) stopProcesses() {
if len(pg.processes) == 0 {
return
}
@@ -94,7 +90,12 @@ func (pg *ProcessGroup) stopProcesses() {
wg.Add(1)
go func(process *Process) {
defer wg.Done()
process.Stop()
switch strategy {
case StopImmediately:
process.StopImmediately()
default:
process.Stop()
}
}(process)
}
wg.Wait()
+2 -2
View File
@@ -46,7 +46,7 @@ func TestProcessGroup_HasMember(t *testing.T) {
func TestProcessGroup_ProxyRequestSwapIsTrue(t *testing.T) {
pg := NewProcessGroup("G1", processGroupTestConfig, testLogger, testLogger)
defer pg.StopProcesses()
defer pg.StopProcesses(StopWaitForInflightRequest)
tests := []string{"model1", "model2"}
@@ -74,7 +74,7 @@ func TestProcessGroup_ProxyRequestSwapIsTrue(t *testing.T) {
func TestProcessGroup_ProxyRequestSwapIsFalse(t *testing.T) {
pg := NewProcessGroup("G2", processGroupTestConfig, testLogger, testLogger)
defer pg.StopProcesses()
defer pg.StopProcesses(StopWaitForInflightRequest)
tests := []string{"model3", "model4"}
+4 -4
View File
@@ -208,7 +208,7 @@ func (pm *ProxyManager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// This is the public method safe for concurrent calls.
// Unlike Shutdown, this method only stops the processes but doesn't perform
// a complete shutdown, allowing for process replacement without full termination.
func (pm *ProxyManager) StopProcesses() {
func (pm *ProxyManager) StopProcesses(strategy StopStrategy) {
pm.Lock()
defer pm.Unlock()
@@ -218,7 +218,7 @@ func (pm *ProxyManager) StopProcesses() {
wg.Add(1)
go func(processGroup *ProcessGroup) {
defer wg.Done()
processGroup.stopProcesses()
processGroup.StopProcesses(strategy)
}(processGroup)
}
@@ -260,7 +260,7 @@ func (pm *ProxyManager) swapProcessGroup(requestedModel string) (*ProcessGroup,
pm.proxyLogger.Debugf("Exclusive mode for group %s, stopping other process groups", processGroup.id)
for groupId, otherGroup := range pm.processGroups {
if groupId != processGroup.id && !otherGroup.persistent {
otherGroup.StopProcesses()
otherGroup.StopProcesses(StopWaitForInflightRequest)
}
}
}
@@ -504,7 +504,7 @@ func (pm *ProxyManager) sendErrorResponse(c *gin.Context, statusCode int, messag
}
func (pm *ProxyManager) unloadAllModelsHandler(c *gin.Context) {
pm.StopProcesses()
pm.StopProcesses(StopImmediately)
c.String(http.StatusOK, "OK")
}
+11 -11
View File
@@ -27,7 +27,7 @@ func TestProxyManager_SwapProcessCorrectly(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
for _, modelName := range []string{"model1", "model2"} {
reqBody := fmt.Sprintf(`{"model":"%s"}`, modelName)
@@ -63,7 +63,7 @@ func TestProxyManager_SwapMultiProcess(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
tests := []string{"model1", "model2"}
for _, requestedModel := range tests {
@@ -105,7 +105,7 @@ func TestProxyManager_PersistentGroupsAreNotSwapped(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
// make requests to load all models, loading model1 should not affect model2
tests := []string{"model2", "model1"}
@@ -141,7 +141,7 @@ func TestProxyManager_SwapMultiProcessParallelRequests(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
results := map[string]string{}
@@ -339,7 +339,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
"model1": getTestSimpleResponderConfig("model1"),
"model2": getTestSimpleResponderConfig("model2"),
},
LogLevel: "debug",
LogLevel: "warn",
})
// Define a helper struct to parse the JSON response.
@@ -352,7 +352,7 @@ func TestProxyManager_RunningEndpoint(t *testing.T) {
// Create proxy once for all tests
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
t.Run("no models loaded", func(t *testing.T) {
req := httptest.NewRequest("GET", "/running", nil)
@@ -407,7 +407,7 @@ func TestProxyManager_AudioTranscriptionHandler(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
// Create a buffer with multipart form data
var b bytes.Buffer
@@ -461,7 +461,7 @@ func TestProxyManager_UseModelName(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
requestedModel := "model1"
@@ -557,7 +557,7 @@ func TestProxyManager_CORSOptionsHandler(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
req := httptest.NewRequest(tt.method, "/v1/chat/completions", nil)
for k, v := range tt.requestHeaders {
@@ -586,7 +586,7 @@ func TestProxyManager_Upstream(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
req := httptest.NewRequest("GET", "/upstream/model1/test", nil)
rec := httptest.NewRecorder()
proxy.ServeHTTP(rec, req)
@@ -604,7 +604,7 @@ func TestProxyManager_ChatContentLength(t *testing.T) {
})
proxy := New(config)
defer proxy.StopProcesses()
defer proxy.StopProcesses(StopWaitForInflightRequest)
reqBody := fmt.Sprintf(`{"model":"%s", "x": "this is just some content to push the length out a bit"}`, "model1")
req := httptest.NewRequest("POST", "/v1/chat/completions", bytes.NewBufferString(reqBody))