ui: improve manual model load and cancel (#847)

- When a model is manually loaded show a cancel buttton and a queued
status
- Implement cancellation in scheduler.Scheduler interface and FIFO
scheduler
- Add cache bust query parameter to bypass browser cache

Fixes #844
This commit is contained in:
Benson Wong
2026-06-14 13:38:10 -07:00
committed by GitHub
parent 92b90447e8
commit ed77385d08
7 changed files with 193 additions and 6 deletions
+14
View File
@@ -54,6 +54,7 @@ type baseRouter struct {
procCancel context.CancelFunc
handlerCh chan scheduler.HandlerReq
cancelCh chan scheduler.HandlerReq
shutdownCh chan shutdownReq
unloadCh chan unloadReq
swapDoneCh chan scheduler.SwapDone
@@ -88,6 +89,7 @@ func newBaseRouter(
procCtx: procCtx,
procCancel: procCancel,
handlerCh: make(chan scheduler.HandlerReq),
cancelCh: make(chan scheduler.HandlerReq),
shutdownCh: make(chan shutdownReq),
unloadCh: make(chan unloadReq),
swapDoneCh: make(chan scheduler.SwapDone),
@@ -117,6 +119,10 @@ func (b *baseRouter) run() {
b.schedule.OnRequest(req)
b.notifyProcessed()
case req := <-b.cancelCh:
b.schedule.OnCancel(req)
b.notifyProcessed()
case req := <-b.unloadCh:
b.schedule.OnUnload(req.targets, req.timeout)
close(req.respond)
@@ -473,6 +479,14 @@ func (b *baseRouter) ServeHTTP(w http.ResponseWriter, req *http.Request) {
finishLoading()
case <-req.Context().Done():
finishLoading()
// Notify the scheduler so it can prune this request from its queue
// and swap waiters. Without this, a queued request whose client left
// would sit in the scheduler until drainQueue eventually starts a
// wasted model load for it.
select {
case b.cancelCh <- hr:
case <-b.shutdownCtx.Done():
}
return
case <-b.shutdownCtx.Done():
finishLoading()
+40
View File
@@ -116,6 +116,46 @@ func (s *FIFO) OnRequest(req HandlerReq) {
s.startSwap(req, evict, running)
}
// OnCancel removes a request whose client has disconnected from the queue and
// from every in-flight swap's waiters. If the request was the sole waiter of an
// active swap, the swap goroutine is left to complete on its own — OnSwapDone
// will find no waiters and simply clean up. This prevents drainQueue from ever
// starting a model load for a caller that is no longer there.
func (s *FIFO) OnCancel(req HandlerReq) {
removed := false
// Prune from the queue.
if len(s.queued) > 0 {
kept := s.queued[:0]
for _, q := range s.queued {
if q.Respond == req.Respond {
removed = true
continue
}
kept = append(kept, q)
}
s.queued = kept
}
// Prune from any active swap's waiters.
for _, sw := range s.active {
filtered := sw.waiters[:0]
for _, w := range sw.waiters {
if w.Respond == req.Respond {
removed = true
continue
}
filtered = append(filtered, w)
}
sw.waiters = filtered
}
if removed {
s.logger.Debugf("%s: cancelled request for model %s pruned from scheduler", s.name, req.Model)
broadcastQueuePositions(s.queued)
}
}
// OnSwapDone fans the result out to every waiter that joined this swap, removes
// the swap from the active map, then walks the queue once, promoting any items
// that no longer collide with the remaining active set. FIFO order is preserved:
+96
View File
@@ -143,6 +143,15 @@ func newFIFO(planner Swapper, eff Effects) *FIFO {
func req(model string) HandlerReq { return HandlerReq{Model: model} }
// reqCh creates a HandlerReq with a unique Respond channel so OnCancel can
// identify it among queued requests and swap waiters.
func reqCh(model string) HandlerReq {
return HandlerReq{
Model: model,
Respond: make(chan HandlerResp, 1),
}
}
func TestFIFO_FastPath(t *testing.T) {
eff := newFakeEffects()
eff.states["a"] = process.StateReady
@@ -535,3 +544,90 @@ func TestFIFO_PriorityQueueOrder(t *testing.T) {
}
}
}
// TestFIFO_OnCancel_QueuedRequest verifies that cancelling a queued request
// prevents drainQueue from ever starting a model load for it. Without OnCancel
// the dead request would sit in the queue until a drain triggers a wasted swap.
func TestFIFO_OnCancel_QueuedRequest(t *testing.T) {
eff := newFakeEffects()
eff.states["a"] = process.StateStopped
eff.states["b"] = process.StateStopped
// b evicts a, so a request for b queues while a is loading.
s := newFIFO(&stubPlanner{evict: map[string][]string{"b": {"a"}}}, eff)
s.OnRequest(req("a")) // StartSwap(a)
cancelledReq := reqCh("b")
s.OnRequest(cancelledReq) // queued (collides with a's in-flight swap)
if len(s.queued) != 1 {
t.Fatalf("queue len=%d want 1 before cancel", len(s.queued))
}
// Client disconnects.
s.OnCancel(cancelledReq)
if len(s.queued) != 0 {
t.Fatalf("queue len=%d want 0 after cancel", len(s.queued))
}
// a's swap finishes; drainQueue runs but b is gone — no swap for b.
eff.states["a"] = process.StateReady
s.OnSwapDone(SwapDone{ModelID: "a"})
if got := eff.startsFor("b"); got != 0 {
t.Errorf("StartSwap(b)=%d want 0 (cancelled request should not trigger a load)", got)
}
}
// TestFIFO_OnCancel_SwapWaiter verifies that cancelling a request that joined an
// in-flight swap removes it from the waiter list. When the swap completes, the
// cancelled waiter receives no grant and does not bump the in-flight count.
func TestFIFO_OnCancel_SwapWaiter(t *testing.T) {
eff := newFakeEffects()
eff.states["a"] = process.StateStopped
s := newFIFO(&stubPlanner{}, eff)
liveReq := reqCh("a")
cancelledReq := reqCh("a")
s.OnRequest(liveReq) // starts swap
s.OnRequest(cancelledReq) // joins
if sw := s.active["a"]; len(sw.waiters) != 2 {
t.Fatalf("waiters=%d want 2", len(sw.waiters))
}
s.OnCancel(cancelledReq)
if sw := s.active["a"]; len(sw.waiters) != 1 {
t.Fatalf("waiters=%d want 1 after cancel", len(sw.waiters))
}
// Swap finishes: only the live waiter is granted.
eff.states["a"] = process.StateReady
s.OnSwapDone(SwapDone{ModelID: "a"})
if got := eff.served("a"); got != 1 {
t.Errorf("served(a)=%d want 1 (only the non-cancelled waiter)", got)
}
}
// TestFIFO_OnCancel_NotPresent is a no-op: cancelling a request that was already
// granted (and is no longer queued or waiting) must not affect anything.
func TestFIFO_OnCancel_NotPresent(t *testing.T) {
eff := newFakeEffects()
eff.states["a"] = process.StateReady
s := newFIFO(&stubPlanner{}, eff)
r := reqCh("a")
s.OnRequest(r) // fast-path served immediately
// Cancel after grant — should be a harmless no-op.
s.OnCancel(r)
if got := eff.served("a"); got != 1 {
t.Errorf("served(a)=%d want 1 (cancel of granted request is a no-op)", got)
}
if len(s.queued) != 0 {
t.Errorf("queue should be empty, len=%d", len(s.queued))
}
}
+5
View File
@@ -47,6 +47,11 @@ type Swapper interface {
type Scheduler interface {
// OnRequest handles one incoming ServeHTTP request.
OnRequest(req HandlerReq)
// OnCancel handles a request whose client has disconnected before it was
// granted. The scheduler must remove the request from its queue and from
// any in-flight swap's waiters so it never triggers a model load or grant
// for a caller that is no longer there.
OnCancel(req HandlerReq)
// OnSwapDone handles a swap goroutine reporting completion.
OnSwapDone(ev SwapDone)
// OnServeDone handles a tracked ServeHTTP finishing (in-flight decrement).
+30 -3
View File
@@ -6,6 +6,8 @@
let isUnloading = $state(false);
let menuOpen = $state(false);
let pendingLoads = $state<Record<string, boolean>>({});
const loadControllers = new Map<string, AbortController>();
const showUnlistedStore = persistentStore<boolean>("showUnlisted", true);
const showIdorNameStore = persistentStore<"id" | "name">("showIdorName", "id");
@@ -42,6 +44,25 @@
}
}
async function handleLoadModel(modelId: string): Promise<void> {
if (pendingLoads[modelId]) return;
const controller = new AbortController();
loadControllers.set(modelId, controller);
pendingLoads[modelId] = true;
try {
await loadModel(modelId, controller.signal);
} catch (e) {
console.error(e);
} finally {
loadControllers.delete(modelId);
delete pendingLoads[modelId];
}
}
function cancelLoad(modelId: string): void {
loadControllers.get(modelId)?.abort();
}
function toggleIdorName(): void {
showIdorNameStore.update((prev) => (prev === "name" ? "id" : "name"));
}
@@ -170,14 +191,20 @@
{/if}
</td>
<td class="w-12">
{#if model.state === "stopped"}
<button class="btn btn--sm" onclick={() => loadModel(model.id)}>Load</button>
{#if model.state === "stopped" && pendingLoads[model.id]}
<button class="btn btn--sm" onclick={() => cancelLoad(model.id)}>Cancel</button>
{:else if model.state === "stopped"}
<button class="btn btn--sm" onclick={() => handleLoadModel(model.id)}>Load</button>
{:else}
<button class="btn btn--sm" onclick={() => unloadSingleModel(model.id)} disabled={model.state !== "ready"}>Unload</button>
{/if}
</td>
<td class="w-20">
<span class="w-16 text-center status status--{model.state}">{model.state}</span>
{#if model.state === "stopped" && pendingLoads[model.id]}
<span class="w-16 text-center status status--queued">queued</span>
{:else}
<span class="w-16 text-center status status--{model.state}">{model.state}</span>
{/if}
</td>
</tr>
{/each}
+2 -1
View File
@@ -139,7 +139,8 @@
}
.status--starting,
.status--stopping {
.status--stopping,
.status--queued {
@apply bg-warning/10 text-warning;
}
+6 -2
View File
@@ -176,15 +176,19 @@ export async function unloadSingleModel(model: string): Promise<void> {
}
}
export async function loadModel(model: string): Promise<void> {
export async function loadModel(model: string, signal?: AbortSignal): Promise<void> {
try {
const response = await fetch(`/upstream/${model}/`, {
const response = await fetch(`/upstream/${model}/?_=${Date.now()}`, {
method: "GET",
signal,
});
if (!response.ok) {
throw new Error(`Failed to load model: ${response.status}`);
}
} catch (error) {
if (error instanceof DOMException && error.name === "AbortError") {
return;
}
console.error("Failed to load model:", error);
throw error;
}