P1: model layer (convar->config inversion) + llmmeta

Lifts mort's pkg/logic/llms into executus/model, decoupled from mort: - tiers.go: the tier resolver now reads a host-supplied config.Source under "model.tier.<name>" with host-supplied fallbacks (Configure(cfg, defaults, ttl)), instead of convar.Manager. Tier NAMES + specs are host config; the resolution mechanism (cache, reasoning-suffix dialect, chain validation) is generic. No tier names hard-coded in the harness. - sink.go: usage/trace recording inverted off mort's llmusage/llmtrace into UsageSink / TraceSink seams + a model-owned Span, with nil-safe context attribution helpers (WithModel/WithTraceID/WithUsageTool/WithUsageUser). Both sinks optional (nil = off) so a light host records nothing. - lane decoration repointed to executus/lane; utils.Errorf -> fmt.Errorf. - call.go keeps GenerateWith[T] (instrumented structured output) — this is the structured-output primitive; no separate structured/ package. - llmmeta moved over model/ (the meta-LLM helper: tier allowlist + JSON retry + ledger). Its tests configure a minimal tier table via TestMain. New tests cover the inversion: config overrides fallback, tier registration, reasoning-suffix survival, nested-tier rejection, nil-sink no-ops. Full module: go build/vet/test -race green; core go.sum still free of gorm/redis/discordgo/sqlite. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 19:47:13 -04:00
parent 741d7816ed
commit b424261aca
17 changed files with 3698 additions and 3 deletions
@@ -48,11 +48,12 @@ CORE (majordomo + stdlib):
  dispatchguard/  loop/depth/fan-out caps                [P0 ✓]
  pendingattach/  attachment dedupe                      [P0 ✓]
  tool/     registry + 3-stage permissions + ssrf        [P1 ✓]
-  model/    config-driven tier resolution over majordomo [P1]
-  llmmeta/  shared meta-LLM helper (moves with model/)    [P1]
+  model/    config-driven tier resolution over majordomo [P1 ✓]
+            (convar->config.Source; UsageSink/TraceSink seams; GenerateWith[T]
+             structured output — no separate structured/ pkg)
+  llmmeta/  shared meta-LLM helper over model/            [P1 ✓]
  compact/  context compactor (WithCompactor hook)       [P2]
  tools/{web,net,store,compose,meta,comms}  generic tools [P3]
-  structured/  Generate[T] convenience over majordomo    [P1]

 BATTERIES (opt-in siblings, each nil-safe + a default):
  persona/   Agent noun + AgentStore seam + yml loader   [P4]
@@ -4,5 +4,25 @@ go 1.26.2

 require (
 	gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3
+	github.com/google/uuid v1.6.0
 	golang.org/x/crypto v0.53.0
 )
+
+require (
+	cloud.google.com/go v0.116.0 // indirect
+	cloud.google.com/go/auth v0.9.3 // indirect
+	cloud.google.com/go/compute/metadata v0.5.0 // indirect
+	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
+	github.com/google/go-cmp v0.6.0 // indirect
+	github.com/google/s2a-go v0.1.8 // indirect
+	github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
+	github.com/gorilla/websocket v1.5.3 // indirect
+	go.opencensus.io v0.24.0 // indirect
+	golang.org/x/net v0.55.0 // indirect
+	golang.org/x/sys v0.46.0 // indirect
+	golang.org/x/text v0.38.0 // indirect
+	google.golang.org/genai v1.59.0 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect
+	google.golang.org/grpc v1.66.2 // indirect
+	google.golang.org/protobuf v1.34.2 // indirect
+)
@@ -1,4 +1,130 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+cloud.google.com/go v0.116.0 h1:B3fRrSDkLRt5qSHWe40ERJvhvnQwdZiHu0bJOpldweE=
+cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U=
+cloud.google.com/go/auth v0.9.3 h1:VOEUIAADkkLtyfr3BLa3R8Ed/j6w1jTBmARx+wb5w5U=
+cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842BgCsmTk=
+cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY=
+cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY=
 gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3 h1:KYKIFFRsXzbbBJVDa99+Fhy0zxl9G0xV/MCrLipsLL4=
 gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3/go.mod h1:UZLveG17SmENt4sne2RSLIbioix30RZbRIQUzBAnOyY=
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
+github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
+github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
+github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
+github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
+github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
+github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
+github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
+github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
+github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
+github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM=
+github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA=
+github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw=
+github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
+github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
+github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
+go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto=
 golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
+golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
+golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8=
+golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww=
+golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM=
+golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
+golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw=
+golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE=
+golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
+golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
+google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
+google.golang.org/genai v1.59.0 h1:xp+ydkJFW8hO0hTUaAkr8TrLM9HFP3NYAwFhPd0nDqA=
+google.golang.org/genai v1.59.0/go.mod h1:mDdPDFXo1Ats7f1WXVyZgWb/CkMzFWTWJruIMy7hGIU=
+google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
+google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU=
+google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
+google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
+google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
+google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
+google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
+google.golang.org/grpc v1.66.2 h1:3QdXkuq3Bkh7w+ywLdLvM56cmGvQHUMZpiCzt6Rqaoo=
+google.golang.org/grpc v1.66.2/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y=
+google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
+google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
+google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
+google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
+google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
+google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
+google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
+google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
@@ -0,0 +1,615 @@
+// Package llmmeta is the shared meta-LLM helper used by the v12
+// authoring tools (summarize, translate, extract_entities, classify).
+//
+// Why a dedicated package: each of those four tools makes "one fast-tier
+// LLM call → typed result", with shared concerns (tier allowlist,
+// ledger row, JSON-retry on malformed output). Centralising the pattern
+// stops every tool from re-implementing the surrounding bookkeeping and
+// keeps the audit trail uniform.
+//
+// The helper itself does NOT know about the four tools — it just exposes
+// a Call(ctx, CallSpec) → CallResult shape. Each tool builds its own
+// prompt + parses the typed result. The helper records the meta-call
+// ledger row on every call, success or failure.
+//
+// Concurrency / lanes: the helper resolves the tier to an llm.Model via
+// model.ParseModelForContext and uses model.Generate. Lane routing is
+// already baked in at the LLM transport layer (see
+// pkg/logic/llms/lane_transport.go) so each Generate call automatically
+// goes through the right lane without further plumbing. Usage recording
+// is automatic too: parsed models are instrumented by pkg/logic/llms,
+// so the helper does NOT call model.RecordUsage itself.
+//
+// Tier allowlist: convar `skills.llm_meta.allowed_tiers` (default
+// `["fast"]`) controls which tiers a meta-tool may use. A request for
+// a disallowed tier returns error_kind="tier_not_allowed" WITHOUT
+// making the call AND WITHOUT recording a ledger row (the call did
+// not happen).
+//
+// Test: helper_test.go covers tier allowed, tier rejected, JSON
+// retry path, malformed-twice path, and ledger-row emission semantics.
+package llmmeta
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"time"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"github.com/google/uuid"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/model"
+)
+
+// MetaCall is the domain row written to skill_llm_meta_calls on every
+// helper call.
+//
+// Why a dedicated table (not skill_run_logs): per-skill token
+// aggregation is cleaner with typed columns. Folding meta-calls into
+// the generic event log would force a SUM-from-JSON path on every
+// dashboard query.
+//
+// Why the field set is tight (no payload columns): the request bodies
+// can be 32KB+. The agent's main run already captures system_prompt
+// + user_message in the trace; storing them again here would double
+// the audit footprint with no diagnostic value (the meta-call's
+// inputs are derivable from the parent run's tool-call args).
+type MetaCall struct {
+	ID           string
+	RunID        string
+	SkillID      string
+	ToolName     string
+	TierUsed     string // "fast" / "standard"
+	ModelUsed    string // resolved provider/model
+	InputTokens  int
+	OutputTokens int
+	DurationMs   int
+	Success      bool
+	ErrorKind    string // empty on success; one of the sentinel kinds otherwise
+	CreatedAt    time.Time
+}
+
+// Storage is the narrow surface the helper uses to persist meta-call
+// ledger rows. Production wires a thin adapter around the skills GORM
+// storage; tests substitute a fake.
+//
+// Why an interface (vs depending on pkg/logic/skills.Storage): the
+// skills package imports skilltools (tool registry); having
+// skilltools/llmmeta depend back on skills would form an import
+// cycle. A narrow interface mirrored across the boundary is the
+// project's standard cycle-break pattern (see KVStorage / FileStorage
+// in pkg/skilltools/tools/).
+type Storage interface {
+	RecordMetaCall(ctx context.Context, call MetaCall) error
+}
+
+// ConvarReader is the narrow surface the helper uses to read
+// `skills.llm_meta.allowed_tiers`. The convar package is database-
+// backed; tests pass a static fake.
+//
+// Why an interface (vs reading convars directly): unit tests want to
+// fake the allowlist without spinning up a convar manager.
+type ConvarReader interface {
+	// AllowedTiers returns the list of tier names a meta-tool may use.
+	// Default ["fast"].
+	AllowedTiers(ctx context.Context) []string
+}
+
+// ConvarReaderFunc adapts a closure into a ConvarReader. Useful in
+// production wiring (mort.go) where the underlying access is a
+// single line of logic.
+type ConvarReaderFunc func(ctx context.Context) []string
+
+// AllowedTiers satisfies ConvarReader.
+func (f ConvarReaderFunc) AllowedTiers(ctx context.Context) []string {
+	if f == nil {
+		return []string{"fast"}
+	}
+	return f(ctx)
+}
+
+// Helper makes one fast-tier LLM call with surrounding bookkeeping
+// (tier allowlist, JSON retry, ledger row).
+//
+// Construct once at boot; all four meta-tools share the same Helper.
+type Helper struct {
+	storage Storage
+	convars ConvarReader
+}
+
+// New constructs a Helper. storage MUST be non-nil; passing nil makes
+// every Call write a no-op ledger row (callers that need a fully no-op
+// helper should instead avoid registering the tool).
+//
+// convars may be nil — the helper falls back to the default allowlist
+// `["fast"]`.
+//
+// Why a constructor with explicit deps (vs Helper{...} struct
+// initialiser): forces the deployment-time decision about which
+// dependencies are wired vs nil-safe at the construction call site,
+// not at the call site of each tool.
+func New(storage Storage, convars ConvarReader) *Helper {
+	return &Helper{
+		storage: storage,
+		convars: convars,
+	}
+}
+
+// CallSpec is the per-call input.
+//
+// Why every field is explicit (vs builder pattern): the four meta-tools
+// each populate the spec in one place; a struct literal at the call
+// site is more readable than chained setters.
+type CallSpec struct {
+	// Tier is the tier alias to use ("fast" / "standard"). Empty falls
+	// back to "fast". Disallowed tiers (per the convar allowlist) cause
+	// Call to return CallResult{Success: false, ErrorKind:
+	// "tier_not_allowed"} WITHOUT making the LLM call AND without
+	// writing a ledger row (the call did not happen).
+	Tier string
+
+	// SystemPrompt is the system message. May be empty.
+	SystemPrompt string
+
+	// UserPrompt is the user message. Required.
+	UserPrompt string
+
+	// MaxOutputTokens caps the response. 0 disables the cap (provider
+	// default). The helper uses this both to bound the cost estimate
+	// AND to set llm.WithMaxTokens on the request.
+	MaxOutputTokens int
+
+	// ResponseFormat is "text" or "json". When "json", the helper
+	// attempts to parse the response into JSON. Other values fall
+	// through as "text".
+	ResponseFormat string
+
+	// RetryOnMalformedJSON, when true and ResponseFormat=="json",
+	// retries the call ONCE with a stricter JSON-only prompt prefix
+	// when the first response fails to parse. Second-failure returns
+	// CallResult{Success: true, Parsed: nil, ErrorKind:
+	// "malformed_json"} so callers can fall back to result.Text.
+	RetryOnMalformedJSON bool
+
+	// ToolName is the meta-tool name recorded in the ledger row
+	// ("summarize", "translate", "extract_entities", "classify"). The
+	// helper does not branch on this value.
+	ToolName string
+
+	// RunID is the calling skill run ID. Recorded in the ledger row;
+	// also used by the cost-cap callback to find the running 7-day
+	// total.
+	RunID string
+
+	// SkillID is the calling skill ID. Recorded in the ledger row;
+	// passed to the cost-cap callback.
+	SkillID string
+
+	// CallerID is the Discord member ID that triggered the parent
+	// skill run. Passed to the cost-cap callback so the per-user
+	// 7-day cap can be evaluated.
+	CallerID string
+}
+
+// CallResult is the per-call output.
+//
+// Why text + parsed (vs only one): JSON-format calls expose both the
+// raw response (in .Text) and the parsed map (in .Parsed). Text-format
+// calls leave .Parsed nil. Callers requesting JSON that fails to parse
+// twice get .Text populated and ErrorKind="malformed_json" so they
+// can fall back to text-mode without an error path.
+type CallResult struct {
+	// Text is the raw response text from the LLM. Populated on every
+	// successful call (success=true) AND when JSON parsing failed
+	// twice (success=true, parsed=nil, error_kind="malformed_json").
+	// Empty on tier_not_allowed rejections (no LLM call happened).
+	Text string
+
+	// Parsed is the JSON-decoded response. nil for text-format calls,
+	// nil for failed JSON parses, populated for successful JSON
+	// responses. The interior shape is whatever the LLM returned; the
+	// caller is responsible for asserting a typed view.
+	Parsed any
+
+	// InputTokens is the tokens billed against the input. 0 when the
+	// provider didn't surface usage.
+	InputTokens int
+
+	// OutputTokens is the tokens billed against the output. 0 when the
+	// provider didn't surface usage.
+	OutputTokens int
+
+	// DurationMs is wall-clock duration of the LLM call (or call+retry
+	// in the JSON-retry case).
+	DurationMs int
+
+	// ModelUsed is the resolved provider/model string ("anthropic/
+	// claude-haiku-4-5-20251001"). Populated on every actual LLM call;
+	// empty on tier_not_allowed rejections.
+	ModelUsed string
+
+	// Success reports whether the LLM call returned a usable response.
+	// True on happy-path AND on malformed-json second-failure (the
+	// caller can fall back to .Text). False on transport errors,
+	// tier_not_allowed, llm_unavailable.
+	Success bool
+
+	// ErrorKind, when non-empty, is one of:
+	//   - "tier_not_allowed" → no call, no ledger row
+	//   - "llm_unavailable"  → call attempted, ledger row written
+	//   - "malformed_json"   → call succeeded but JSON parse failed
+	ErrorKind string
+}
+
+// Sentinel error_kind values for CallResult.ErrorKind.
+const (
+	ErrorKindTierNotAllowed = "tier_not_allowed"
+	ErrorKindLLMUnavailable = "llm_unavailable"
+	ErrorKindMalformedJSON  = "malformed_json"
+)
+
+// Call performs the meta-LLM call and returns a typed CallResult.
+//
+// Why no error return (vs an error second value): every meaningful
+// failure is captured as a CallResult.ErrorKind so the caller's branch
+// logic stays single-pathed. Internal transport errors are surfaced
+// as ErrorKind=llm_unavailable. The function only returns a non-nil
+// error for argument-validation failures (empty UserPrompt) — a
+// programmer error the caller would have to fix anyway.
+//
+// Test: helper_test.go covers all outcomes (tier_not_allowed, happy
+// text, happy json, malformed_json retry-pass, malformed_json
+// retry-fail, llm_unavailable).
+func (h *Helper) Call(ctx context.Context, spec CallSpec) (CallResult, error) {
+	if strings.TrimSpace(spec.UserPrompt) == "" {
+		return CallResult{}, fmt.Errorf("llmmeta: user_prompt required")
+	}
+	tier := strings.TrimSpace(spec.Tier)
+	if tier == "" {
+		tier = "fast"
+	}
+
+	// Tier allowlist: rejected tiers do NOT make the call AND do NOT
+	// record a ledger row.
+	if !h.tierAllowed(ctx, tier) {
+		return CallResult{
+			Success:   false,
+			ErrorKind: ErrorKindTierNotAllowed,
+		}, nil
+	}
+
+	resolvedModel := model.ResolveModelName(tier)
+
+	// Resolve model. ParseModelForContext attaches the resolved model
+	// name to ctx (for usage attribution) AND returns the llm.Model
+	// whose Generate already routes through the lane wrapper.
+	ctx, model, err := model.ParseModelForContext(ctx, tier)
+	if err != nil {
+		// Tier convar mis-set: surface as tier_not_allowed to the
+		// caller (the agent's recovery path is the same as for an
+		// admin-disabled tier) but DO record the failure for the
+		// admin who needs to fix the convar.
+		h.recordLedger(ctx, MetaCall{
+			ID:        uuid.NewString(),
+			RunID:     spec.RunID,
+			SkillID:   spec.SkillID,
+			ToolName:  spec.ToolName,
+			TierUsed:  tier,
+			ModelUsed: resolvedModel,
+			Success:   false,
+			ErrorKind: ErrorKindTierNotAllowed,
+			CreatedAt: time.Now(),
+		})
+		return CallResult{
+			Success:   false,
+			ErrorKind: ErrorKindTierNotAllowed,
+		}, nil
+	}
+
+	// First call.
+	start := time.Now()
+	systemPrompt := spec.SystemPrompt
+	userMessage := spec.UserPrompt
+	opts := []llm.Option{}
+	if spec.MaxOutputTokens > 0 {
+		opts = append(opts, llm.WithMaxTokens(spec.MaxOutputTokens))
+	}
+	text, usage, llmErr := h.complete(ctx, model, systemPrompt, userMessage, opts)
+	if llmErr != nil {
+		duration := int(time.Since(start) / time.Millisecond)
+		h.recordLedger(ctx, MetaCall{
+			ID:           uuid.NewString(),
+			RunID:        spec.RunID,
+			SkillID:      spec.SkillID,
+			ToolName:     spec.ToolName,
+			TierUsed:     tier,
+			ModelUsed:    resolvedModel,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+			DurationMs:   duration,
+			Success:      false,
+			ErrorKind:    ErrorKindLLMUnavailable,
+			CreatedAt:    time.Now(),
+		})
+		return CallResult{
+			Success:      false,
+			ErrorKind:    ErrorKindLLMUnavailable,
+			ModelUsed:    resolvedModel,
+			DurationMs:   duration,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+		}, nil
+	}
+
+	// Determine outcome based on response format.
+	parsed, parsedOK := tryParseJSON(text, spec.ResponseFormat)
+	wantJSON := strings.EqualFold(spec.ResponseFormat, "json")
+
+	if !wantJSON || parsedOK {
+		// Happy path (text mode OR JSON mode that parsed first try).
+		duration := int(time.Since(start) / time.Millisecond)
+		h.recordLedger(ctx, MetaCall{
+			ID:           uuid.NewString(),
+			RunID:        spec.RunID,
+			SkillID:      spec.SkillID,
+			ToolName:     spec.ToolName,
+			TierUsed:     tier,
+			ModelUsed:    resolvedModel,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+			DurationMs:   duration,
+			Success:      true,
+			CreatedAt:    time.Now(),
+		})
+		return CallResult{
+			Text:         text,
+			Parsed:       parsed,
+			Success:      true,
+			ModelUsed:    resolvedModel,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+			DurationMs:   duration,
+		}, nil
+	}
+
+	// JSON requested but first response failed to parse.
+	if !spec.RetryOnMalformedJSON {
+		duration := int(time.Since(start) / time.Millisecond)
+		h.recordLedger(ctx, MetaCall{
+			ID:           uuid.NewString(),
+			RunID:        spec.RunID,
+			SkillID:      spec.SkillID,
+			ToolName:     spec.ToolName,
+			TierUsed:     tier,
+			ModelUsed:    resolvedModel,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+			DurationMs:   duration,
+			Success:      true,
+			ErrorKind:    ErrorKindMalformedJSON,
+			CreatedAt:    time.Now(),
+		})
+		return CallResult{
+			Text:         text,
+			Success:      true,
+			ErrorKind:    ErrorKindMalformedJSON,
+			ModelUsed:    resolvedModel,
+			InputTokens:  usage.InputTokens,
+			OutputTokens: usage.OutputTokens,
+			DurationMs:   duration,
+		}, nil
+	}
+
+	// Retry once with stricter JSON-only prompt prefix.
+	stricterPrompt := "Return ONLY valid JSON. No prose, no markdown fencing.\n\n" + userMessage
+	text2, usage2, llmErr2 := h.complete(ctx, model, systemPrompt, stricterPrompt, opts)
+	combinedUsage := Tokens{
+		InputTokens:  usage.InputTokens + usage2.InputTokens,
+		OutputTokens: usage.OutputTokens + usage2.OutputTokens,
+	}
+	duration := int(time.Since(start) / time.Millisecond)
+	if llmErr2 != nil {
+		// Retry call itself failed transport-wise. Record the round-
+		// trip tokens and surface llm_unavailable.
+		h.recordLedger(ctx, MetaCall{
+			ID:           uuid.NewString(),
+			RunID:        spec.RunID,
+			SkillID:      spec.SkillID,
+			ToolName:     spec.ToolName,
+			TierUsed:     tier,
+			ModelUsed:    resolvedModel,
+			InputTokens:  combinedUsage.InputTokens,
+			OutputTokens: combinedUsage.OutputTokens,
+			DurationMs:   duration,
+			Success:      false,
+			ErrorKind:    ErrorKindLLMUnavailable,
+			CreatedAt:    time.Now(),
+		})
+		return CallResult{
+			Text:         text,
+			Success:      false,
+			ErrorKind:    ErrorKindLLMUnavailable,
+			ModelUsed:    resolvedModel,
+			InputTokens:  combinedUsage.InputTokens,
+			OutputTokens: combinedUsage.OutputTokens,
+			DurationMs:   duration,
+		}, nil
+	}
+
+	parsed2, parsedOK2 := tryParseJSON(text2, "json")
+	if parsedOK2 {
+		h.recordLedger(ctx, MetaCall{
+			ID:           uuid.NewString(),
+			RunID:        spec.RunID,
+			SkillID:      spec.SkillID,
+			ToolName:     spec.ToolName,
+			TierUsed:     tier,
+			ModelUsed:    resolvedModel,
+			InputTokens:  combinedUsage.InputTokens,
+			OutputTokens: combinedUsage.OutputTokens,
+			DurationMs:   duration,
+			Success:      true,
+			CreatedAt:    time.Now(),
+		})
+		return CallResult{
+			Text:         text2,
+			Parsed:       parsed2,
+			Success:      true,
+			ModelUsed:    resolvedModel,
+			InputTokens:  combinedUsage.InputTokens,
+			OutputTokens: combinedUsage.OutputTokens,
+			DurationMs:   duration,
+		}, nil
+	}
+
+	// Second-failure path. Caller can fall back to result.Text.
+	h.recordLedger(ctx, MetaCall{
+		ID:           uuid.NewString(),
+		RunID:        spec.RunID,
+		SkillID:      spec.SkillID,
+		ToolName:     spec.ToolName,
+		TierUsed:     tier,
+		ModelUsed:    resolvedModel,
+		InputTokens:  combinedUsage.InputTokens,
+		OutputTokens: combinedUsage.OutputTokens,
+		DurationMs:   duration,
+		Success:      true,
+		ErrorKind:    ErrorKindMalformedJSON,
+		CreatedAt:    time.Now(),
+	})
+	return CallResult{
+		Text:         text2,
+		Success:      true,
+		ErrorKind:    ErrorKindMalformedJSON,
+		ModelUsed:    resolvedModel,
+		InputTokens:  combinedUsage.InputTokens,
+		OutputTokens: combinedUsage.OutputTokens,
+		DurationMs:   duration,
+	}, nil
+}
+
+// Tokens is the input/output token count returned by the LLM round-
+// trip. Mirrors llm.Usage's two cost-bearing fields. Exported so
+// downstream test code (the four meta-tools' tests, integration
+// tests) can use SetCompleteForTest.
+type Tokens struct {
+	InputTokens  int
+	OutputTokens int
+}
+
+// CompleteFn is the seam used by tests to fake the LLM round-trip
+// without spinning up a real provider. Exported for tests in other
+// packages (the four meta-tools live in pkg/skilltools/tools/).
+type CompleteFn func(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error)
+
+// completeOverride is set in tests via SetCompleteForTest. nil falls
+// back to the real model.Generate path.
+var completeOverride CompleteFn
+
+// complete is the actual LLM round-trip. Calls model.Generate (which
+// already routes through the lane transport wrapper) and returns the
+// text + usage + error.
+//
+// Why not call model.SimpleCall: SimpleCall doesn't surface Usage; we
+// need the input/output token counts for the ledger row.
+//
+// Usage attribution to the per-user / per-skill dashboards is handled
+// by the instrumented model that model.ParseModelForContext returns —
+// a manual model.RecordUsage here would double-count.
+func (h *Helper) complete(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) {
+	if completeOverride != nil {
+		return completeOverride(ctx, model, systemPrompt, userMessage, opts)
+	}
+	req := llm.Request{
+		System:   systemPrompt,
+		Messages: []llm.Message{llm.UserText(userMessage)},
+	}
+	resp, err := model.Generate(ctx, req, opts...)
+	if err != nil {
+		return "", Tokens{}, err
+	}
+	usage := Tokens{
+		InputTokens:  resp.Usage.InputTokens,
+		OutputTokens: resp.Usage.OutputTokens,
+	}
+	return resp.Text(), usage, nil
+}
+
+// SetCompleteForTest installs a fake completer used by Call. Returns a
+// restore function that the test deferes to revert the override.
+//
+// Why exported (vs in a _test.go file): the four meta-tools' tests live
+// in pkg/skilltools/tools/, in a different package than the helper.
+// They need a way to fake the LLM without depending on a real model.
+func SetCompleteForTest(fn CompleteFn) func() {
+	prev := completeOverride
+	completeOverride = fn
+	return func() { completeOverride = prev }
+}
+
+// tierAllowed reports whether the given tier appears in the configured
+// allowlist. Empty allowlist defaults to ["fast"].
+func (h *Helper) tierAllowed(ctx context.Context, tier string) bool {
+	var allowed []string
+	if h.convars != nil {
+		allowed = h.convars.AllowedTiers(ctx)
+	}
+	if len(allowed) == 0 {
+		allowed = []string{"fast"}
+	}
+	for _, t := range allowed {
+		if strings.EqualFold(strings.TrimSpace(t), tier) {
+			return true
+		}
+	}
+	return false
+}
+
+// recordLedger writes one meta-call row. Storage failures are logged
+// at the storage layer; the helper does not propagate them — meta-call
+// accounting MUST NOT break user-visible execution.
+func (h *Helper) recordLedger(ctx context.Context, call MetaCall) {
+	if h.storage == nil {
+		return
+	}
+	_ = h.storage.RecordMetaCall(ctx, call)
+}
+
+// tryParseJSON attempts to decode text as JSON. Returns the parsed
+// value (any) and ok=true on success. ok=false on failure or when
+// format is not "json".
+//
+// Why we accept arbitrary JSON shapes (vs requiring an object): the
+// extract_entities tool returns objects, but classify returns objects
+// with arrays inside. Accepting `any` keeps the helper agnostic to the
+// caller's downstream typing.
+//
+// Tolerance: strips a leading "```json" code fence + matching closing
+// fence so the agent can include surrounding markdown without
+// breaking parse. The stricter retry prompt explicitly asks for no
+// fence; this tolerance is for the first-attempt path.
+func tryParseJSON(text, format string) (any, bool) {
+	if !strings.EqualFold(format, "json") {
+		return nil, false
+	}
+	trimmed := strings.TrimSpace(text)
+	// Strip optional ```json ... ``` fence.
+	if strings.HasPrefix(trimmed, "```") {
+		// Drop opening fence (with or without language tag).
+		if idx := strings.Index(trimmed, "\n"); idx >= 0 {
+			trimmed = trimmed[idx+1:]
+		}
+		// Drop trailing fence.
+		if idx := strings.LastIndex(trimmed, "```"); idx >= 0 {
+			trimmed = trimmed[:idx]
+		}
+		trimmed = strings.TrimSpace(trimmed)
+	}
+	var parsed any
+	if err := json.Unmarshal([]byte(trimmed), &parsed); err != nil {
+		return nil, false
+	}
+	return parsed, true
+}
@@ -0,0 +1,282 @@
+package llmmeta
+
+import (
+	"context"
+	"errors"
+	"strings"
+	"sync"
+	"testing"
+
+	llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+)
+
+// fakeStorage records every MetaCall handed to RecordMetaCall and
+// makes them available to tests via the captured slice.
+type fakeStorage struct {
+	mu    sync.Mutex
+	calls []MetaCall
+	err   error
+}
+
+func (f *fakeStorage) RecordMetaCall(_ context.Context, call MetaCall) error {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.calls = append(f.calls, call)
+	return f.err
+}
+
+func (f *fakeStorage) snapshot() []MetaCall {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	out := make([]MetaCall, len(f.calls))
+	copy(out, f.calls)
+	return out
+}
+
+// TestCall_TierNotAllowed: a tier not in the allowlist returns the
+// rejection without recording a ledger row — the call did not happen.
+func TestCall_TierNotAllowed(t *testing.T) {
+	store := &fakeStorage{}
+	convars := ConvarReaderFunc(func(_ context.Context) []string {
+		return []string{"fast"}
+	})
+	h := New(store, convars)
+
+	res, err := h.Call(context.Background(), CallSpec{
+		Tier:       "thinking",
+		UserPrompt: "hello",
+		ToolName:   "summarize",
+	})
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if res.Success {
+		t.Errorf("expected Success=false")
+	}
+	if res.ErrorKind != ErrorKindTierNotAllowed {
+		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindTierNotAllowed)
+	}
+	if len(store.snapshot()) != 0 {
+		t.Errorf("expected NO ledger row for tier_not_allowed, got %d", len(store.snapshot()))
+	}
+}
+
+// TestCall_TierAllowedHappyText: a permitted tier yields a successful
+// text call AND records a ledger row.
+func TestCall_TierAllowedHappyText(t *testing.T) {
+	store := &fakeStorage{}
+	convars := ConvarReaderFunc(func(_ context.Context) []string {
+		return []string{"fast"}
+	})
+	h := New(store, convars)
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
+		return "summary text here", Tokens{InputTokens: 50, OutputTokens: 12}, nil
+	})
+	defer restore()
+
+	res, err := h.Call(context.Background(), CallSpec{
+		Tier:           "fast",
+		UserPrompt:     "summarise the following ...",
+		ToolName:       "summarize",
+		ResponseFormat: "text",
+		RunID:          "run-1",
+		SkillID:        "sk-1",
+	})
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if !res.Success {
+		t.Errorf("expected Success=true; got ErrorKind=%q", res.ErrorKind)
+	}
+	if res.Text != "summary text here" {
+		t.Errorf("Text = %q, want %q", res.Text, "summary text here")
+	}
+	if res.InputTokens != 50 || res.OutputTokens != 12 {
+		t.Errorf("token counts wrong: in=%d out=%d", res.InputTokens, res.OutputTokens)
+	}
+	if got := len(store.snapshot()); got != 1 {
+		t.Fatalf("expected 1 ledger row, got %d", got)
+	}
+	row := store.snapshot()[0]
+	if !row.Success {
+		t.Errorf("ledger Success = false, want true")
+	}
+	if row.ToolName != "summarize" {
+		t.Errorf("ledger ToolName = %q", row.ToolName)
+	}
+	if row.RunID != "run-1" {
+		t.Errorf("ledger RunID = %q", row.RunID)
+	}
+	if row.InputTokens != 50 || row.OutputTokens != 12 {
+		t.Errorf("ledger token counts wrong: in=%d out=%d",
+			row.InputTokens, row.OutputTokens)
+	}
+}
+
+// TestCall_JSONFirstAttemptParses: JSON-format request, response is
+// valid JSON on first try; result.Parsed populated.
+func TestCall_JSONFirstAttemptParses(t *testing.T) {
+	store := &fakeStorage{}
+	h := New(store, nil)
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
+		return `{"foo":"bar","n":42}`, Tokens{InputTokens: 10, OutputTokens: 5}, nil
+	})
+	defer restore()
+
+	res, _ := h.Call(context.Background(), CallSpec{
+		UserPrompt:           "extract entities",
+		ToolName:             "extract_entities",
+		ResponseFormat:       "json",
+		RetryOnMalformedJSON: true,
+		SkillID:              "sk-2",
+	})
+	if !res.Success || res.ErrorKind != "" {
+		t.Fatalf("expected success, got %+v", res)
+	}
+	m, ok := res.Parsed.(map[string]any)
+	if !ok {
+		t.Fatalf("Parsed not a map: %T %v", res.Parsed, res.Parsed)
+	}
+	if m["foo"] != "bar" {
+		t.Errorf("Parsed[foo] = %v", m["foo"])
+	}
+}
+
+// TestCall_JSONRetryPath: first response is malformed JSON; second
+// response (after stricter prompt) parses cleanly.
+func TestCall_JSONRetryPath(t *testing.T) {
+	store := &fakeStorage{}
+	h := New(store, nil)
+	calls := 0
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, prompt string, _ []llm.Option) (string, Tokens, error) {
+		calls++
+		if calls == 1 {
+			return "Here is your JSON: {oh no I forgot to format it", Tokens{InputTokens: 8, OutputTokens: 12}, nil
+		}
+		// Verify stricter prompt prefix appeared on retry.
+		if !strings.Contains(prompt, "Return ONLY valid JSON") {
+			t.Errorf("retry prompt missing stricter prefix: %q", prompt)
+		}
+		return `{"key":"value"}`, Tokens{InputTokens: 14, OutputTokens: 6}, nil
+	})
+	defer restore()
+
+	res, _ := h.Call(context.Background(), CallSpec{
+		UserPrompt:           "extract",
+		ToolName:             "extract_entities",
+		ResponseFormat:       "json",
+		RetryOnMalformedJSON: true,
+	})
+	if !res.Success || res.ErrorKind != "" {
+		t.Fatalf("expected success, got %+v", res)
+	}
+	if calls != 2 {
+		t.Errorf("expected 2 LLM calls, got %d", calls)
+	}
+	m, _ := res.Parsed.(map[string]any)
+	if m["key"] != "value" {
+		t.Errorf("Parsed = %v", res.Parsed)
+	}
+	// Token counts should reflect both attempts.
+	if res.InputTokens != 22 || res.OutputTokens != 18 {
+		t.Errorf("combined tokens wrong: in=%d out=%d", res.InputTokens, res.OutputTokens)
+	}
+}
+
+// TestCall_JSONRetryFailsTwice: second attempt also fails to parse.
+// Surfaces ErrorKind=malformed_json AND keeps Success=true so the
+// caller can fall back to result.Text.
+func TestCall_JSONRetryFailsTwice(t *testing.T) {
+	store := &fakeStorage{}
+	h := New(store, nil)
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
+		return "still not JSON", Tokens{InputTokens: 10, OutputTokens: 4}, nil
+	})
+	defer restore()
+
+	res, _ := h.Call(context.Background(), CallSpec{
+		UserPrompt:           "extract",
+		ToolName:             "extract_entities",
+		ResponseFormat:       "json",
+		RetryOnMalformedJSON: true,
+	})
+	if !res.Success {
+		t.Errorf("expected Success=true (fall-back-to-text), got Success=false")
+	}
+	if res.ErrorKind != ErrorKindMalformedJSON {
+		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindMalformedJSON)
+	}
+	if res.Parsed != nil {
+		t.Errorf("Parsed = %v, want nil after failed retry", res.Parsed)
+	}
+	rows := store.snapshot()
+	if len(rows) != 1 {
+		t.Fatalf("expected 1 ledger row, got %d", len(rows))
+	}
+	if !rows[0].Success || rows[0].ErrorKind != ErrorKindMalformedJSON {
+		t.Errorf("ledger row mismatch: %+v", rows[0])
+	}
+}
+
+// TestCall_LLMUnavailable: transport error from the model.Generate
+// call is surfaced as ErrorKind=llm_unavailable AND records a ledger
+// row.
+func TestCall_LLMUnavailable(t *testing.T) {
+	store := &fakeStorage{}
+	h := New(store, nil)
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
+		return "", Tokens{}, errors.New("network error")
+	})
+	defer restore()
+
+	res, _ := h.Call(context.Background(), CallSpec{
+		UserPrompt: "hi",
+		ToolName:   "summarize",
+	})
+	if res.Success {
+		t.Errorf("expected Success=false")
+	}
+	if res.ErrorKind != ErrorKindLLMUnavailable {
+		t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindLLMUnavailable)
+	}
+	rows := store.snapshot()
+	if len(rows) != 1 {
+		t.Fatalf("expected 1 ledger row, got %d", len(rows))
+	}
+}
+
+// TestCall_EmptyUserPromptErrors: programmer-error guard.
+func TestCall_EmptyUserPromptErrors(t *testing.T) {
+	h := New(&fakeStorage{}, nil)
+	_, err := h.Call(context.Background(), CallSpec{ToolName: "summarize"})
+	if err == nil {
+		t.Fatal("expected error for empty user_prompt")
+	}
+}
+
+// TestCall_JSONWithCodeFenceParses: tolerance for the first-attempt
+// response wrapped in a ```json ... ``` fence. The retry path uses a
+// stricter prompt; this test pins the first-attempt tolerance so
+// callers don't waste a round-trip on a benign formatting wrapper.
+func TestCall_JSONWithCodeFenceParses(t *testing.T) {
+	store := &fakeStorage{}
+	h := New(store, nil)
+	restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) {
+		return "```json\n{\"x\":1}\n```", Tokens{InputTokens: 5, OutputTokens: 4}, nil
+	})
+	defer restore()
+
+	res, _ := h.Call(context.Background(), CallSpec{
+		UserPrompt:           "extract",
+		ToolName:             "extract_entities",
+		ResponseFormat:       "json",
+		RetryOnMalformedJSON: true,
+	})
+	if res.ErrorKind != "" {
+		t.Errorf("unexpected ErrorKind %q (fenced JSON should parse on first attempt)", res.ErrorKind)
+	}
+	m, _ := res.Parsed.(map[string]any)
+	if m["x"] != float64(1) {
+		t.Errorf("Parsed[x] = %v, want 1", m["x"])
+	}
+}
@@ -0,0 +1,21 @@
+package llmmeta
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/model"
+)
+
+// TestMain configures a minimal model tier table so the helper's
+// model.ParseModelForContext("fast"/"standard") resolves. The actual LLM call
+// is stubbed per-test via SetCompleteForTest, so these specs are only parsed
+// (anthropic registers with an empty key and errors at call time, not parse).
+func TestMain(m *testing.M) {
+	model.Configure(nil, map[string]string{
+		"fast":     "anthropic/claude-haiku-4-5",
+		"standard": "anthropic/claude-sonnet-4-6",
+	}, time.Minute)
+	os.Exit(m.Run())
+}
@@ -0,0 +1,97 @@
+// Package llms — bench.go: the mort-flavored facade over majordomo's
+// health tracker for the `.failover` Discord commands and the failover
+// web UI.
+//
+// Why a facade (vs exposing health.Tracker directly): the admin surfaces
+// want the historical shape — a benched-only list with a manual/auto
+// flag. majordomo's tracker treats manual benches (Bench) and automatic
+// backoffs identically, so the manual marker is kept mort-side.
+package model
+
+import (
+	"sync"
+	"time"
+)
+
+// BenchedModel is one currently-benched model for admin display.
+type BenchedModel struct {
+	// Model is the "provider/model" target key.
+	Model string
+	// Until is the end of the bench window.
+	Until time.Time
+	// ConsecutiveFails is the failure count since the last success.
+	ConsecutiveFails int
+	// Manual reports the bench was placed by an operator (BenchModel)
+	// rather than the automatic failure threshold.
+	Manual bool
+}
+
+var (
+	manualMu      sync.Mutex
+	manualBenches = map[string]time.Time{}
+)
+
+// ListBenched returns the currently-benched models, manual and automatic,
+// from the live health tracker.
+func ListBenched() []BenchedModel {
+	now := time.Now()
+	pruneManual(now)
+
+	var out []BenchedModel
+	for _, st := range Health().Snapshot() {
+		if !st.Until.After(now) {
+			continue
+		}
+		out = append(out, BenchedModel{
+			Model:            st.Key,
+			Until:            st.Until,
+			ConsecutiveFails: st.ConsecutiveFailures,
+			Manual:           isManual(st.Key, st.Until),
+		})
+	}
+	return out
+}
+
+// BenchModel manually benches a model spec until the given time. The
+// chain executor skips benched targets until the window expires (or
+// UnbenchModel clears it).
+func BenchModel(model string, until time.Time) {
+	Health().Bench(model, until)
+	manualMu.Lock()
+	manualBenches[model] = until
+	manualMu.Unlock()
+}
+
+// UnbenchModel clears the bench on a model. Returns true when the model
+// was actually benched.
+func UnbenchModel(model string) bool {
+	now := time.Now()
+	wasBenched := Health().BackedOffUntil(model).After(now)
+	Health().Unbench(model)
+	manualMu.Lock()
+	delete(manualBenches, model)
+	manualMu.Unlock()
+	return wasBenched
+}
+
+// isManual reports whether the bench window for key matches a manual
+// bench placed via BenchModel. An automatic backoff that outlives the
+// manual window supersedes the marker.
+func isManual(key string, until time.Time) bool {
+	manualMu.Lock()
+	defer manualMu.Unlock()
+	manualUntil, ok := manualBenches[key]
+	return ok && !until.After(manualUntil)
+}
+
+// pruneManual drops expired manual markers so the map can't grow
+// unbounded across a long uptime.
+func pruneManual(now time.Time) {
+	manualMu.Lock()
+	defer manualMu.Unlock()
+	for k, until := range manualBenches {
+		if !until.After(now) {
+			delete(manualBenches, k)
+		}
+	}
+}
@@ -0,0 +1,415 @@
+package model
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"log/slog"
+	"runtime/debug"
+	"strings"
+	"time"
+
+	majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"github.com/google/uuid"
+)
+
+// CallResult captures the result of a single tool call execution.
+type CallResult struct {
+	Name      string
+	Arguments string
+	Result    string
+	Error     error
+}
+
+// instrumentedModel decorates a parsed model so every successful Generate
+// records token usage to the usage sink automatically. This is the
+// single usage chokepoint: ANY call through a model from
+// ParseModelRequest / ParseModelForContext is accounted, whether it goes
+// through the helpers in this file, the agent loop, or a direct
+// model.Generate at a call site.
+//
+// IMPORTANT: do not call RecordUsage on responses from a parsed model —
+// that would double-count. RecordUsage exists for models obtained outside
+// this package.
+type instrumentedModel struct {
+	inner llm.Model
+}
+
+func (m *instrumentedModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
+	resp, err := m.inner.Generate(ctx, req, opts...)
+	if err == nil && resp != nil {
+		recordUsage(ctx, resp)
+	}
+	return resp, err
+}
+
+func (m *instrumentedModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
+	return m.inner.Stream(ctx, req, opts...)
+}
+
+func (m *instrumentedModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() }
+
+// CallAndExecute sends messages to the model with a toolbox, executes any
+// tool calls, and returns the results. It performs a single round of
+// generation + tool execution (no looping) — multi-step loops belong to
+// the agent package.
+func CallAndExecute(ctx context.Context, model llm.Model, systemPrompt string, toolbox *llm.Toolbox, messages []llm.Message, opts ...llm.Option) ([]CallResult, string, error) {
+	req := llm.Request{System: systemPrompt, Messages: messages}
+
+	allOpts := make([]llm.Option, 0, len(opts)+1)
+	if toolbox != nil {
+		allOpts = append(allOpts, llm.WithToolbox(toolbox))
+	}
+	allOpts = append(allOpts, opts...)
+
+	startTime := time.Now()
+	resp, err := model.Generate(ctx, req, allOpts...)
+	if err != nil {
+		recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, nil, nil, startTime, err)
+		return nil, "", fmt.Errorf("completion failed: %w", err)
+	}
+
+	if len(resp.ToolCalls) == 0 || toolbox == nil {
+		recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, nil, startTime, nil)
+		return nil, resp.Text(), nil
+	}
+
+	var results []CallResult
+	for _, call := range resp.ToolCalls {
+		tr := toolbox.Execute(ctx, call)
+		cr := CallResult{
+			Name:      call.Name,
+			Arguments: string(call.Arguments),
+			Result:    tr.Content,
+		}
+		if tr.IsError {
+			cr.Error = errors.New(tr.Content)
+		}
+		results = append(results, cr)
+	}
+
+	recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, results, startTime, nil)
+
+	return results, resp.Text(), nil
+}
+
+// GenerateWith sends messages to the model with an optional system prompt and
+// returns structured output parsed into T. T must be a struct. Uses
+// majordomo's native structured output (response schema derived from T).
+func GenerateWith[T any](ctx context.Context, model llm.Model, systemPrompt string, messages []llm.Message, opts ...llm.Option) (T, error) {
+	req := llm.Request{System: systemPrompt, Messages: messages}
+
+	startTime := time.Now()
+
+	// Capture the raw response so the trace span carries usage and the
+	// concrete serving model even though majordomo.Generate only returns T.
+	capture := &captureModel{inner: model}
+	result, err := majordomo.Generate[T](ctx, capture, req, opts...)
+
+	resolvedModel := resolvedModelName(ctx, capture.resp)
+
+	if tracingEnabled(ctx) {
+		span := Span{
+			SpanID:       uuid.New().String(),
+			TraceID:      traceIDFromContext(ctx),
+			Model:        resolvedModel,
+			SystemPrompt: systemPrompt,
+			Messages:     marshalMessages(messages),
+			DurationMs:   time.Since(startTime).Milliseconds(),
+			StartedAt:    startTime,
+			CompletedAt:  time.Now(),
+			CreatedAt:    time.Now(),
+		}
+		if capture.resp != nil {
+			span.InputTokens = capture.resp.Usage.InputTokens
+			span.OutputTokens = capture.resp.Usage.OutputTokens
+		}
+		if err != nil {
+			span.Error = err.Error()
+			// Structured-output failure: log loudly so operators can chase
+			// down a regression (e.g. a model returning prose or fenced
+			// JSON the decoder rejects) from the trace span alone. The
+			// error string includes the failing field path on decode
+			// errors.
+			if isStructuredOutputParseError(err) {
+				slog.Warn("llms.GenerateWith: structured-output parse failure",
+					"model", resolvedModel,
+					"span_id", span.SpanID,
+					"trace_id", span.TraceID,
+					"err", err.Error(),
+				)
+			}
+		} else {
+			b, _ := json.Marshal(result)
+			span.ResponseText = string(b)
+		}
+		traceSink.WriteSpan(span)
+	} else if err != nil && isStructuredOutputParseError(err) {
+		// Tracing disabled: slog.Warn is the only breadcrumb operators get.
+		slog.Warn("llms.GenerateWith: structured-output parse failure (no trace span)",
+			"model", resolvedModel,
+			"err", err.Error(),
+		)
+	}
+
+	return result, err
+}
+
+// captureModel records the last successful response so wrappers that
+// only see the decoded result (majordomo.Generate) can still attribute
+// usage and tracing.
+type captureModel struct {
+	inner llm.Model
+	resp  *llm.Response
+}
+
+func (m *captureModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
+	resp, err := m.inner.Generate(ctx, req, opts...)
+	if err == nil {
+		m.resp = resp
+	}
+	return resp, err
+}
+
+func (m *captureModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
+	return m.inner.Stream(ctx, req, opts...)
+}
+
+func (m *captureModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() }
+
+// isStructuredOutputParseError reports whether err looks like a
+// structured-output failure from majordomo.Generate — either the decode
+// path ("decode structured response") or the empty-response path
+// ("structured response from ... is empty"). Used to gate the loud
+// slog.Warn so transport errors don't get tagged as parse failures.
+func isStructuredOutputParseError(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := err.Error()
+	return strings.Contains(s, "decode structured response") ||
+		strings.Contains(s, "structured response from")
+}
+
+// SimpleCall sends a single user message to the model with an optional system
+// prompt and returns the text response. No tools involved.
+func SimpleCall(ctx context.Context, model llm.Model, systemPrompt string, userMessage string, opts ...llm.Option) (string, error) {
+	msgs := []llm.Message{llm.UserText(userMessage)}
+
+	startTime := time.Now()
+	resp, err := model.Generate(ctx, llm.Request{System: systemPrompt, Messages: msgs}, opts...)
+	if err != nil {
+		recordSpanFromWrapper(ctx, systemPrompt, msgs, nil, nil, nil, startTime, err)
+		return "", fmt.Errorf("completion failed: %w", err)
+	}
+
+	recordSpanFromWrapper(ctx, systemPrompt, msgs, nil, resp, nil, startTime, nil)
+
+	return resp.Text(), nil
+}
+
+// RecordUsage records LLM token usage from a successful Generate response.
+//
+// ONLY call this for models obtained outside this package: models returned
+// by ParseModelRequest / ParseModelForContext record usage automatically on
+// every Generate, and calling RecordUsage on their responses double-counts.
+func RecordUsage(ctx context.Context, resp llm.Response) {
+	recordUsage(ctx, &resp)
+}
+
+// RecordSpan records a trace span for a direct model.Generate() call.
+// Call this from modules that invoke model.Generate() directly when they
+// want the call traced (usage is already recorded automatically for
+// parsed models).
+func RecordSpan(ctx context.Context, systemPrompt string, messages []llm.Message, toolbox *llm.Toolbox, resp *llm.Response, callResults []CallResult, startTime time.Time, callErr error) {
+	recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, callResults, startTime, callErr)
+}
+
+// recordUsage records token usage for one response. The model is
+// attributed from the response itself when possible (resp.Model names
+// the chain element that actually served the request — more precise than
+// the requested spec), falling back to the context attribution set by
+// ParseModelForContext.
+func recordUsage(ctx context.Context, resp *llm.Response) {
+	if usageSink == nil || resp == nil {
+		return
+	}
+	u := resp.Usage
+	if u.InputTokens == 0 && u.OutputTokens == 0 {
+		return
+	}
+	model := resolvedModelName(ctx, resp)
+	if model == "unknown" || model == "" {
+		tool := toolFromContext(ctx)
+		if tool == "unknown" {
+			slog.Warn("model usage: recording with both unknown model and tool",
+				"user", userFromContext(ctx), "stack", string(debug.Stack()))
+		} else {
+			slog.Warn("model usage: recording with unknown model — caller should set model.WithModel or use model.ParseModelForContext",
+				"tool", tool, "user", userFromContext(ctx))
+		}
+	}
+	usageSink.Record(ctx, model, u.InputTokens, u.OutputTokens, u.CacheReadTokens, u.CacheWriteTokens)
+}
+
+// resolvedModelName picks the usage/trace attribution name: the serving
+// model from the response when present ("provider/model" → "model"),
+// else the context's requested model resolved through the tier table.
+func resolvedModelName(ctx context.Context, resp *llm.Response) string {
+	if resp != nil && resp.Model != "" {
+		name := resp.Model
+		if idx := strings.Index(name, "/"); idx >= 0 {
+			name = name[idx+1:]
+		}
+		return name
+	}
+	return ResolveModelName(modelFromContext(ctx))
+}
+
+// tracingEnabled returns true if there's an active trace and tracing is enabled.
+func tracingEnabled(ctx context.Context) bool {
+	if traceSink == nil {
+		return false
+	}
+	return traceIDFromContext(ctx) != ""
+}
+
+// recordSpanFromWrapper records a trace span if tracing is active.
+func recordSpanFromWrapper(ctx context.Context, systemPrompt string, messages []llm.Message, toolbox *llm.Toolbox, resp *llm.Response, callResults []CallResult, startTime time.Time, callErr error) {
+	if !tracingEnabled(ctx) {
+		return
+	}
+
+	now := time.Now()
+
+	span := Span{
+		SpanID:          uuid.New().String(),
+		TraceID:         traceIDFromContext(ctx),
+		Model:           resolvedModelName(ctx, resp),
+		SystemPrompt:    systemPrompt,
+		Messages:        marshalMessages(messages),
+		ToolDefinitions: marshalToolDefs(toolbox),
+		DurationMs:      now.Sub(startTime).Milliseconds(),
+		StartedAt:       startTime,
+		CompletedAt:     now,
+		CreatedAt:       now,
+	}
+
+	if callErr != nil {
+		span.Error = callErr.Error()
+	}
+
+	if resp != nil {
+		span.ResponseText = resp.Text()
+		span.InputTokens = resp.Usage.InputTokens
+		span.OutputTokens = resp.Usage.OutputTokens
+		if len(resp.ToolCalls) > 0 {
+			span.ResponseToolCalls = marshalToolCalls(resp.ToolCalls)
+		}
+	}
+
+	if len(callResults) > 0 {
+		span.ToolResults = marshalCallResults(callResults)
+	}
+
+	traceSink.WriteSpan(span)
+}
+
+// --- Serialization helpers ---
+
+type jsonMessage struct {
+	Role       string `json:"role"`
+	Text       string `json:"text,omitempty"`
+	ToolCallID string `json:"tool_call_id,omitempty"`
+	ImageCount int    `json:"image_count,omitempty"`
+}
+
+func marshalMessages(msgs []llm.Message) string {
+	out := make([]jsonMessage, 0, len(msgs))
+	for _, m := range msgs {
+		jm := jsonMessage{
+			Role: string(m.Role),
+			Text: m.Text(),
+		}
+		for _, p := range m.Parts {
+			if _, ok := p.(llm.ImagePart); ok {
+				jm.ImageCount++
+			}
+		}
+		if len(m.ToolResults) > 0 {
+			jm.ToolCallID = m.ToolResults[0].ID
+		}
+		out = append(out, jm)
+	}
+	b, _ := json.Marshal(out)
+	return string(b)
+}
+
+type jsonToolCall struct {
+	ID        string `json:"id"`
+	Name      string `json:"name"`
+	Arguments string `json:"arguments"`
+}
+
+func marshalToolCalls(calls []llm.ToolCall) string {
+	out := make([]jsonToolCall, 0, len(calls))
+	for _, c := range calls {
+		out = append(out, jsonToolCall{
+			ID:        c.ID,
+			Name:      c.Name,
+			Arguments: string(c.Arguments),
+		})
+	}
+	b, _ := json.Marshal(out)
+	return string(b)
+}
+
+type jsonCallResult struct {
+	Name      string `json:"name"`
+	Arguments string `json:"arguments"`
+	Result    string `json:"result"`
+	Error     string `json:"error,omitempty"`
+}
+
+func marshalCallResults(results []CallResult) string {
+	out := make([]jsonCallResult, 0, len(results))
+	for _, r := range results {
+		jr := jsonCallResult{
+			Name:      r.Name,
+			Arguments: r.Arguments,
+			Result:    r.Result,
+		}
+		if r.Error != nil {
+			jr.Error = r.Error.Error()
+		}
+		out = append(out, jr)
+	}
+	b, _ := json.Marshal(out)
+	return string(b)
+}
+
+type jsonToolDef struct {
+	Name        string `json:"name"`
+	Description string `json:"description"`
+}
+
+func marshalToolDefs(tb *llm.Toolbox) string {
+	if tb == nil {
+		return ""
+	}
+	tools := tb.Tools()
+	if len(tools) == 0 {
+		return ""
+	}
+	out := make([]jsonToolDef, 0, len(tools))
+	for _, t := range tools {
+		out = append(out, jsonToolDef{
+			Name:        t.Name,
+			Description: t.Description,
+		})
+	}
+	b, _ := json.Marshal(out)
+	return string(b)
+}
@@ -0,0 +1,453 @@
+// V15.4 — Ollama Cloud dynamic context-length sync.
+//
+// Why: the static map in context_limits.go has to be hand-maintained
+// for every new Ollama Cloud model. Cloud ships new models monthly,
+// and a missing entry silently disables compaction for runs on that
+// model (compactionThresholdForModel returns 0 on MaxContextTokens
+// miss). Dynamic sync removes the maintenance burden and means new
+// cloud models work out-of-the-box.
+//
+// How: at boot, mort kicks off a CloudOllamaLimitCache.RefreshAll in a
+// background goroutine. RefreshAll calls /api/tags to list every
+// available cloud model, then concurrently calls /api/show for each
+// to extract `<family>.context_length` from the response's model_info
+// map. The cache is consulted by the executor's
+// compactionThresholdForModel via the cache-aware
+// MaxContextTokensWithCache helper.
+//
+// Periodic refresh: a daily ticker re-runs RefreshAll so newly
+// released models surface without a mort restart. The interval is
+// intentionally not configurable — cloud model context lengths don't
+// change for a given tag (only the tag pointer can move, e.g. :cloud
+// → larger model), so daily is conservative.
+
+package model
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"log/slog"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// defaultCloudEndpoint is the public Ollama Cloud base URL. Override
+// in tests via NewCloudOllamaLimitCache's endpoint arg.
+const defaultCloudEndpoint = "https://ollama.com"
+
+// CloudOllamaLimitCache holds context-length values for Ollama Cloud
+// models, populated dynamically via /api/tags + /api/show. Construct
+// with NewCloudOllamaLimitCache. Safe for concurrent use.
+//
+// Empty when OLLAMA_API_KEY is unset — Refresh returns a clear error
+// and the cache stays empty. Lookups return (0, false) and callers
+// fall back to the static map / disabled compaction.
+type CloudOllamaLimitCache struct {
+	mu       sync.RWMutex
+	limit    map[string]int
+	negative map[string]time.Time // model → fetch-failure time (for TTL)
+
+	endpoint   string
+	apiKey     string
+	httpClient *http.Client
+
+	// refreshConcurrency caps the number of concurrent /api/show calls
+	// during RefreshAll. Default 8 — enough to finish a ~50-model
+	// catalog in well under a minute without hammering Cloud.
+	refreshConcurrency int
+
+	// negativeTTL is how long a /api/show miss is cached before we
+	// retry. Prevents hammering Cloud on a typo or recently-removed
+	// model. Default 10 minutes.
+	negativeTTL time.Duration
+}
+
+// NewCloudOllamaLimitCache constructs a fresh cache. apiKey can be
+// empty — RefreshAll then returns an error and the cache stays empty.
+// endpoint defaults to https://ollama.com when empty. httpClient
+// defaults to a 15s-timeout client.
+func NewCloudOllamaLimitCache(endpoint, apiKey string, httpClient *http.Client) *CloudOllamaLimitCache {
+	if strings.TrimSpace(endpoint) == "" {
+		endpoint = defaultCloudEndpoint
+	}
+	endpoint = strings.TrimRight(endpoint, "/")
+	if httpClient == nil {
+		httpClient = &http.Client{Timeout: 15 * time.Second}
+	}
+	return &CloudOllamaLimitCache{
+		limit:              make(map[string]int),
+		negative:           make(map[string]time.Time),
+		endpoint:           endpoint,
+		apiKey:             apiKey,
+		httpClient:         httpClient,
+		refreshConcurrency: 8,
+		negativeTTL:        10 * time.Minute,
+	}
+}
+
+// SetNegativeTTL overrides the negative-cache lifetime. Tests use this
+// to control retry behaviour without sleeping.
+func (c *CloudOllamaLimitCache) SetNegativeTTL(d time.Duration) {
+	if c == nil || d < 0 {
+		return
+	}
+	c.mu.Lock()
+	c.negativeTTL = d
+	c.mu.Unlock()
+}
+
+// Lookup returns the cached context length for an Ollama Cloud model
+// name (e.g. "qwen3.5:cloud", "qwen3-coder:480b"). Returns (0, false)
+// on miss. Lookup never makes HTTP calls — it's the hot path consulted
+// by the executor before every run.
+//
+// modelName accepts either the bare model:tag form or the prefixed
+// "ollama-cloud/model:tag" form; the prefix is stripped.
+func (c *CloudOllamaLimitCache) Lookup(modelName string) (int, bool) {
+	if c == nil {
+		return 0, false
+	}
+	key := stripCloudPrefix(modelName)
+	if key == "" {
+		return 0, false
+	}
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	v, ok := c.limit[key]
+	return v, ok
+}
+
+// Size returns the number of cached entries. Useful for logging /
+// health checks.
+func (c *CloudOllamaLimitCache) Size() int {
+	if c == nil {
+		return 0
+	}
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return len(c.limit)
+}
+
+// LookupOrFetch returns the cached context length OR, on miss, makes a
+// single /api/show call to populate the cache. Negative results
+// (model not found, /api/show returns no context_length) are cached
+// for negativeTTL to prevent hammering Cloud on a typo. Returns
+// (0, false) when the model is genuinely unknown and (size, true) on
+// any successful resolve.
+//
+// Why this exists: Ollama Cloud's /api/tags lists canonical model
+// names only (e.g. "qwen3.5:397b") but accepts aliases on /api/show
+// (e.g. "qwen3.5:cloud" → same 397b model). The boot-time RefreshAll
+// only sees the canonical names, so common aliases miss the cache.
+// LookupOrFetch fills the gap.
+//
+// The cache is therefore self-healing: any unknown model gets one
+// live /api/show call, the result lands in the cache, and subsequent
+// runs hit immediately. Periodic RefreshAll overwrites everything
+// with the canonical-name results but additionally-fetched aliases
+// linger as positive entries.
+func (c *CloudOllamaLimitCache) LookupOrFetch(ctx context.Context, modelName string) (int, bool) {
+	if c == nil {
+		return 0, false
+	}
+	key := stripCloudPrefix(modelName)
+	if key == "" {
+		return 0, false
+	}
+	// Fast path: positive hit.
+	c.mu.RLock()
+	if v, ok := c.limit[key]; ok {
+		c.mu.RUnlock()
+		return v, true
+	}
+	// Negative cache check.
+	if t, ok := c.negative[key]; ok && time.Since(t) < c.negativeTTL {
+		c.mu.RUnlock()
+		return 0, false
+	}
+	c.mu.RUnlock()
+	// No API key configured → can't fetch. Don't write a negative
+	// entry (when the key gets configured later we want the next call
+	// to re-try immediately).
+	if strings.TrimSpace(c.apiKey) == "" {
+		return 0, false
+	}
+	// Slow path: live /api/show.
+	n, err := c.fetchContextLength(ctx, key)
+	if err != nil || n <= 0 {
+		slog.Debug("cloud limit cache: lazy fetch miss",
+			"model", key, "err", err)
+		c.mu.Lock()
+		c.negative[key] = time.Now()
+		c.mu.Unlock()
+		return 0, false
+	}
+	c.set(key, n)
+	slog.Info("cloud limit cache: lazy fetch hit", "model", key, "context_length", n)
+	return n, true
+}
+
+// set stores a context length. n <= 0 is a no-op.
+func (c *CloudOllamaLimitCache) set(modelName string, n int) {
+	if c == nil || n <= 0 {
+		return
+	}
+	key := stripCloudPrefix(modelName)
+	if key == "" {
+		return
+	}
+	c.mu.Lock()
+	c.limit[key] = n
+	c.mu.Unlock()
+}
+
+// RefreshAll queries /api/tags then concurrently calls /api/show for
+// every listed model, populating the cache. Returns the number of
+// models successfully cached and the first error encountered (a
+// /api/tags failure aborts; individual /api/show failures are logged
+// but don't abort the whole refresh).
+//
+// Safe to call repeatedly. Cache entries are overwritten with the
+// fresh values; entries for models that have been removed from Cloud
+// are NOT pruned (cheap to keep; pruning risks dropping an entry just
+// before a run that needs it).
+func (c *CloudOllamaLimitCache) RefreshAll(ctx context.Context) (int, error) {
+	if c == nil {
+		return 0, fmt.Errorf("cloud limit cache: nil receiver")
+	}
+	if strings.TrimSpace(c.apiKey) == "" {
+		return 0, fmt.Errorf("cloud limit cache: OLLAMA_API_KEY unset")
+	}
+	tags, err := c.fetchTags(ctx)
+	if err != nil {
+		return 0, fmt.Errorf("cloud limit cache: /api/tags: %w", err)
+	}
+
+	concurrency := c.refreshConcurrency
+	if concurrency <= 0 {
+		concurrency = 8
+	}
+	sem := make(chan struct{}, concurrency)
+	var wg sync.WaitGroup
+	var (
+		mu      sync.Mutex
+		success int
+	)
+	for _, name := range tags {
+		name := name
+		wg.Add(1)
+		sem <- struct{}{}
+		go func() {
+			defer wg.Done()
+			defer func() { <-sem }()
+			ctxLen, ferr := c.fetchContextLength(ctx, name)
+			if ferr != nil {
+				slog.Debug("cloud limit cache: /api/show miss",
+					"model", name, "err", ferr)
+				return
+			}
+			c.set(name, ctxLen)
+			mu.Lock()
+			success++
+			mu.Unlock()
+		}()
+	}
+	wg.Wait()
+	slog.Info("cloud limit cache: refresh complete",
+		"models_total", len(tags), "cached", success)
+	return success, nil
+}
+
+// StartPeriodicRefresh runs RefreshAll once immediately, then on every
+// interval tick. Cancellation via ctx stops the loop. Logs each
+// outcome; never returns an error to the caller (this is a background
+// task — failures are warnings, not show-stoppers).
+//
+// Typical usage: a goroutine spawned at mort boot.
+//
+//	go cache.StartPeriodicRefresh(ctx, 24*time.Hour)
+func (c *CloudOllamaLimitCache) StartPeriodicRefresh(ctx context.Context, interval time.Duration) {
+	if c == nil {
+		return
+	}
+	if interval <= 0 {
+		interval = 24 * time.Hour
+	}
+	doOne := func() {
+		n, err := c.RefreshAll(ctx)
+		if err != nil {
+			slog.Warn("cloud limit cache: refresh failed",
+				"err", err, "cached_size", c.Size())
+			return
+		}
+		slog.Info("cloud limit cache: refreshed",
+			"newly_cached_or_updated", n, "cached_size", c.Size())
+	}
+	doOne()
+	t := time.NewTicker(interval)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			doOne()
+		}
+	}
+}
+
+// fetchTags calls GET /api/tags and returns the model names.
+func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error) {
+	url := c.endpoint + "/api/tags"
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
+	if err != nil {
+		return nil, err
+	}
+	c.applyAuth(req)
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, err
+	}
+	if resp.StatusCode/100 != 2 {
+		return nil, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(body, 400))
+	}
+	var parsed struct {
+		Models []struct {
+			Name  string `json:"name"`
+			Model string `json:"model"`
+		} `json:"models"`
+	}
+	if err := json.Unmarshal(body, &parsed); err != nil {
+		return nil, fmt.Errorf("parse /api/tags: %w", err)
+	}
+	out := make([]string, 0, len(parsed.Models))
+	for _, m := range parsed.Models {
+		name := m.Name
+		if name == "" {
+			name = m.Model
+		}
+		if name == "" {
+			continue
+		}
+		out = append(out, name)
+	}
+	return out, nil
+}
+
+// fetchContextLength calls POST /api/show for a model and extracts
+// the largest *.context_length value from model_info. Returns the
+// length and nil on success; (0, err) on any failure.
+//
+// Why "largest" rather than family-keyed: the family field in the
+// /api/show response is sometimes empty or doesn't match the
+// model_info key prefix exactly (Ollama Cloud returns the
+// architecture as the prefix, which usually but not always matches
+// `family`). Scanning for any `*.context_length` is robust.
+func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelName string) (int, error) {
+	url := c.endpoint + "/api/show"
+	body, _ := json.Marshal(map[string]string{"name": modelName})
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
+	if err != nil {
+		return 0, err
+	}
+	req.Header.Set("Content-Type", "application/json")
+	c.applyAuth(req)
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return 0, err
+	}
+	defer resp.Body.Close()
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return 0, err
+	}
+	if resp.StatusCode/100 != 2 {
+		return 0, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(respBody, 400))
+	}
+	n, err := parseContextLengthJSON(respBody)
+	if err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// parseContextLengthJSON extracts the largest `*.context_length` int
+// from an /api/show response body. Exported-ish (lowercase but tested
+// in the same package) so the unit test can exercise it without
+// spinning up an httptest server.
+func parseContextLengthJSON(body []byte) (int, error) {
+	var parsed struct {
+		ModelInfo map[string]any `json:"model_info"`
+	}
+	if err := json.Unmarshal(body, &parsed); err != nil {
+		return 0, fmt.Errorf("parse: %w", err)
+	}
+	best := 0
+	for k, v := range parsed.ModelInfo {
+		if !strings.HasSuffix(k, ".context_length") {
+			continue
+		}
+		n := toInt(v)
+		if n > best {
+			best = n
+		}
+	}
+	if best <= 0 {
+		return 0, fmt.Errorf("no context_length in model_info")
+	}
+	return best, nil
+}
+
+// toInt coerces a JSON-decoded value to int. Handles float64 (the
+// json default) and json.Number; returns 0 for anything else.
+func toInt(v any) int {
+	switch x := v.(type) {
+	case float64:
+		return int(x)
+	case int:
+		return x
+	case int64:
+		return int(x)
+	case json.Number:
+		if n, err := x.Int64(); err == nil {
+			return int(n)
+		}
+	}
+	return 0
+}
+
+// applyAuth sets the Bearer token when an API key is configured.
+func (c *CloudOllamaLimitCache) applyAuth(req *http.Request) {
+	if strings.TrimSpace(c.apiKey) == "" {
+		return
+	}
+	req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(c.apiKey))
+}
+
+// stripCloudPrefix strips an "ollama-cloud/" prefix (and surrounding
+// whitespace). Returns the bare model:tag form.
+func stripCloudPrefix(s string) string {
+	s = strings.TrimSpace(s)
+	if strings.HasPrefix(s, "ollama-cloud/") {
+		s = s[len("ollama-cloud/"):]
+	}
+	return s
+}
+
+// truncate caps a byte slice for error messages.
+func truncate(b []byte, n int) string {
+	if len(b) <= n {
+		return string(b)
+	}
+	return string(b[:n]) + "...(truncated)"
+}
@@ -0,0 +1,224 @@
+// V15.2 — per-model context-window limits.
+//
+// Why: agents need to know when they're about to blow the model's
+// max-input cap so they can compact stale tool results out of the
+// message history. Pre-15.2 the agent loop had no awareness; a long
+// research run that accumulated dozens of HTTP tool results would
+// hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar
+// error mid-run with no graceful degradation.
+//
+// Coverage:
+//   - Anthropic Claude 4.x (200K default; 1M when the model ID
+//     includes the "[1m]" suffix per llms.tier reload conventions)
+//   - OpenAI GPT-4.x / o-series (128K)
+//   - Gemini 2.x (1M-2M, model-specific)
+//   - Ollama Cloud (model-specific; hardcoded per known model)
+//   - Local Ollama: queries `/api/show` once at first use, caches
+//
+// Returns (0, false) for unknown models — callers should treat
+// "unknown" as "don't budget" (the agent's existing iteration cap +
+// timeout are the fallback safety nets).
+
+package model
+
+import (
+	"context"
+	"strings"
+	"sync"
+)
+
+// MaxContextTokens returns the model's max INPUT context-window size
+// in tokens. Output / response tokens are NOT included — most models
+// share input + output budget but cap them separately, and the practical
+// concern is "how big can my prompt get before the model rejects".
+//
+// modelID accepts both the bare model name (`claude-sonnet-4-6`) and
+// the prefixed form (`anthropic/claude-sonnet-4-6` or
+// `ollama-cloud/qwen3-coder:480b`). The prefix is stripped before lookup.
+//
+// Returns (limit, true) on a known model; (0, false) otherwise.
+//
+// This function is pure (no I/O). For Ollama Cloud models that aren't
+// in the static map, use MaxContextTokensWithCache which consults a
+// CloudOllamaLimitCache populated at boot from /api/tags + /api/show.
+func MaxContextTokens(modelID string) (int, bool) {
+	id := normalizeModelID(modelID)
+	if v, ok := staticContextLimits[id]; ok {
+		return v, true
+	}
+	// Anthropic 1M-context variant marker. Mort's llms tier system
+	// uses a `[1m]` suffix on the model ID (e.g.
+	// `claude-opus-4-7[1m]`) to opt into Anthropic's 1M beta context.
+	if strings.HasSuffix(id, "[1m]") {
+		return 1_000_000, true
+	}
+	// Local-ollama dynamic lookup is wired separately so it can
+	// query the daemon's /api/show endpoint. The static map covers
+	// known cloud models.
+	return 0, false
+}
+
+// MaxContextTokensWithCache is the cache-aware variant of
+// MaxContextTokens. It tries the static map first; on miss, if the
+// model is an Ollama Cloud spec (the `ollama-cloud/` prefix), it
+// consults the supplied CloudOllamaLimitCache. Pass nil cache for
+// static-only behaviour (equivalent to MaxContextTokens).
+//
+// This function never makes HTTP calls — the cache must be
+// pre-populated (typically via cache.RefreshAll at boot). Callers in
+// the hot path can rely on a single map lookup per call. Prefer
+// MaxContextTokensResolving when a context is available — it makes a
+// single /api/show call to fill the cache on miss, which is essential
+// for Cloud aliases that /api/tags doesn't enumerate (e.g. :cloud).
+func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
+	if v, ok := MaxContextTokens(modelID); ok {
+		return v, true
+	}
+	if cloud == nil {
+		return 0, false
+	}
+	// Only ollama-cloud/* models are eligible for the cache.
+	id := strings.TrimSpace(modelID)
+	if !strings.HasPrefix(id, "ollama-cloud/") {
+		// Also allow bare model:tag form when the caller has already
+		// stripped the prefix (some test paths).
+		if strings.Contains(id, "/") {
+			return 0, false
+		}
+	}
+	return cloud.Lookup(id)
+}
+
+// MaxContextTokensResolving is the cache-aware variant that ALSO
+// performs a live /api/show fetch on cache miss (with negative caching
+// to prevent thrash). Use this in run-setup paths where one HTTP call
+// per unseen model is acceptable — typically the skill executor's
+// compaction threshold computation. The fetched result is cached for
+// future calls, so subsequent runs hit the in-memory map.
+//
+// Falls back to the static-only path when the model isn't an
+// ollama-cloud/* spec or cache is nil. ctx cancellation aborts the
+// fetch and returns (0, false) without writing a negative entry.
+func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) {
+	if v, ok := MaxContextTokens(modelID); ok {
+		return v, true
+	}
+	if cloud == nil {
+		return 0, false
+	}
+	id := strings.TrimSpace(modelID)
+	if !strings.HasPrefix(id, "ollama-cloud/") {
+		if strings.Contains(id, "/") {
+			return 0, false
+		}
+	}
+	return cloud.LookupOrFetch(ctx, id)
+}
+
+// normalizeModelID strips provider prefix and reasoning suffix so a
+// lookup keyed on the base name works regardless of caller form.
+//
+// Examples:
+//   - "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6"
+//   - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b"
+//   - "claude-opus-4-7:high" → "claude-opus-4-7"
+func normalizeModelID(id string) string {
+	id = strings.TrimSpace(id)
+	if idx := strings.Index(id, "/"); idx >= 0 {
+		id = id[idx+1:]
+	}
+	// Strip :low/:medium/:high reasoning effort suffix used by some
+	// OpenAI / Anthropic clients.
+	for _, suffix := range []string{":low", ":medium", ":high"} {
+		if strings.HasSuffix(id, suffix) {
+			id = id[:len(id)-len(suffix)]
+			break
+		}
+	}
+	return id
+}
+
+// staticContextLimits is the source of truth for known cloud models.
+// Add new entries when adding a model to the llms tier system.
+//
+// CRITICAL: keep these in sync with the actual provider docs. A wrong
+// number here causes EITHER premature compaction (too low, degrades
+// agent quality unnecessarily) OR HTTP 400 mid-run (too high). The
+// 410K-token failure on `qwen3-coder:480b` is the kind of bug a
+// mistyped value would reintroduce.
+var staticContextLimits = map[string]int{
+	// Anthropic Claude 4.x — default 200K input. 1M variant via
+	// `[1m]` suffix handled in MaxContextTokens above.
+	"claude-opus-4-7":           200_000,
+	"claude-opus-4-6":           200_000,
+	"claude-opus-4-5":           200_000,
+	"claude-sonnet-4-6":         200_000,
+	"claude-sonnet-4-5":         200_000,
+	"claude-haiku-4-5":          200_000,
+	"claude-haiku-4-5-20251001": 200_000,
+
+	// OpenAI GPT-4.x / o-series — 128K input.
+	"gpt-4o":      128_000,
+	"gpt-4o-mini": 128_000,
+	"gpt-4-turbo": 128_000,
+	"o1":          200_000,
+	"o1-mini":     128_000,
+	"o3-mini":     200_000,
+	"gpt-5":       400_000,
+	"gpt-5-mini":  400_000,
+
+	// Gemini — varies dramatically by model.
+	"gemini-2.5-pro":        2_000_000,
+	"gemini-2.5-flash":      1_000_000,
+	"gemini-2.5-flash-lite": 1_000_000,
+	"gemini-1.5-pro":        2_000_000,
+	"gemini-1.5-flash":      1_000_000,
+
+	// Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models
+	// — verified against the Ollama API show output for each model.
+	// Update when Ollama publishes new models or extends contexts.
+	"qwen3-coder:480b":   262_144, // 262K — matches the v15.2 trace
+	"qwen3:235b":         262_144,
+	"qwen3:32b":          131_072,
+	"qwen2.5:72b":        131_072,
+	"gpt-oss:120b":       131_072,
+	"gpt-oss:20b":        131_072,
+	"deepseek-v3.1:671b": 131_072,
+	"glm-4.6:355b":       131_072,
+	"kimi-k2:1t":         262_144,
+	"llama4:scout":       10_000_000, // Llama 4 Scout claims 10M
+	"llama4:maverick":    1_000_000,
+}
+
+// LocalOllamaLimitCache holds the resolved /api/show context_length per
+// local-ollama model. Populated on first lookup; never invalidated
+// (changing num_ctx requires an ollama restart anyway). Process-wide,
+// no per-tenant scoping needed.
+type LocalOllamaLimitCache struct {
+	mu    sync.RWMutex
+	limit map[string]int
+}
+
+// NewLocalOllamaLimitCache constructs a fresh cache.
+func NewLocalOllamaLimitCache() *LocalOllamaLimitCache {
+	return &LocalOllamaLimitCache{limit: make(map[string]int)}
+}
+
+// Get returns the cached limit or (0, false) when unseen. The caller
+// is expected to follow up with a lookup against the live daemon.
+func (c *LocalOllamaLimitCache) Get(model string) (int, bool) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	v, ok := c.limit[model]
+	return v, ok
+}
+
+// Set records a resolved limit. Idempotent; no-op when value is <= 0.
+func (c *LocalOllamaLimitCache) Set(model string, n int) {
+	if n <= 0 {
+		return
+	}
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.limit[model] = n
+}
@@ -0,0 +1,97 @@
+package model
+
+import (
+	"testing"
+	"time"
+)
+
+// mapSource is a tiny config.Source for tests: a key->value map, defaults
+// returned for misses.
+type mapSource map[string]string
+
+func (m mapSource) String(k, d string) string {
+	if v, ok := m[k]; ok {
+		return v
+	}
+	return d
+}
+func (m mapSource) Int(string, int) int           { panic("unused") }
+func (m mapSource) Float(string, float64) float64 { panic("unused") }
+func (m mapSource) Bool(string, bool) bool         { panic("unused") }
+
+// TestConfigureTierResolution covers the convar->config.Source inversion: the
+// host supplies a tier table (names + fallbacks) and a live config source; the
+// config value overrides the fallback, and an absent key falls back.
+func TestConfigureTierResolution(t *testing.T) {
+	Configure(
+		mapSource{"model.tier.fast": "anthropic/claude-haiku-4-5"},
+		map[string]string{"fast": "openai/gpt-4o-mini", "thinking": "anthropic/claude-opus-4-8"},
+		time.Minute,
+	)
+	defer Configure(nil, nil, 0) // reset package global
+
+	if !IsTierName("fast") || !IsTierName("thinking") {
+		t.Fatal("configured tiers should be registered")
+	}
+	if IsTierName("nope") {
+		t.Fatal("unknown tier must not report as a tier")
+	}
+	if names := TierNames(); len(names) != 2 || names[0] != "fast" || names[1] != "thinking" {
+		t.Fatalf("TierNames = %v, want sorted [fast thinking]", names)
+	}
+
+	// config value overrides the host fallback
+	if spec, _, ok := defaultResolver.Resolve("fast"); !ok || spec != "anthropic/claude-haiku-4-5" {
+		t.Fatalf("fast resolve = %q ok=%v; config should override fallback", spec, ok)
+	}
+	// fallback used when config has no override for the key
+	if spec, _, ok := defaultResolver.Resolve("thinking"); !ok || spec != "anthropic/claude-opus-4-8" {
+		t.Fatalf("thinking resolve = %q ok=%v; should use fallback", spec, ok)
+	}
+	// unknown tier
+	if _, _, ok := defaultResolver.Resolve("nope"); ok {
+		t.Fatal("Resolve of unknown tier should be ok=false")
+	}
+}
+
+// TestReasoningSuffixOnTier verifies the reasoning-suffix dialect survives the
+// move: a tier whose spec carries ":high" yields the bare spec + level "high".
+func TestReasoningSuffixOnTier(t *testing.T) {
+	Configure(nil, map[string]string{"thinking": "anthropic/claude-opus-4-8:high"}, time.Minute)
+	defer Configure(nil, nil, 0)
+
+	spec, level, ok := defaultResolver.Resolve("thinking")
+	if !ok {
+		t.Fatal("thinking should resolve")
+	}
+	if spec != "anthropic/claude-opus-4-8" {
+		t.Errorf("spec = %q, want suffix stripped", spec)
+	}
+	if level != "high" {
+		t.Errorf("reasoning level = %q, want high", level)
+	}
+}
+
+func TestValidateTierValueRejectsNestedTier(t *testing.T) {
+	Configure(nil, map[string]string{"fast": "x/y"}, time.Minute)
+	defer Configure(nil, nil, 0)
+
+	if err := ValidateTierValue("fast,a/b"); err == nil {
+		t.Error("a chain containing a tier alias must be rejected")
+	}
+	if err := ValidateTierValue("a/b,c/d"); err != nil {
+		t.Errorf("a chain of concrete specs must validate, got %v", err)
+	}
+}
+
+// TestSinksDefaultNil verifies usage/trace recording is inert with no sinks
+// installed (the light-host default).
+func TestSinksDefaultNil(t *testing.T) {
+	SetUsageSink(nil)
+	SetTraceSink(nil)
+	if TraceSinkActive() {
+		t.Error("no trace sink should mean inactive")
+	}
+	// recordUsage must be a no-op (no panic) with a nil sink.
+	recordUsage(WithModel(t.Context(), "x"), nil)
+}
@@ -0,0 +1,91 @@
+// Package llms — lane_mapping.go: maps a model spec to a stable lane
+// name. Pure data + a single function; no dependency on the registry,
+// no provider wrapping. Kept separate from lane_transport.go so the
+// mapping table can be committed and reviewed in isolation, and so
+// admin / webui code that just wants to *display* lane assignments
+// doesn't drag in the transport machinery.
+//
+// Why a fixed table: provider concurrency caps differ — Ollama Pro is
+// 3 connections, Anthropic Claude has higher per-tier limits, etc.
+// Each provider gets its own lane name so they can be configured
+// independently via convars (lanes.<name>.max_concurrent). Lane names
+// are user-facing (admin dashboard + convar key suffixes) and need to
+// stay stable across deploys; an env-overridable map adds complexity
+// for no current benefit.
+//
+// Test: lane_transport_test.go covers TestLaneFor_Mapping.
+package model
+
+import "strings"
+
+// Lane name constants. Defined as exported strings so admin code (.skill
+// admin set-lane <skill> <lane>), webui dropdowns, and convar consumers
+// share a single canonical spelling.
+const (
+	// LaneOllama covers ollama-cloud/* (and any future ollama/* local).
+	// The local ollama instance is on the same physical resource as
+	// the cloud account from mort's perspective — the connection cap
+	// should apply jointly.
+	LaneOllama = "ollama"
+
+	// LaneAnthropicThinking is the lane for Anthropic models in
+	// extended-thinking mode. Separated from default because thinking
+	// requests hold connections longer and can starve faster lanes
+	// when multiplexed.
+	LaneAnthropicThinking = "anthropic-thinking"
+
+	// LaneAnthropicDefault is the lane for non-thinking Anthropic
+	// requests (haiku, sonnet, opus without -thinking-).
+	LaneAnthropicDefault = "anthropic-default"
+
+	// LaneM1 is the lane for m1/* models (foreman-style router
+	// pointing at a dedicated local instance). Separated from the
+	// ollama lane because m1 targets a distinct host with its own
+	// connection budget.
+	LaneM1 = "m1"
+
+	// LaneLLMDefault is the catch-all lane for any provider/model
+	// combination not explicitly mapped above.
+	LaneLLMDefault = "llm-default"
+)
+
+// LaneFor returns the lane name for the given model spec. Mapping:
+//
+//	ollama-cloud/*               → "ollama"          (Pro account: 3 connections)
+//	anthropic/*-thinking-*       → "anthropic-thinking"
+//	anthropic/*                  → "anthropic-default"
+//	(anything else)              → "llm-default"
+//
+// Tier aliases (fast/standard/thinking) flow through this function as
+// the resolver's expanded provider/model spec, so callers don't need
+// to think about tier indirection. Empty input falls through to
+// LaneLLMDefault rather than panicking — defensive against unset
+// model specs in edge-case test wiring.
+//
+// Substring match for "-thinking-" keeps future Anthropic naming
+// variations classified correctly without churning this table on
+// every model release.
+func LaneFor(modelSpec string) string {
+	s := strings.TrimSpace(modelSpec)
+	if strings.HasPrefix(s, "ollama-cloud/") {
+		return LaneOllama
+	}
+	if strings.HasPrefix(s, "anthropic/") {
+		if strings.Contains(s, "-thinking-") {
+			return LaneAnthropicThinking
+		}
+		return LaneAnthropicDefault
+	}
+	// Foreman instances are backed by Ollama and share its connection
+	// cap, so they route to the same lane.
+	if strings.HasPrefix(s, "foreman/") {
+		return LaneOllama
+	}
+	// m1/ is a foreman-style router pointing at a dedicated local
+	// instance with its own connection budget. Separate lane so its
+	// concurrency cap is independent of the shared ollama lane.
+	if strings.HasPrefix(s, "m1/") {
+		return LaneM1
+	}
+	return LaneLLMDefault
+}
@@ -0,0 +1,373 @@
+// Package llms — lane_transport.go: the lane-aware decorator. Wraps an
+// llm.Provider so every model it mints submits its Generate/Stream calls
+// through the matching named lane's bounded worker pool (lane selection
+// per lane_mapping.go), and stamps every returned error with per-call
+// attribution (caller id, run id, prompt snapshot) for the failover log.
+//
+// Why intercept at the llm.Provider layer: majordomo's Provider and Model
+// are small public interfaces, so the decorator slots between the chain
+// executor and the real provider with no fork. Every chain attempt calls
+// laneModel.Generate, which queues on the lane, runs the real call, and
+// wraps failures with CallInfo — the ChainConfig.Observer (which receives
+// no context) recovers the attribution from the error itself.
+//
+// Test: lane_transport_test.go covers mapping correctness, the
+// concurrency-limiting behavior, and error attribution.
+// lane_chatbot_test.go is the regression guard proving chatbot-path LLM
+// calls actually go through the lane.
+package model
+
+import (
+	"context"
+	"errors"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"github.com/google/uuid"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/lane"
+)
+
+// defaultLaneExecTimeout is the execution backstop applied inside a lane
+// job once it leaves the queue: the caller's deadline is detached (queue
+// wait must not consume the LLM execution budget) and replaced with this
+// hard cap so a hung provider can't leak workers.
+const defaultLaneExecTimeout = 5 * time.Minute
+
+// foremanModelTimeout is the hard per-call timeout for foreman targets —
+// slow local LLMs that may block on model loads and upstream queues.
+const foremanModelTimeout = 30 * time.Minute
+
+// foremanLaneExecTimeout is the lane execution backstop for foreman
+// targets. Slightly above foremanModelTimeout so the model-level timeout
+// (the documented contract) is the one that fires.
+const foremanLaneExecTimeout = foremanModelTimeout + time.Minute
+
+// laneCallerKey is the context key for the per-call caller identity used
+// for fair-share queueing.
+type laneCallerKey struct{}
+
+// runIDKey is the context key for the per-call run id used for failover
+// event attribution.
+type runIDKey struct{}
+
+// ContextWithLaneCaller attaches a caller identity to ctx. The lane
+// decorator reads this when constructing a Job so fair-share queueing
+// can isolate heavy users, and snapshots it into error attribution for
+// the failover log.
+//
+// Empty string is a no-op and lumps every empty-caller invocation into a
+// single fair-share bucket; production callers should always populate it.
+func ContextWithLaneCaller(ctx context.Context, callerID string) context.Context {
+	if callerID == "" {
+		return ctx
+	}
+	return context.WithValue(ctx, laneCallerKey{}, callerID)
+}
+
+// LaneCallerFromContext returns the caller identity attached via
+// ContextWithLaneCaller, or "" if none is set.
+func LaneCallerFromContext(ctx context.Context) string {
+	s, _ := ctx.Value(laneCallerKey{}).(string)
+	return s
+}
+
+// ContextWithRunID attaches a skill/agent run id to ctx. Snapshotted into
+// error attribution so failover events can be correlated to runs.
+func ContextWithRunID(ctx context.Context, runID string) context.Context {
+	if runID == "" {
+		return ctx
+	}
+	return context.WithValue(ctx, runIDKey{}, runID)
+}
+
+// RunIDFromContext returns the run id attached via ContextWithRunID, or
+// "" if none is set.
+func RunIDFromContext(ctx context.Context) string {
+	s, _ := ctx.Value(runIDKey{}).(string)
+	return s
+}
+
+// ---------------------------------------------------------------------------
+// Error attribution
+// ---------------------------------------------------------------------------
+
+// CallInfo is the per-call attribution snapshot the lane decorator stamps
+// onto every error it returns. majordomo's ChainConfig.Observer receives
+// a bare FailoverEvent (no context); the failover log recovers caller,
+// run id, and the prompt chain from the event's error via
+// CallInfoFromError.
+type CallInfo struct {
+	// CallerID is the fair-share caller identity (ContextWithLaneCaller).
+	CallerID string
+	// RunID is the skill/agent run id (ContextWithRunID); "" if not threaded.
+	RunID string
+	// Messages is the request's message chain at call time, for the
+	// failover log's persist_prompts feature.
+	Messages []llm.Message
+}
+
+// callInfoError carries CallInfo along an error chain without changing
+// the error's message or classification (Unwrap preserves errors.Is/As).
+type callInfoError struct {
+	inner error
+	info  CallInfo
+}
+
+func (e *callInfoError) Error() string { return e.inner.Error() }
+func (e *callInfoError) Unwrap() error { return e.inner }
+
+// WithCallInfo stamps attribution onto err. nil err returns nil.
+func WithCallInfo(err error, info CallInfo) error {
+	if err == nil {
+		return nil
+	}
+	return &callInfoError{inner: err, info: info}
+}
+
+// CallInfoFromError extracts the attribution stamped by the lane
+// decorator (or WithCallInfo), if any.
+func CallInfoFromError(err error) (CallInfo, bool) {
+	var cie *callInfoError
+	if errors.As(err, &cie) {
+		return cie.info, true
+	}
+	return CallInfo{}, false
+}
+
+// ---------------------------------------------------------------------------
+// Lane decoration
+// ---------------------------------------------------------------------------
+
+// LaneRegistry is the narrow surface the lane decorator needs from
+// pkg/lane.Registry. Defined as an interface so tests can substitute a
+// fake registry without spinning up a real one.
+type LaneRegistry interface {
+	GetOrCreate(ctx context.Context, name string) lane.Lane
+}
+
+// laneProvider decorates an llm.Provider so every model it mints routes
+// calls through the lane named by LaneFor(provider/model). With a nil
+// registry the queueing is skipped but error attribution still applies.
+type laneProvider struct {
+	inner       llm.Provider
+	registry    LaneRegistry
+	execTimeout time.Duration
+}
+
+// WrapProviderForLane returns a provider whose models submit each
+// Generate/Stream call through the lane named by LaneFor(name/model) in
+// the registry, and stamp CallInfo attribution onto every error.
+//
+// A nil registry disables queueing (calls pass straight through) but the
+// decoration — and with it error attribution — remains, so failover
+// logging works in lane-less deployments and tests.
+func WrapProviderForLane(inner llm.Provider, registry LaneRegistry) llm.Provider {
+	return wrapProviderForLane(inner, registry, defaultLaneExecTimeout)
+}
+
+func wrapProviderForLane(inner llm.Provider, registry LaneRegistry, execTimeout time.Duration) llm.Provider {
+	if inner == nil {
+		return nil
+	}
+	if execTimeout <= 0 {
+		execTimeout = defaultLaneExecTimeout
+	}
+	return &laneProvider{inner: inner, registry: registry, execTimeout: execTimeout}
+}
+
+func (p *laneProvider) Name() string { return p.inner.Name() }
+
+func (p *laneProvider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) {
+	m, err := p.inner.Model(id, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return &laneModel{
+		inner:       m,
+		registry:    p.registry,
+		laneName:    LaneFor(p.inner.Name() + "/" + id),
+		execTimeout: p.execTimeout,
+	}, nil
+}
+
+// laneModel routes one model's calls through its lane and stamps error
+// attribution. The lane name is resolved once at Model() time — the
+// provider name and model id are both known there, unlike legacy gollm where
+// the request had to be inspected per call.
+type laneModel struct {
+	inner       llm.Model
+	registry    LaneRegistry
+	laneName    string
+	execTimeout time.Duration
+}
+
+func (m *laneModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() }
+
+// laneJob adapts an in-flight call to the lane.Job interface. The result
+// is captured into the struct and read after SubmitWait returns.
+type laneJob struct {
+	id       string
+	callerID string
+	run      func(ctx context.Context) error
+}
+
+func (j *laneJob) ID() string                    { return j.id }
+func (j *laneJob) CallerID() string              { return j.callerID }
+func (j *laneJob) Priority() int                 { return 0 }
+func (j *laneJob) Run(ctx context.Context) error { return j.run(ctx) }
+
+func (m *laneModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
+	// Fold options now so the job closure and the attribution snapshot
+	// both see the final request.
+	req = req.Apply(opts...)
+	info := CallInfo{
+		CallerID: LaneCallerFromContext(ctx),
+		RunID:    RunIDFromContext(ctx),
+		Messages: req.Messages,
+	}
+
+	resp, err := m.submit(ctx, func(execCtx context.Context) (*llm.Response, error) {
+		return m.inner.Generate(execCtx, req)
+	})
+	if err != nil {
+		return resp, WithCallInfo(err, info)
+	}
+	return resp, nil
+}
+
+func (m *laneModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
+	req = req.Apply(opts...)
+	info := CallInfo{
+		CallerID: LaneCallerFromContext(ctx),
+		RunID:    RunIDFromContext(ctx),
+		Messages: req.Messages,
+	}
+
+	l := m.lane(ctx)
+	if l == nil {
+		s, err := m.inner.Stream(ctx, req)
+		if err != nil {
+			return nil, WithCallInfo(err, info)
+		}
+		return s, nil
+	}
+
+	// Streams hold their lane slot only while ESTABLISHING the stream —
+	// holding it for the full consumption would deadlock a slow consumer
+	// against the pool. The caller's ctx is used as-is (no deadline
+	// detach): severing cancellation from a long-lived stream would leak
+	// connections.
+	var (
+		stream llm.Stream
+		serr   error
+	)
+	job := &laneJob{
+		id:       uuid.New().String(),
+		callerID: info.CallerID,
+		run: func(context.Context) error {
+			stream, serr = m.inner.Stream(ctx, req)
+			return serr
+		},
+	}
+	if err := l.SubmitWait(ctx, job); err != nil {
+		return nil, WithCallInfo(err, info)
+	}
+	if serr != nil {
+		return nil, WithCallInfo(serr, info)
+	}
+	return stream, nil
+}
+
+// lane resolves the lane for this model, or nil when queueing is
+// disabled (nil registry, or a registry that declines the name).
+func (m *laneModel) lane(ctx context.Context) lane.Lane {
+	if m.registry == nil {
+		return nil
+	}
+	return m.registry.GetOrCreate(ctx, m.laneName)
+}
+
+// submit runs fn through the lane (or directly when queueing is off).
+//
+// Inside a lane job the caller's deadline is detached so queue wait does
+// not consume the execution budget — ctx VALUES (usage attribution,
+// trace ids) are preserved, only cancellation/deadline are severed — and
+// an execTimeout backstop prevents runaway calls. Queue-phase
+// cancellation still works: SubmitWait waits on the original ctx, so a
+// caller that gives up while queued exits immediately.
+func (m *laneModel) submit(ctx context.Context, fn func(context.Context) (*llm.Response, error)) (*llm.Response, error) {
+	l := m.lane(ctx)
+	if l == nil {
+		return fn(ctx)
+	}
+	var (
+		resp *llm.Response
+		err  error
+	)
+	job := &laneJob{
+		id:       uuid.New().String(),
+		callerID: LaneCallerFromContext(ctx),
+		run: func(context.Context) error {
+			execCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), m.execTimeout)
+			defer cancel()
+			resp, err = fn(execCtx)
+			// Returning err lets the lane's pool propagate it to
+			// SubmitWait; the captured err is what we surface.
+			return err
+		},
+	}
+	if serr := l.SubmitWait(ctx, job); serr != nil && err == nil {
+		return nil, serr
+	}
+	return resp, err
+}
+
+// ---------------------------------------------------------------------------
+// Model timeout decoration (foreman)
+// ---------------------------------------------------------------------------
+
+// timeoutProvider wraps a provider so every minted model enforces a hard
+// per-call deadline on Generate. Used for foreman targets (slow local
+// LLMs). Stream is passed through: a wall-clock deadline on a long-lived
+// stream would sever it mid-consumption.
+type timeoutProvider struct {
+	inner   llm.Provider
+	timeout time.Duration
+}
+
+// withModelTimeout decorates p so its models' Generate calls carry a
+// hard timeout.
+func withModelTimeout(p llm.Provider, d time.Duration) llm.Provider {
+	if p == nil || d <= 0 {
+		return p
+	}
+	return &timeoutProvider{inner: p, timeout: d}
+}
+
+func (p *timeoutProvider) Name() string { return p.inner.Name() }
+
+func (p *timeoutProvider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) {
+	m, err := p.inner.Model(id, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return &timeoutModel{inner: m, timeout: p.timeout}, nil
+}
+
+type timeoutModel struct {
+	inner   llm.Model
+	timeout time.Duration
+}
+
+func (m *timeoutModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() }
+
+func (m *timeoutModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
+	ctx, cancel := context.WithTimeout(ctx, m.timeout)
+	defer cancel()
+	return m.inner.Generate(ctx, req, opts...)
+}
+
+func (m *timeoutModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
+	return m.inner.Stream(ctx, req, opts...)
+}
@@ -0,0 +1,477 @@
+// Package model is executus's config-driven model-access layer over majordomo: it owns the
+// package-level *majordomo.Registry (providers with mort's env keys,
+// OpenAI-compat presets, lane-aware decoration, the DB-backed tier
+// resolver, legacy shortcut aliases, the foreman timeout decorator, and
+// failover/health wiring), plus the mort-facing call helpers
+// (ParseModelRequest / ParseModelForContext / GenerateWith /
+// CallAndExecute / SimpleCall) and usage/trace recording.
+//
+// The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience:
+// majordomo treats model ids as verbatim, so this package strips the
+// suffix from specs and tier values and re-applies it per request via
+// llm.WithReasoningEffort on a wrapping Model.
+package model
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strings"
+	"sync"
+	"time"
+
+	majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/health"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/llm"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/google"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama"
+	"gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai"
+)
+
+// Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink
+// install the host seams, and ParseModelForContext stamps the model name on
+// the context (via WithModel) for attribution.
+
+// ---------------------------------------------------------------------------
+// Package registry
+// ---------------------------------------------------------------------------
+
+// buildConfig carries the knobs Wire feeds into buildRegistry. The zero
+// value yields a lane-less registry with majordomo's default failover
+// behavior — the bootstrap state tests and pre-Wire code paths run on.
+type buildConfig struct {
+	lanes LaneRegistry
+
+	// maxRetries maps the llms.failover.max_retries convar onto
+	// ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1).
+	maxRetries int
+
+	// cooldown maps the llms.failover.cooldown_seconds convar onto
+	// health.Config.BaseCooldown. <= 0 keeps the mort default (300s).
+	// Note majordomo grows the cooldown exponentially from this base;
+	// MaxCooldown is set to max(cooldown, 5m) so the operator dial
+	// dominates (a 10m base never gets capped below itself).
+	cooldown time.Duration
+
+	// observer receives one event per failover decision (failed attempt,
+	// bench, benched-skip). Typically failoverlog.NewObserver(...).
+	observer func(majordomo.FailoverEvent)
+}
+
+// defaultFailoverCooldown matches the historical llms.failover.cooldown_seconds
+// convar default (300s).
+const defaultFailoverCooldown = 300 * time.Second
+
+var (
+	registryMu sync.RWMutex
+	registry   = buildRegistry(buildConfig{})
+)
+
+// Registry returns the current package-level majordomo registry. Most
+// callers should use ParseModelRequest / ParseModelForContext instead;
+// the registry itself is exposed for admin surfaces (health/bench) and
+// for tests that need to substitute providers.
+func Registry() *majordomo.Registry {
+	registryMu.RLock()
+	defer registryMu.RUnlock()
+	return registry
+}
+
+// Health returns the health tracker of the current registry — the live
+// source of truth for benched models. Used by the `.failover` commands
+// and the failover web UI (see ListBenched/BenchModel/UnbenchModel for
+// the mort-flavored facade).
+func Health() *health.Tracker {
+	return Registry().Health()
+}
+
+// setRegistry swaps the package registry. Bench/backoff state of the old
+// registry is discarded — Wire is a boot-time operation.
+func setRegistry(r *majordomo.Registry) {
+	registryMu.Lock()
+	defer registryMu.Unlock()
+	registry = r
+}
+
+// buildRegistry constructs a fully-wired majordomo registry:
+//
+//   - health/chain config from the failover convars (via cfg),
+//   - mort's providers under their nonstandard env keys (OPENAI_KEY,
+//     GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated,
+//   - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq),
+//   - scheme factories for LLM_* env DSNs re-registered so DSN-defined
+//     providers (m1, arbitrary foreman targets) are lane-decorated too,
+//     with foreman additionally getting the 30-minute model timeout,
+//   - the legacy shortcut aliases, and
+//   - the delegating tier resolver (reads defaultResolver at Resolve
+//     time, so Init() can swap in the DB-backed resolver later).
+func buildRegistry(cfg buildConfig) *majordomo.Registry {
+	cooldown := cfg.cooldown
+	if cooldown <= 0 {
+		cooldown = defaultFailoverCooldown
+	}
+	maxCooldown := cooldown
+	if maxCooldown < 5*time.Minute {
+		maxCooldown = 5 * time.Minute
+	}
+
+	r := majordomo.New(
+		// Env DSNs are loaded manually below, AFTER the scheme factories
+		// are overridden — New()'s eager scan would otherwise build
+		// LLM_*-defined providers with the stock (un-decorated) factories.
+		majordomo.WithoutEnvProviders(),
+		majordomo.WithHealthConfig(health.Config{
+			BaseCooldown: cooldown,
+			MaxCooldown:  maxCooldown,
+		}),
+		majordomo.WithChainConfig(majordomo.ChainConfig{
+			TransientRetries: cfg.maxRetries,
+			// legacy gollm failed over on request-specific errors (400/413/422)
+			// without benching; majordomo fails fast on permanent errors by
+			// default. AdvanceOnPermanent preserves the availability-first
+			// behavior mort's executors rely on.
+			AdvanceOnPermanent: true,
+			Observer:           cfg.observer,
+		}),
+	)
+
+	wrap := func(p llm.Provider) llm.Provider {
+		return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout)
+	}
+
+	// Core providers with mort's env keys.
+	r.RegisterProvider(wrap(openai.New(
+		openai.WithAPIKey(os.Getenv("OPENAI_KEY")),
+	)))
+	r.RegisterProvider(wrap(anthropic.New(
+		anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")),
+	)))
+	r.RegisterProvider(wrap(google.New(
+		google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")),
+	)))
+	r.RegisterProvider(wrap(localOllamaProvider()))
+	// ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the
+	// provider still registers and errors clearly at call time (parity
+	// with the previous behavior).
+	r.RegisterProvider(wrap(ollama.Cloud()))
+
+	// OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults.
+	for _, preset := range []struct {
+		name, baseURL, envKey string
+	}{
+		{"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"},
+		{"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"},
+		{"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot
+		{"xai", "https://api.x.ai/v1", "XAI_API_KEY"},
+		{"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai
+		{"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"},
+	} {
+		r.RegisterProvider(wrap(openai.New(
+			openai.WithName(preset.name),
+			openai.WithBaseURL(preset.baseURL),
+			openai.WithAPIKey(os.Getenv(preset.envKey)),
+		)))
+	}
+
+	// Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined
+	// providers go through the lane decorator like the built-ins.
+	//
+	// foreman targets are slow local LLMs (large model loads, queued
+	// behind other requests), so their models additionally get a hard
+	// 30-minute timeout and a matching lane execution backstop — the
+	// default 5-minute lane backstop would strangle them.
+	r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+		p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name))
+		return wrapProviderForLane(
+			withModelTimeout(p, foremanModelTimeout),
+			cfg.lanes,
+			foremanLaneExecTimeout,
+		), nil
+	})
+	laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory {
+		return func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+			p, err := factory(name, dsn)
+			if err != nil {
+				return nil, err
+			}
+			return wrap(p), nil
+		}
+	}
+	ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+		return ollama.New(
+			ollama.WithName(name),
+			ollama.WithBaseURL(dsn.BaseURL()),
+			ollama.WithToken(dsn.Token),
+		), nil
+	})
+	r.RegisterScheme("ollama", ollamaScheme)
+	r.RegisterScheme("ollama-cloud", ollamaScheme)
+	r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+		return openai.New(
+			openai.WithName(name),
+			openai.WithBaseURL(dsn.BaseURL()),
+			openai.WithAPIKey(dsn.Token),
+		), nil
+	}))
+	r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+		return anthropic.New(
+			anthropic.WithName(name),
+			anthropic.WithBaseURL(dsn.BaseURL()),
+			anthropic.WithAPIKey(dsn.Token),
+		), nil
+	}))
+	googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) {
+		return google.New(
+			google.WithName(name),
+			google.WithBaseURL(dsn.BaseURL()),
+			google.WithAPIKey(dsn.Token),
+		), nil
+	})
+	r.RegisterScheme("google", googleScheme)
+	r.RegisterScheme("gemini", googleScheme)
+
+	// Eager LLM_* env scan, now with the decorated scheme factories in
+	// place. Malformed entries are recorded per-name and surface on use.
+	env := make(map[string]string)
+	for _, kv := range os.Environ() {
+		if k, v, ok := strings.Cut(kv, "="); ok {
+			env[k] = v
+		}
+	}
+	_ = r.LoadEnv(env)
+
+	// Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the
+	// historical table; kept in sync with legacyAliasSpecs below.
+	for name, spec := range legacyAliasSpecs {
+		r.RegisterAlias(name, spec)
+	}
+
+	// Tier resolver: a delegating closure so Init() and test helpers can
+	// swap defaultResolver without rebuilding the registry. The resolver
+	// returns specs with the legacy reasoning suffixes already stripped
+	// (per chain element); the tier's default reasoning level is applied
+	// by ParseModelRequest, not here.
+	r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) {
+		res := defaultResolver
+		if res == nil {
+			return "", false
+		}
+		spec, _, ok := res.Resolve(name)
+		return spec, ok
+	}))
+
+	return r
+}
+
+// localOllamaProvider builds the local Ollama provider, honoring
+// OLLAMA_BASE_URL when set (mort's historical env var; ollama.Local
+// itself honors OLLAMA_HOST).
+func localOllamaProvider() llm.Provider {
+	if url := os.Getenv("OLLAMA_BASE_URL"); url != "" {
+		return ollama.Local(ollama.WithBaseURL(url))
+	}
+	return ollama.Local()
+}
+
+// ---------------------------------------------------------------------------
+// Spec parsing
+// ---------------------------------------------------------------------------
+
+// ParseModelRequest resolves a model request string to a ready-to-use Model.
+// It handles, in order:
+//
+//   - empty spec → tier "fast"
+//   - the legacy ":low/:medium/:high" reasoning suffix, stripped per chain
+//     element (ollama tags like ":30b" or ":cloud" are preserved); the
+//     level is applied to every call via llm.WithReasoningEffort
+//   - tier aliases (DB-backed convars; a tier value's own suffix becomes
+//     the default level when the caller didn't supply one)
+//   - legacy shortcut aliases (sonnet, haiku, opus, ...)
+//   - provider/model lookup and LLM_* env-DSN fallback (majordomo)
+//   - comma-separated failover chains with health-tracked bench/backoff
+//
+// The returned Model is instrumented: token usage from every successful
+// Generate is recorded to the package usage recorder automatically. Do
+// NOT additionally call RecordUsage on responses from a parsed model.
+func ParseModelRequest(spec string) (majordomo.Model, error) {
+	spec = strings.TrimSpace(spec)
+	if spec == "" {
+		spec = "fast"
+	}
+
+	clean, level := splitReasoningSpec(spec)
+
+	// Tier default reasoning: when the (suffix-free) spec is exactly a
+	// tier name and the caller didn't ask for a level, the tier value's
+	// own suffix (e.g. "anthropic/claude-opus-4-6:high") applies.
+	if level == "" && defaultResolver != nil {
+		if _, tierLevel, ok := defaultResolver.Resolve(clean); ok {
+			level = tierLevel
+		}
+	}
+
+	m, err := Registry().Parse(clean)
+	if err != nil {
+		return nil, fmt.Errorf("model %q: %w", spec, err)
+	}
+	if level != "" {
+		m = &reasoningModel{inner: m, level: level}
+	}
+	return &instrumentedModel{inner: m}, nil
+}
+
+// ParseModelForContext combines ParseModelRequest with llmusage.WithModel so
+// that the resolved model name is recorded in the context for usage tracking.
+// Prefer this over bare ParseModelRequest in all new code.
+func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) {
+	model, err := ParseModelRequest(req)
+	if err != nil {
+		return ctx, nil, err
+	}
+	ctx = WithModel(ctx, ResolveModelName(req))
+	return ctx, model, nil
+}
+
+// reasoningModel applies a default reasoning effort to every request that
+// doesn't carry one already. Mort's legacy ":low/:medium/:high" suffix
+// dialect resolves to this wrapper because majordomo treats model ids as
+// verbatim (no suffix stripping).
+type reasoningModel struct {
+	inner llm.Model
+	level string
+}
+
+func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) {
+	req = req.Apply(opts...)
+	if req.ReasoningEffort == "" {
+		req.ReasoningEffort = m.level
+	}
+	return m.inner.Generate(ctx, req)
+}
+
+func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) {
+	req = req.Apply(opts...)
+	if req.ReasoningEffort == "" {
+		req.ReasoningEffort = m.level
+	}
+	return m.inner.Stream(ctx, req)
+}
+
+func (m *reasoningModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() }
+
+// ---------------------------------------------------------------------------
+// Reasoning-suffix dialect
+// ---------------------------------------------------------------------------
+
+// reasoningLevels is the set of recognized legacy suffix values.
+var reasoningLevels = map[string]bool{"low": true, "medium": true, "high": true}
+
+// splitReasoning peels an optional ":low" / ":medium" / ":high" suffix off
+// a single model request string. Returns the input unchanged and "" when no
+// recognized level is present, so non-reasoning suffixes (ollama tags like
+// ":30b" or ":q4_K_M", date stamps) flow through untouched.
+func splitReasoning(s string) (string, string) {
+	idx := strings.LastIndex(s, ":")
+	if idx < 0 {
+		return s, ""
+	}
+	if lvl := s[idx+1:]; reasoningLevels[lvl] {
+		return s[:idx], lvl
+	}
+	return s, ""
+}
+
+// splitReasoningSpec strips the legacy reasoning suffix from every element
+// of a (possibly comma-separated) spec. The returned level is the first
+// non-empty per-element level — majordomo chains carry one request-level
+// reasoning effort, not one per target, so the head element's preference
+// wins. Elements without a suffix are unchanged.
+func splitReasoningSpec(spec string) (string, string) {
+	if !strings.Contains(spec, ",") {
+		return splitReasoning(strings.TrimSpace(spec))
+	}
+	parts := strings.Split(spec, ",")
+	level := ""
+	for i, p := range parts {
+		s, l := splitReasoning(strings.TrimSpace(p))
+		parts[i] = s
+		if level == "" {
+			level = l
+		}
+	}
+	return strings.Join(parts, ","), level
+}
+
+// ---------------------------------------------------------------------------
+// Usage-attribution name resolution
+// ---------------------------------------------------------------------------
+
+// ResolveModelName returns the model portion of a request string, stripping
+// any reasoning suffix and resolving tier aliases. The result is used for
+// usage attribution (keyed on model name, not provider or reasoning level).
+func ResolveModelName(req string) string {
+	// Strip any reasoning-level suffix before resolving — the level is a
+	// per-request setting, not part of the model identity.
+	req, _ = splitReasoning(req)
+
+	// Tier expansion: when the request is a tier alias, fold it through the
+	// resolver and return the model portion of its current convar value. The
+	// empty string is treated as "fast" for compatibility with callers that
+	// pre-resolution defaulted to fast.
+	if defaultResolver != nil {
+		key := req
+		if key == "" {
+			key = "fast"
+		}
+		if spec, _, ok := defaultResolver.Resolve(key); ok && spec != "" {
+			// A tier may resolve to a comma-separated failover chain. Attribute
+			// usage to the first (preferred) entry's model name rather than the
+			// whole chain string.
+			if i := strings.IndexByte(spec, ','); i >= 0 {
+				spec = strings.TrimSpace(spec[:i])
+			}
+			if idx := strings.Index(spec, "/"); idx >= 0 {
+				return spec[idx+1:]
+			}
+			return spec
+		}
+	}
+
+	// For non-tier requests, return the model portion after the slash.
+	// Static aliases are NOT expanded here beyond the legacy table below:
+	// callers that went through ParseModelRequest already carry the
+	// concrete spec.
+	if idx := strings.Index(req, "/"); idx >= 0 {
+		return req[idx+1:]
+	}
+
+	// Legacy shortcut fallback: callers that pass bare names like "sonnet"
+	// to ResolveModelName (without going through ParseModelRequest) still
+	// need the concrete model name for usage keys.
+	if spec, ok := legacyAliasSpecs[req]; ok {
+		if idx := strings.Index(spec, "/"); idx >= 0 {
+			return spec[idx+1:]
+		}
+		return spec
+	}
+
+	return req
+}
+
+// legacyAliasSpecs maps legacy shortcut names to their full provider/model
+// spec. Registered with the registry as static aliases AND consulted by
+// ResolveModelName for bare-name usage attribution.
+var legacyAliasSpecs = map[string]string{
+	"openai":       "openai/gpt-4o-mini",
+	"gpt-4":        "openai/gpt-4",
+	"gpt-4o":       "openai/gpt-4o",
+	"gpt-4o-mini":  "openai/gpt-4o-mini",
+	"sonnet":       "anthropic/claude-sonnet-4-6",
+	"sonnet-4.5":   "anthropic/claude-sonnet-4-5-20250929",
+	"haiku":        "anthropic/claude-haiku-4-5-20251001",
+	"opus":         "anthropic/claude-opus-4-6",
+	"gemini":       "google/gemini-2.0-flash",
+	"gemini-flash": "google/gemini-2.0-flash",
+	"gemini-pro":   "google/gemini-2.0-pro",
+}
@@ -0,0 +1,131 @@
+package model
+
+import (
+	"context"
+	"time"
+)
+
+// This file is executus's inversion of mort's llmusage / llmtrace coupling.
+// The model package owns the MECHANISM (instrument every parsed model's
+// Generate, attribute by serving model, emit a span when a trace is active);
+// WHERE usage/traces land is a host seam. A host registers a UsageSink and/or
+// a TraceSink; both are optional (nil = off), so a light host records nothing.
+
+// --- Usage ---
+
+// UsageSink receives one record per successful Generate through a model parsed
+// by this package (ParseModelRequest / ParseModelForContext). Implement it to
+// meter or bill; the token detail mirrors majordomo's Response.Usage.
+type UsageSink interface {
+	Record(ctx context.Context, model string, inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens int)
+}
+
+var usageSink UsageSink
+
+// SetUsageSink installs the usage sink (nil disables usage recording). Call at
+// startup before model calls.
+func SetUsageSink(s UsageSink) { usageSink = s }
+
+// --- Trace ---
+
+// Span is one traced model call. The host's TraceSink persists it however it
+// likes (a DB row, a log line, an OTel span). String fields carrying structured
+// data (Messages, ToolDefinitions, ...) are pre-marshalled JSON.
+type Span struct {
+	SpanID  string
+	TraceID string
+	Model   string
+
+	SystemPrompt      string
+	Messages          string
+	ToolDefinitions   string
+	ResponseText      string
+	ResponseToolCalls string
+	ToolResults       string
+	Error             string
+
+	InputTokens  int
+	OutputTokens int
+	DurationMs   int64
+
+	StartedAt   time.Time
+	CompletedAt time.Time
+	CreatedAt   time.Time
+}
+
+// TraceSink receives a Span for each traced call (one is emitted only when a
+// trace id is present on the context — see WithTraceID).
+type TraceSink interface {
+	WriteSpan(span Span)
+}
+
+var traceSink TraceSink
+
+// SetTraceSink installs the trace sink (nil disables tracing).
+func SetTraceSink(s TraceSink) { traceSink = s }
+
+// TraceSinkActive reports whether a trace sink is installed.
+func TraceSinkActive() bool { return traceSink != nil }
+
+// --- Context attribution ---
+//
+// ParseModelForContext stamps the requested model onto the context so usage
+// from a response that doesn't name its serving model can still be attributed.
+// A host's tracing/usage middleware stamps a trace id and optional caller/tool
+// for diagnostics. All reads are nil/empty-safe.
+
+type (
+	ctxKeyModel struct{}
+	ctxKeyTrace struct{}
+	ctxKeyTool  struct{}
+	ctxKeyUser  struct{}
+)
+
+// WithModel attributes subsequent usage on ctx to the given model name.
+func WithModel(ctx context.Context, model string) context.Context {
+	return context.WithValue(ctx, ctxKeyModel{}, model)
+}
+
+func modelFromContext(ctx context.Context) string {
+	if v, ok := ctx.Value(ctxKeyModel{}).(string); ok {
+		return v
+	}
+	return ""
+}
+
+// WithTraceID marks ctx as belonging to a trace; a TraceSink (if installed)
+// then receives a Span per call. An empty id (or no id) disables tracing.
+func WithTraceID(ctx context.Context, id string) context.Context {
+	return context.WithValue(ctx, ctxKeyTrace{}, id)
+}
+
+func traceIDFromContext(ctx context.Context) string {
+	if v, ok := ctx.Value(ctxKeyTrace{}).(string); ok {
+		return v
+	}
+	return ""
+}
+
+// WithUsageTool / WithUsageUser attach optional attribution used only in the
+// "unknown model" diagnostic warning. Default "unknown".
+func WithUsageTool(ctx context.Context, tool string) context.Context {
+	return context.WithValue(ctx, ctxKeyTool{}, tool)
+}
+
+func toolFromContext(ctx context.Context) string {
+	if v, ok := ctx.Value(ctxKeyTool{}).(string); ok && v != "" {
+		return v
+	}
+	return "unknown"
+}
+
+func WithUsageUser(ctx context.Context, user string) context.Context {
+	return context.WithValue(ctx, ctxKeyUser{}, user)
+}
+
+func userFromContext(ctx context.Context) string {
+	if v, ok := ctx.Value(ctxKeyUser{}).(string); ok && v != "" {
+		return v
+	}
+	return "unknown"
+}
@@ -0,0 +1,162 @@
+package model
+
+import (
+	"fmt"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"gitea.stevedudenhoeffer.com/steve/executus/config"
+)
+
+// tierResolver expands tier aliases (e.g. "fast", "thinking", "agent-working")
+// into a concrete model spec or a comma-separated failover chain. The set of
+// tier names and their FALLBACK specs are host-supplied (a map passed at
+// Configure time); the live value of each tier is read from a config.Source
+// under the key "model.tier.<name>", so a host whose config backend mutates at
+// runtime (mort's convar) re-targets tiers without a restart, while a static
+// host (gadfly's env) just gets the fallback. A small in-process cache (TTL
+// from "model.tier.cache_ttl_seconds", default 30s) saves config round-trips on
+// the hot path; ReloadTiers clears it.
+//
+// This is executus's inversion of mort's convar-bound resolver: the MECHANISM
+// (tier lookup, reasoning-suffix dialect, chain validation, cache) is generic;
+// the tier MAP content (which tiers exist + their default specs) is host config.
+type tierResolver struct {
+	cfg      config.Source
+	defaults map[string]string // tier name -> fallback spec
+	ttl      time.Duration
+	mu       sync.RWMutex
+	cache    map[string]tierEntry
+	now      func() time.Time // overridable for tests
+}
+
+type tierEntry struct {
+	spec      string
+	reasoning string
+	expires   time.Time
+}
+
+const tierConfigPrefix = "model.tier."
+
+// NewTierResolver builds a resolver over cfg with the given tier defaults
+// (name -> fallback spec). cfg may be nil (the fallbacks are then always used).
+// ttl<=0 reads "model.tier.cache_ttl_seconds" (default 30s).
+func NewTierResolver(cfg config.Source, defaults map[string]string, ttl time.Duration) *tierResolver {
+	if ttl <= 0 {
+		ttl = time.Duration(config.Int(cfg, tierConfigPrefix+"cache_ttl_seconds", 30)) * time.Second
+	}
+	if ttl <= 0 {
+		ttl = 30 * time.Second
+	}
+	cp := make(map[string]string, len(defaults))
+	for k, v := range defaults {
+		cp[k] = v
+	}
+	return &tierResolver{
+		cfg:      cfg,
+		defaults: cp,
+		ttl:      ttl,
+		cache:    make(map[string]tierEntry),
+		now:      time.Now,
+	}
+}
+
+func (r *tierResolver) has(name string) bool {
+	_, ok := r.defaults[name]
+	return ok
+}
+
+func (r *tierResolver) names() []string {
+	out := make([]string, 0, len(r.defaults))
+	for k := range r.defaults {
+		out = append(out, k)
+	}
+	sort.Strings(out)
+	return out
+}
+
+// Resolve returns the current model spec and default reasoning level for a tier
+// name. ok=false if name is not a registered tier. Legacy reasoning suffixes
+// (":low/:medium/:high") are stripped per chain element; the first non-empty
+// level becomes the tier's default reasoning level (ollama tags like ":cloud"
+// pass through). The live value is read from config with the host-supplied
+// fallback; an empty resolved value yields ok=true with an empty spec
+// (ParseModelRequest surfaces a clear error in that path).
+func (r *tierResolver) Resolve(name string) (string, string, bool) {
+	if !r.has(name) {
+		return "", "", false
+	}
+	now := r.now()
+
+	r.mu.RLock()
+	if e, hit := r.cache[name]; hit && now.Before(e.expires) {
+		r.mu.RUnlock()
+		return e.spec, e.reasoning, true
+	}
+	r.mu.RUnlock()
+
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if e, hit := r.cache[name]; hit && now.Before(e.expires) {
+		return e.spec, e.reasoning, true
+	}
+
+	raw := strings.TrimSpace(config.String(r.cfg, tierConfigPrefix+name, r.defaults[name]))
+	spec, level := splitReasoningSpec(raw)
+	r.cache[name] = tierEntry{spec: spec, reasoning: level, expires: now.Add(r.ttl)}
+	return spec, level, true
+}
+
+// Reload clears the cache so the next Resolve fetches fresh from config.
+func (r *tierResolver) Reload() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.cache = make(map[string]tierEntry)
+}
+
+// --- package-level resolver + facade ---
+
+// defaultResolver is initialized as a package-level var (not in init()) so it
+// is ready before any other file's init runs — buildRegistry's delegating
+// resolver closure reads it at Resolve time. It starts with no tiers; a host
+// installs its tier table via Configure.
+var defaultResolver = NewTierResolver(nil, nil, 0)
+
+// Configure installs the host's tier table. cfg is the live config source
+// (nil = fallbacks only); defaults maps each tier name to its fallback spec;
+// ttl<=0 uses the config'd / 30s default. The package registry's delegating
+// resolver reads defaultResolver at Resolve time, so swapping it here is
+// sufficient — no registry rebuild needed.
+func Configure(cfg config.Source, defaults map[string]string, ttl time.Duration) {
+	defaultResolver = NewTierResolver(cfg, defaults, ttl)
+}
+
+// TierNames returns the registered tier alias names (sorted). Exported so UI
+// layers can populate tier dropdowns without hardcoding.
+func TierNames() []string { return defaultResolver.names() }
+
+// IsTierName reports whether s is a registered tier alias.
+func IsTierName(s string) bool { return defaultResolver.has(s) }
+
+// ReloadTiers clears the package resolver's cache so the next request resolves
+// freshly from config.
+func ReloadTiers() { defaultResolver.Reload() }
+
+// ValidateTierValue returns an error if value cannot be used as a tier spec —
+// specifically, when a chain entry is itself a tier name (which would form a
+// resolution loop). Chain entries must be concrete provider/model specs.
+func ValidateTierValue(value string) error {
+	for _, part := range strings.Split(value, ",") {
+		part = strings.TrimSpace(part)
+		if part == "" {
+			continue
+		}
+		spec, _ := splitReasoning(part)
+		if IsTierName(spec) {
+			return fmt.Errorf("tier value %q contains tier alias %q (chains must use concrete provider/model specs, not nested tiers)", value, spec)
+		}
+	}
+	return nil
+}
@@ -0,0 +1,110 @@
+// Package llms — wiring.go: the production boot hook that rebuilds the
+// package registry with the lane registry, the failover convars, and the
+// failover-event observer.
+//
+// Why a dedicated helper (vs spreading registry construction through
+// mort.go): the chatbot regression test in lane_chatbot_test.go and the
+// production boot path must call the SAME wiring code. Historically
+// mort.go skipped the lane wiring entirely (lanes were defined but never
+// installed — 30+ skill_runs in production with 0 skill_queue_jobs rows);
+// concentrating the install here means a regression in one wires fails
+// the test for the other.
+package model
+
+import (
+	"context"
+	"log/slog"
+	"time"
+
+	majordomo "gitea.stevedudenhoeffer.com/steve/majordomo"
+)
+
+// WireOptions configures Wire. The zero value rebuilds the registry with
+// no lanes and default failover behavior.
+type WireOptions struct {
+	// Lanes is the lane registry every provider is decorated with. nil
+	// disables lane queueing (calls pass straight through) but keeps
+	// error attribution for the failover log.
+	Lanes LaneRegistry
+
+	// FailoverMaxRetries maps the llms.failover.max_retries convar onto
+	// majordomo's ChainConfig.TransientRetries (same-target retries after
+	// a transient error). <= 0 keeps majordomo's default (1).
+	FailoverMaxRetries int
+
+	// FailoverCooldown maps the llms.failover.cooldown_seconds convar
+	// onto health.Config.BaseCooldown. majordomo grows the cooldown
+	// exponentially from this base per consecutive bench; the cap is
+	// max(FailoverCooldown, 5m) so the operator's dial dominates.
+	// <= 0 keeps the mort default (300s).
+	FailoverCooldown time.Duration
+
+	// FailoverObserver receives one event per failover decision (failed
+	// attempt, bench, benched-skip). Wire it to failoverlog.NewObserver.
+	// Attribution (caller/run/prompts) rides on the event's error — see
+	// CallInfoFromError.
+	FailoverObserver func(majordomo.FailoverEvent)
+}
+
+// Wire rebuilds the package registry from opts and installs it. Call once
+// at boot, after the lane registry and the failover convars exist (and
+// after Init for DB-backed tiers — though Init and Wire are order-
+// independent: the tier resolver is consulted through a delegating
+// indirection).
+//
+// Rebuilding discards in-memory health/bench state — Wire is a boot-time
+// operation, not a runtime toggle.
+//
+// When Lanes is non-nil, the well-known lanes (KnownLanes) are eagerly
+// registered so admin dashboards have baseline state from the moment mort
+// starts instead of "no lanes registered" until the first LLM call.
+//
+// Returns the installed registry for inspection (tests, health surfaces).
+func Wire(ctx context.Context, opts WireOptions) *majordomo.Registry {
+	r := buildRegistry(buildConfig{
+		lanes:      opts.Lanes,
+		maxRetries: opts.FailoverMaxRetries,
+		cooldown:   opts.FailoverCooldown,
+		observer:   opts.FailoverObserver,
+	})
+	setRegistry(r)
+
+	if opts.Lanes != nil {
+		names := KnownLanes()
+		for _, name := range names {
+			opts.Lanes.GetOrCreate(ctx, name)
+		}
+		slog.Info("llms: wired lane-aware registry", "lanes", len(names))
+	} else {
+		slog.Warn("llms: Wire called without a lane registry — lane queueing is inert")
+	}
+	return r
+}
+
+// KnownLanes returns the well-known lane names the LLM transport resolves
+// to. Eager-registering these at boot gives admin dashboards
+// (`/skills/admin/queues`, `.skill admin queue`) a baseline view from the
+// moment mort starts — without this, the dashboard reads "no lanes
+// registered" until the first chatbot/skill call materialises the lane
+// via lazy GetOrCreate.
+//
+// Why this list (and not "every lane name ever"): these are the ones
+// LaneFor in lane_mapping.go can produce for a real model spec. Future
+// non-LLM lanes (e.g. a future image-generation lane) should be eagerly
+// registered by their owning subsystem, not here.
+//
+// LaneSkillDefault is included even though it isn't an LLM-routing
+// lane: skills run through it via skillexec.WithLaneRegistry, and the
+// skills admin dashboard needs to see it from boot.
+//
+// Test: wiring_test.go::TestKnownLanes_NonEmpty.
+func KnownLanes() []string {
+	return []string{
+		LaneOllama,
+		LaneAnthropicThinking,
+		LaneAnthropicDefault,
+		LaneM1,
+		LaneLLMDefault,
+		"skill-default",
+	}
+}