diff --git a/CLAUDE.md b/CLAUDE.md index dcc609e..dfade99 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -48,11 +48,12 @@ CORE (majordomo + stdlib): dispatchguard/ loop/depth/fan-out caps [P0 ✓] pendingattach/ attachment dedupe [P0 ✓] tool/ registry + 3-stage permissions + ssrf [P1 ✓] - model/ config-driven tier resolution over majordomo [P1] - llmmeta/ shared meta-LLM helper (moves with model/) [P1] + model/ config-driven tier resolution over majordomo [P1 ✓] + (convar->config.Source; UsageSink/TraceSink seams; GenerateWith[T] + structured output — no separate structured/ pkg) + llmmeta/ shared meta-LLM helper over model/ [P1 ✓] compact/ context compactor (WithCompactor hook) [P2] tools/{web,net,store,compose,meta,comms} generic tools [P3] - structured/ Generate[T] convenience over majordomo [P1] BATTERIES (opt-in siblings, each nil-safe + a default): persona/ Agent noun + AgentStore seam + yml loader [P4] diff --git a/go.mod b/go.mod index c211aa8..c8ea9c0 100644 --- a/go.mod +++ b/go.mod @@ -4,5 +4,25 @@ go 1.26.2 require ( gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3 + github.com/google/uuid v1.6.0 golang.org/x/crypto v0.53.0 ) + +require ( + cloud.google.com/go v0.116.0 // indirect + cloud.google.com/go/auth v0.9.3 // indirect + cloud.google.com/go/compute/metadata v0.5.0 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect + github.com/google/go-cmp v0.6.0 // indirect + github.com/google/s2a-go v0.1.8 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect + github.com/gorilla/websocket v1.5.3 // indirect + go.opencensus.io v0.24.0 // indirect + golang.org/x/net v0.55.0 // indirect + golang.org/x/sys v0.46.0 // indirect + golang.org/x/text v0.38.0 // indirect + google.golang.org/genai v1.59.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/grpc v1.66.2 // indirect + google.golang.org/protobuf v1.34.2 // indirect +) diff --git a/go.sum b/go.sum index 97cbe0a..22e9ecc 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,130 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.116.0 h1:B3fRrSDkLRt5qSHWe40ERJvhvnQwdZiHu0bJOpldweE= +cloud.google.com/go v0.116.0/go.mod h1:cEPSRWPzZEswwdr9BxE6ChEn01dWlTaF05LiC2Xs70U= +cloud.google.com/go/auth v0.9.3 h1:VOEUIAADkkLtyfr3BLa3R8Ed/j6w1jTBmARx+wb5w5U= +cloud.google.com/go/auth v0.9.3/go.mod h1:7z6VY+7h3KUdRov5F1i8NDP5ZzWKYmEPO842BgCsmTk= +cloud.google.com/go/compute/metadata v0.5.0 h1:Zr0eK8JbFv6+Wi4ilXAR8FJ3wyNdpxHKJNPos6LTZOY= +cloud.google.com/go/compute/metadata v0.5.0/go.mod h1:aHnloV2TPI38yx4s9+wAZhHykWvVCfu7hQbF+9CWoiY= gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3 h1:KYKIFFRsXzbbBJVDa99+Fhy0zxl9G0xV/MCrLipsLL4= gitea.stevedudenhoeffer.com/steve/majordomo v0.0.0-20260626223738-1fd7109a42f3/go.mod h1:UZLveG17SmENt4sne2RSLIbioix30RZbRIQUzBAnOyY= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/s2a-go v0.1.8 h1:zZDs9gcbt9ZPLV0ndSyQk6Kacx2g/X+SKYovpnz3SMM= +github.com/google/s2a-go v0.1.8/go.mod h1:6iNWHTpQ+nfNRN5E00MSdfDwVesa8hhS32PhPO8deJA= +github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw= +github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA= +github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= +github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= +go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= +golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.46.0 h1:noSf2Fq6F8DBgS+LysIkx7rIExoNHJsxOAtPp4rthXw= +golang.org/x/sys v0.46.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE= +golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genai v1.59.0 h1:xp+ydkJFW8hO0hTUaAkr8TrLM9HFP3NYAwFhPd0nDqA= +google.golang.org/genai v1.59.0/go.mod h1:mDdPDFXo1Ats7f1WXVyZgWb/CkMzFWTWJruIMy7hGIU= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= +google.golang.org/grpc v1.66.2 h1:3QdXkuq3Bkh7w+ywLdLvM56cmGvQHUMZpiCzt6Rqaoo= +google.golang.org/grpc v1.66.2/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/llmmeta/helper.go b/llmmeta/helper.go new file mode 100644 index 0000000..3fabdd0 --- /dev/null +++ b/llmmeta/helper.go @@ -0,0 +1,615 @@ +// Package llmmeta is the shared meta-LLM helper used by the v12 +// authoring tools (summarize, translate, extract_entities, classify). +// +// Why a dedicated package: each of those four tools makes "one fast-tier +// LLM call → typed result", with shared concerns (tier allowlist, +// ledger row, JSON-retry on malformed output). Centralising the pattern +// stops every tool from re-implementing the surrounding bookkeeping and +// keeps the audit trail uniform. +// +// The helper itself does NOT know about the four tools — it just exposes +// a Call(ctx, CallSpec) → CallResult shape. Each tool builds its own +// prompt + parses the typed result. The helper records the meta-call +// ledger row on every call, success or failure. +// +// Concurrency / lanes: the helper resolves the tier to an llm.Model via +// model.ParseModelForContext and uses model.Generate. Lane routing is +// already baked in at the LLM transport layer (see +// pkg/logic/llms/lane_transport.go) so each Generate call automatically +// goes through the right lane without further plumbing. Usage recording +// is automatic too: parsed models are instrumented by pkg/logic/llms, +// so the helper does NOT call model.RecordUsage itself. +// +// Tier allowlist: convar `skills.llm_meta.allowed_tiers` (default +// `["fast"]`) controls which tiers a meta-tool may use. A request for +// a disallowed tier returns error_kind="tier_not_allowed" WITHOUT +// making the call AND WITHOUT recording a ledger row (the call did +// not happen). +// +// Test: helper_test.go covers tier allowed, tier rejected, JSON +// retry path, malformed-twice path, and ledger-row emission semantics. +package llmmeta + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "github.com/google/uuid" + + "gitea.stevedudenhoeffer.com/steve/executus/model" +) + +// MetaCall is the domain row written to skill_llm_meta_calls on every +// helper call. +// +// Why a dedicated table (not skill_run_logs): per-skill token +// aggregation is cleaner with typed columns. Folding meta-calls into +// the generic event log would force a SUM-from-JSON path on every +// dashboard query. +// +// Why the field set is tight (no payload columns): the request bodies +// can be 32KB+. The agent's main run already captures system_prompt +// + user_message in the trace; storing them again here would double +// the audit footprint with no diagnostic value (the meta-call's +// inputs are derivable from the parent run's tool-call args). +type MetaCall struct { + ID string + RunID string + SkillID string + ToolName string + TierUsed string // "fast" / "standard" + ModelUsed string // resolved provider/model + InputTokens int + OutputTokens int + DurationMs int + Success bool + ErrorKind string // empty on success; one of the sentinel kinds otherwise + CreatedAt time.Time +} + +// Storage is the narrow surface the helper uses to persist meta-call +// ledger rows. Production wires a thin adapter around the skills GORM +// storage; tests substitute a fake. +// +// Why an interface (vs depending on pkg/logic/skills.Storage): the +// skills package imports skilltools (tool registry); having +// skilltools/llmmeta depend back on skills would form an import +// cycle. A narrow interface mirrored across the boundary is the +// project's standard cycle-break pattern (see KVStorage / FileStorage +// in pkg/skilltools/tools/). +type Storage interface { + RecordMetaCall(ctx context.Context, call MetaCall) error +} + +// ConvarReader is the narrow surface the helper uses to read +// `skills.llm_meta.allowed_tiers`. The convar package is database- +// backed; tests pass a static fake. +// +// Why an interface (vs reading convars directly): unit tests want to +// fake the allowlist without spinning up a convar manager. +type ConvarReader interface { + // AllowedTiers returns the list of tier names a meta-tool may use. + // Default ["fast"]. + AllowedTiers(ctx context.Context) []string +} + +// ConvarReaderFunc adapts a closure into a ConvarReader. Useful in +// production wiring (mort.go) where the underlying access is a +// single line of logic. +type ConvarReaderFunc func(ctx context.Context) []string + +// AllowedTiers satisfies ConvarReader. +func (f ConvarReaderFunc) AllowedTiers(ctx context.Context) []string { + if f == nil { + return []string{"fast"} + } + return f(ctx) +} + +// Helper makes one fast-tier LLM call with surrounding bookkeeping +// (tier allowlist, JSON retry, ledger row). +// +// Construct once at boot; all four meta-tools share the same Helper. +type Helper struct { + storage Storage + convars ConvarReader +} + +// New constructs a Helper. storage MUST be non-nil; passing nil makes +// every Call write a no-op ledger row (callers that need a fully no-op +// helper should instead avoid registering the tool). +// +// convars may be nil — the helper falls back to the default allowlist +// `["fast"]`. +// +// Why a constructor with explicit deps (vs Helper{...} struct +// initialiser): forces the deployment-time decision about which +// dependencies are wired vs nil-safe at the construction call site, +// not at the call site of each tool. +func New(storage Storage, convars ConvarReader) *Helper { + return &Helper{ + storage: storage, + convars: convars, + } +} + +// CallSpec is the per-call input. +// +// Why every field is explicit (vs builder pattern): the four meta-tools +// each populate the spec in one place; a struct literal at the call +// site is more readable than chained setters. +type CallSpec struct { + // Tier is the tier alias to use ("fast" / "standard"). Empty falls + // back to "fast". Disallowed tiers (per the convar allowlist) cause + // Call to return CallResult{Success: false, ErrorKind: + // "tier_not_allowed"} WITHOUT making the LLM call AND without + // writing a ledger row (the call did not happen). + Tier string + + // SystemPrompt is the system message. May be empty. + SystemPrompt string + + // UserPrompt is the user message. Required. + UserPrompt string + + // MaxOutputTokens caps the response. 0 disables the cap (provider + // default). The helper uses this both to bound the cost estimate + // AND to set llm.WithMaxTokens on the request. + MaxOutputTokens int + + // ResponseFormat is "text" or "json". When "json", the helper + // attempts to parse the response into JSON. Other values fall + // through as "text". + ResponseFormat string + + // RetryOnMalformedJSON, when true and ResponseFormat=="json", + // retries the call ONCE with a stricter JSON-only prompt prefix + // when the first response fails to parse. Second-failure returns + // CallResult{Success: true, Parsed: nil, ErrorKind: + // "malformed_json"} so callers can fall back to result.Text. + RetryOnMalformedJSON bool + + // ToolName is the meta-tool name recorded in the ledger row + // ("summarize", "translate", "extract_entities", "classify"). The + // helper does not branch on this value. + ToolName string + + // RunID is the calling skill run ID. Recorded in the ledger row; + // also used by the cost-cap callback to find the running 7-day + // total. + RunID string + + // SkillID is the calling skill ID. Recorded in the ledger row; + // passed to the cost-cap callback. + SkillID string + + // CallerID is the Discord member ID that triggered the parent + // skill run. Passed to the cost-cap callback so the per-user + // 7-day cap can be evaluated. + CallerID string +} + +// CallResult is the per-call output. +// +// Why text + parsed (vs only one): JSON-format calls expose both the +// raw response (in .Text) and the parsed map (in .Parsed). Text-format +// calls leave .Parsed nil. Callers requesting JSON that fails to parse +// twice get .Text populated and ErrorKind="malformed_json" so they +// can fall back to text-mode without an error path. +type CallResult struct { + // Text is the raw response text from the LLM. Populated on every + // successful call (success=true) AND when JSON parsing failed + // twice (success=true, parsed=nil, error_kind="malformed_json"). + // Empty on tier_not_allowed rejections (no LLM call happened). + Text string + + // Parsed is the JSON-decoded response. nil for text-format calls, + // nil for failed JSON parses, populated for successful JSON + // responses. The interior shape is whatever the LLM returned; the + // caller is responsible for asserting a typed view. + Parsed any + + // InputTokens is the tokens billed against the input. 0 when the + // provider didn't surface usage. + InputTokens int + + // OutputTokens is the tokens billed against the output. 0 when the + // provider didn't surface usage. + OutputTokens int + + // DurationMs is wall-clock duration of the LLM call (or call+retry + // in the JSON-retry case). + DurationMs int + + // ModelUsed is the resolved provider/model string ("anthropic/ + // claude-haiku-4-5-20251001"). Populated on every actual LLM call; + // empty on tier_not_allowed rejections. + ModelUsed string + + // Success reports whether the LLM call returned a usable response. + // True on happy-path AND on malformed-json second-failure (the + // caller can fall back to .Text). False on transport errors, + // tier_not_allowed, llm_unavailable. + Success bool + + // ErrorKind, when non-empty, is one of: + // - "tier_not_allowed" → no call, no ledger row + // - "llm_unavailable" → call attempted, ledger row written + // - "malformed_json" → call succeeded but JSON parse failed + ErrorKind string +} + +// Sentinel error_kind values for CallResult.ErrorKind. +const ( + ErrorKindTierNotAllowed = "tier_not_allowed" + ErrorKindLLMUnavailable = "llm_unavailable" + ErrorKindMalformedJSON = "malformed_json" +) + +// Call performs the meta-LLM call and returns a typed CallResult. +// +// Why no error return (vs an error second value): every meaningful +// failure is captured as a CallResult.ErrorKind so the caller's branch +// logic stays single-pathed. Internal transport errors are surfaced +// as ErrorKind=llm_unavailable. The function only returns a non-nil +// error for argument-validation failures (empty UserPrompt) — a +// programmer error the caller would have to fix anyway. +// +// Test: helper_test.go covers all outcomes (tier_not_allowed, happy +// text, happy json, malformed_json retry-pass, malformed_json +// retry-fail, llm_unavailable). +func (h *Helper) Call(ctx context.Context, spec CallSpec) (CallResult, error) { + if strings.TrimSpace(spec.UserPrompt) == "" { + return CallResult{}, fmt.Errorf("llmmeta: user_prompt required") + } + tier := strings.TrimSpace(spec.Tier) + if tier == "" { + tier = "fast" + } + + // Tier allowlist: rejected tiers do NOT make the call AND do NOT + // record a ledger row. + if !h.tierAllowed(ctx, tier) { + return CallResult{ + Success: false, + ErrorKind: ErrorKindTierNotAllowed, + }, nil + } + + resolvedModel := model.ResolveModelName(tier) + + // Resolve model. ParseModelForContext attaches the resolved model + // name to ctx (for usage attribution) AND returns the llm.Model + // whose Generate already routes through the lane wrapper. + ctx, model, err := model.ParseModelForContext(ctx, tier) + if err != nil { + // Tier convar mis-set: surface as tier_not_allowed to the + // caller (the agent's recovery path is the same as for an + // admin-disabled tier) but DO record the failure for the + // admin who needs to fix the convar. + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + Success: false, + ErrorKind: ErrorKindTierNotAllowed, + CreatedAt: time.Now(), + }) + return CallResult{ + Success: false, + ErrorKind: ErrorKindTierNotAllowed, + }, nil + } + + // First call. + start := time.Now() + systemPrompt := spec.SystemPrompt + userMessage := spec.UserPrompt + opts := []llm.Option{} + if spec.MaxOutputTokens > 0 { + opts = append(opts, llm.WithMaxTokens(spec.MaxOutputTokens)) + } + text, usage, llmErr := h.complete(ctx, model, systemPrompt, userMessage, opts) + if llmErr != nil { + duration := int(time.Since(start) / time.Millisecond) + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + DurationMs: duration, + Success: false, + ErrorKind: ErrorKindLLMUnavailable, + CreatedAt: time.Now(), + }) + return CallResult{ + Success: false, + ErrorKind: ErrorKindLLMUnavailable, + ModelUsed: resolvedModel, + DurationMs: duration, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + }, nil + } + + // Determine outcome based on response format. + parsed, parsedOK := tryParseJSON(text, spec.ResponseFormat) + wantJSON := strings.EqualFold(spec.ResponseFormat, "json") + + if !wantJSON || parsedOK { + // Happy path (text mode OR JSON mode that parsed first try). + duration := int(time.Since(start) / time.Millisecond) + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + DurationMs: duration, + Success: true, + CreatedAt: time.Now(), + }) + return CallResult{ + Text: text, + Parsed: parsed, + Success: true, + ModelUsed: resolvedModel, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + DurationMs: duration, + }, nil + } + + // JSON requested but first response failed to parse. + if !spec.RetryOnMalformedJSON { + duration := int(time.Since(start) / time.Millisecond) + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + DurationMs: duration, + Success: true, + ErrorKind: ErrorKindMalformedJSON, + CreatedAt: time.Now(), + }) + return CallResult{ + Text: text, + Success: true, + ErrorKind: ErrorKindMalformedJSON, + ModelUsed: resolvedModel, + InputTokens: usage.InputTokens, + OutputTokens: usage.OutputTokens, + DurationMs: duration, + }, nil + } + + // Retry once with stricter JSON-only prompt prefix. + stricterPrompt := "Return ONLY valid JSON. No prose, no markdown fencing.\n\n" + userMessage + text2, usage2, llmErr2 := h.complete(ctx, model, systemPrompt, stricterPrompt, opts) + combinedUsage := Tokens{ + InputTokens: usage.InputTokens + usage2.InputTokens, + OutputTokens: usage.OutputTokens + usage2.OutputTokens, + } + duration := int(time.Since(start) / time.Millisecond) + if llmErr2 != nil { + // Retry call itself failed transport-wise. Record the round- + // trip tokens and surface llm_unavailable. + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + Success: false, + ErrorKind: ErrorKindLLMUnavailable, + CreatedAt: time.Now(), + }) + return CallResult{ + Text: text, + Success: false, + ErrorKind: ErrorKindLLMUnavailable, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + }, nil + } + + parsed2, parsedOK2 := tryParseJSON(text2, "json") + if parsedOK2 { + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + Success: true, + CreatedAt: time.Now(), + }) + return CallResult{ + Text: text2, + Parsed: parsed2, + Success: true, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + }, nil + } + + // Second-failure path. Caller can fall back to result.Text. + h.recordLedger(ctx, MetaCall{ + ID: uuid.NewString(), + RunID: spec.RunID, + SkillID: spec.SkillID, + ToolName: spec.ToolName, + TierUsed: tier, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + Success: true, + ErrorKind: ErrorKindMalformedJSON, + CreatedAt: time.Now(), + }) + return CallResult{ + Text: text2, + Success: true, + ErrorKind: ErrorKindMalformedJSON, + ModelUsed: resolvedModel, + InputTokens: combinedUsage.InputTokens, + OutputTokens: combinedUsage.OutputTokens, + DurationMs: duration, + }, nil +} + +// Tokens is the input/output token count returned by the LLM round- +// trip. Mirrors llm.Usage's two cost-bearing fields. Exported so +// downstream test code (the four meta-tools' tests, integration +// tests) can use SetCompleteForTest. +type Tokens struct { + InputTokens int + OutputTokens int +} + +// CompleteFn is the seam used by tests to fake the LLM round-trip +// without spinning up a real provider. Exported for tests in other +// packages (the four meta-tools live in pkg/skilltools/tools/). +type CompleteFn func(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) + +// completeOverride is set in tests via SetCompleteForTest. nil falls +// back to the real model.Generate path. +var completeOverride CompleteFn + +// complete is the actual LLM round-trip. Calls model.Generate (which +// already routes through the lane transport wrapper) and returns the +// text + usage + error. +// +// Why not call model.SimpleCall: SimpleCall doesn't surface Usage; we +// need the input/output token counts for the ledger row. +// +// Usage attribution to the per-user / per-skill dashboards is handled +// by the instrumented model that model.ParseModelForContext returns — +// a manual model.RecordUsage here would double-count. +func (h *Helper) complete(ctx context.Context, model llm.Model, systemPrompt, userMessage string, opts []llm.Option) (string, Tokens, error) { + if completeOverride != nil { + return completeOverride(ctx, model, systemPrompt, userMessage, opts) + } + req := llm.Request{ + System: systemPrompt, + Messages: []llm.Message{llm.UserText(userMessage)}, + } + resp, err := model.Generate(ctx, req, opts...) + if err != nil { + return "", Tokens{}, err + } + usage := Tokens{ + InputTokens: resp.Usage.InputTokens, + OutputTokens: resp.Usage.OutputTokens, + } + return resp.Text(), usage, nil +} + +// SetCompleteForTest installs a fake completer used by Call. Returns a +// restore function that the test deferes to revert the override. +// +// Why exported (vs in a _test.go file): the four meta-tools' tests live +// in pkg/skilltools/tools/, in a different package than the helper. +// They need a way to fake the LLM without depending on a real model. +func SetCompleteForTest(fn CompleteFn) func() { + prev := completeOverride + completeOverride = fn + return func() { completeOverride = prev } +} + +// tierAllowed reports whether the given tier appears in the configured +// allowlist. Empty allowlist defaults to ["fast"]. +func (h *Helper) tierAllowed(ctx context.Context, tier string) bool { + var allowed []string + if h.convars != nil { + allowed = h.convars.AllowedTiers(ctx) + } + if len(allowed) == 0 { + allowed = []string{"fast"} + } + for _, t := range allowed { + if strings.EqualFold(strings.TrimSpace(t), tier) { + return true + } + } + return false +} + +// recordLedger writes one meta-call row. Storage failures are logged +// at the storage layer; the helper does not propagate them — meta-call +// accounting MUST NOT break user-visible execution. +func (h *Helper) recordLedger(ctx context.Context, call MetaCall) { + if h.storage == nil { + return + } + _ = h.storage.RecordMetaCall(ctx, call) +} + +// tryParseJSON attempts to decode text as JSON. Returns the parsed +// value (any) and ok=true on success. ok=false on failure or when +// format is not "json". +// +// Why we accept arbitrary JSON shapes (vs requiring an object): the +// extract_entities tool returns objects, but classify returns objects +// with arrays inside. Accepting `any` keeps the helper agnostic to the +// caller's downstream typing. +// +// Tolerance: strips a leading "```json" code fence + matching closing +// fence so the agent can include surrounding markdown without +// breaking parse. The stricter retry prompt explicitly asks for no +// fence; this tolerance is for the first-attempt path. +func tryParseJSON(text, format string) (any, bool) { + if !strings.EqualFold(format, "json") { + return nil, false + } + trimmed := strings.TrimSpace(text) + // Strip optional ```json ... ``` fence. + if strings.HasPrefix(trimmed, "```") { + // Drop opening fence (with or without language tag). + if idx := strings.Index(trimmed, "\n"); idx >= 0 { + trimmed = trimmed[idx+1:] + } + // Drop trailing fence. + if idx := strings.LastIndex(trimmed, "```"); idx >= 0 { + trimmed = trimmed[:idx] + } + trimmed = strings.TrimSpace(trimmed) + } + var parsed any + if err := json.Unmarshal([]byte(trimmed), &parsed); err != nil { + return nil, false + } + return parsed, true +} diff --git a/llmmeta/helper_test.go b/llmmeta/helper_test.go new file mode 100644 index 0000000..acad16f --- /dev/null +++ b/llmmeta/helper_test.go @@ -0,0 +1,282 @@ +package llmmeta + +import ( + "context" + "errors" + "strings" + "sync" + "testing" + + llm "gitea.stevedudenhoeffer.com/steve/majordomo/llm" +) + +// fakeStorage records every MetaCall handed to RecordMetaCall and +// makes them available to tests via the captured slice. +type fakeStorage struct { + mu sync.Mutex + calls []MetaCall + err error +} + +func (f *fakeStorage) RecordMetaCall(_ context.Context, call MetaCall) error { + f.mu.Lock() + defer f.mu.Unlock() + f.calls = append(f.calls, call) + return f.err +} + +func (f *fakeStorage) snapshot() []MetaCall { + f.mu.Lock() + defer f.mu.Unlock() + out := make([]MetaCall, len(f.calls)) + copy(out, f.calls) + return out +} + +// TestCall_TierNotAllowed: a tier not in the allowlist returns the +// rejection without recording a ledger row — the call did not happen. +func TestCall_TierNotAllowed(t *testing.T) { + store := &fakeStorage{} + convars := ConvarReaderFunc(func(_ context.Context) []string { + return []string{"fast"} + }) + h := New(store, convars) + + res, err := h.Call(context.Background(), CallSpec{ + Tier: "thinking", + UserPrompt: "hello", + ToolName: "summarize", + }) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if res.Success { + t.Errorf("expected Success=false") + } + if res.ErrorKind != ErrorKindTierNotAllowed { + t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindTierNotAllowed) + } + if len(store.snapshot()) != 0 { + t.Errorf("expected NO ledger row for tier_not_allowed, got %d", len(store.snapshot())) + } +} + +// TestCall_TierAllowedHappyText: a permitted tier yields a successful +// text call AND records a ledger row. +func TestCall_TierAllowedHappyText(t *testing.T) { + store := &fakeStorage{} + convars := ConvarReaderFunc(func(_ context.Context) []string { + return []string{"fast"} + }) + h := New(store, convars) + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) { + return "summary text here", Tokens{InputTokens: 50, OutputTokens: 12}, nil + }) + defer restore() + + res, err := h.Call(context.Background(), CallSpec{ + Tier: "fast", + UserPrompt: "summarise the following ...", + ToolName: "summarize", + ResponseFormat: "text", + RunID: "run-1", + SkillID: "sk-1", + }) + if err != nil { + t.Fatalf("unexpected err: %v", err) + } + if !res.Success { + t.Errorf("expected Success=true; got ErrorKind=%q", res.ErrorKind) + } + if res.Text != "summary text here" { + t.Errorf("Text = %q, want %q", res.Text, "summary text here") + } + if res.InputTokens != 50 || res.OutputTokens != 12 { + t.Errorf("token counts wrong: in=%d out=%d", res.InputTokens, res.OutputTokens) + } + if got := len(store.snapshot()); got != 1 { + t.Fatalf("expected 1 ledger row, got %d", got) + } + row := store.snapshot()[0] + if !row.Success { + t.Errorf("ledger Success = false, want true") + } + if row.ToolName != "summarize" { + t.Errorf("ledger ToolName = %q", row.ToolName) + } + if row.RunID != "run-1" { + t.Errorf("ledger RunID = %q", row.RunID) + } + if row.InputTokens != 50 || row.OutputTokens != 12 { + t.Errorf("ledger token counts wrong: in=%d out=%d", + row.InputTokens, row.OutputTokens) + } +} + +// TestCall_JSONFirstAttemptParses: JSON-format request, response is +// valid JSON on first try; result.Parsed populated. +func TestCall_JSONFirstAttemptParses(t *testing.T) { + store := &fakeStorage{} + h := New(store, nil) + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) { + return `{"foo":"bar","n":42}`, Tokens{InputTokens: 10, OutputTokens: 5}, nil + }) + defer restore() + + res, _ := h.Call(context.Background(), CallSpec{ + UserPrompt: "extract entities", + ToolName: "extract_entities", + ResponseFormat: "json", + RetryOnMalformedJSON: true, + SkillID: "sk-2", + }) + if !res.Success || res.ErrorKind != "" { + t.Fatalf("expected success, got %+v", res) + } + m, ok := res.Parsed.(map[string]any) + if !ok { + t.Fatalf("Parsed not a map: %T %v", res.Parsed, res.Parsed) + } + if m["foo"] != "bar" { + t.Errorf("Parsed[foo] = %v", m["foo"]) + } +} + +// TestCall_JSONRetryPath: first response is malformed JSON; second +// response (after stricter prompt) parses cleanly. +func TestCall_JSONRetryPath(t *testing.T) { + store := &fakeStorage{} + h := New(store, nil) + calls := 0 + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, prompt string, _ []llm.Option) (string, Tokens, error) { + calls++ + if calls == 1 { + return "Here is your JSON: {oh no I forgot to format it", Tokens{InputTokens: 8, OutputTokens: 12}, nil + } + // Verify stricter prompt prefix appeared on retry. + if !strings.Contains(prompt, "Return ONLY valid JSON") { + t.Errorf("retry prompt missing stricter prefix: %q", prompt) + } + return `{"key":"value"}`, Tokens{InputTokens: 14, OutputTokens: 6}, nil + }) + defer restore() + + res, _ := h.Call(context.Background(), CallSpec{ + UserPrompt: "extract", + ToolName: "extract_entities", + ResponseFormat: "json", + RetryOnMalformedJSON: true, + }) + if !res.Success || res.ErrorKind != "" { + t.Fatalf("expected success, got %+v", res) + } + if calls != 2 { + t.Errorf("expected 2 LLM calls, got %d", calls) + } + m, _ := res.Parsed.(map[string]any) + if m["key"] != "value" { + t.Errorf("Parsed = %v", res.Parsed) + } + // Token counts should reflect both attempts. + if res.InputTokens != 22 || res.OutputTokens != 18 { + t.Errorf("combined tokens wrong: in=%d out=%d", res.InputTokens, res.OutputTokens) + } +} + +// TestCall_JSONRetryFailsTwice: second attempt also fails to parse. +// Surfaces ErrorKind=malformed_json AND keeps Success=true so the +// caller can fall back to result.Text. +func TestCall_JSONRetryFailsTwice(t *testing.T) { + store := &fakeStorage{} + h := New(store, nil) + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) { + return "still not JSON", Tokens{InputTokens: 10, OutputTokens: 4}, nil + }) + defer restore() + + res, _ := h.Call(context.Background(), CallSpec{ + UserPrompt: "extract", + ToolName: "extract_entities", + ResponseFormat: "json", + RetryOnMalformedJSON: true, + }) + if !res.Success { + t.Errorf("expected Success=true (fall-back-to-text), got Success=false") + } + if res.ErrorKind != ErrorKindMalformedJSON { + t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindMalformedJSON) + } + if res.Parsed != nil { + t.Errorf("Parsed = %v, want nil after failed retry", res.Parsed) + } + rows := store.snapshot() + if len(rows) != 1 { + t.Fatalf("expected 1 ledger row, got %d", len(rows)) + } + if !rows[0].Success || rows[0].ErrorKind != ErrorKindMalformedJSON { + t.Errorf("ledger row mismatch: %+v", rows[0]) + } +} + +// TestCall_LLMUnavailable: transport error from the model.Generate +// call is surfaced as ErrorKind=llm_unavailable AND records a ledger +// row. +func TestCall_LLMUnavailable(t *testing.T) { + store := &fakeStorage{} + h := New(store, nil) + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) { + return "", Tokens{}, errors.New("network error") + }) + defer restore() + + res, _ := h.Call(context.Background(), CallSpec{ + UserPrompt: "hi", + ToolName: "summarize", + }) + if res.Success { + t.Errorf("expected Success=false") + } + if res.ErrorKind != ErrorKindLLMUnavailable { + t.Errorf("ErrorKind = %q, want %q", res.ErrorKind, ErrorKindLLMUnavailable) + } + rows := store.snapshot() + if len(rows) != 1 { + t.Fatalf("expected 1 ledger row, got %d", len(rows)) + } +} + +// TestCall_EmptyUserPromptErrors: programmer-error guard. +func TestCall_EmptyUserPromptErrors(t *testing.T) { + h := New(&fakeStorage{}, nil) + _, err := h.Call(context.Background(), CallSpec{ToolName: "summarize"}) + if err == nil { + t.Fatal("expected error for empty user_prompt") + } +} + +// TestCall_JSONWithCodeFenceParses: tolerance for the first-attempt +// response wrapped in a ```json ... ``` fence. The retry path uses a +// stricter prompt; this test pins the first-attempt tolerance so +// callers don't waste a round-trip on a benign formatting wrapper. +func TestCall_JSONWithCodeFenceParses(t *testing.T) { + store := &fakeStorage{} + h := New(store, nil) + restore := SetCompleteForTest(func(_ context.Context, _ llm.Model, _, _ string, _ []llm.Option) (string, Tokens, error) { + return "```json\n{\"x\":1}\n```", Tokens{InputTokens: 5, OutputTokens: 4}, nil + }) + defer restore() + + res, _ := h.Call(context.Background(), CallSpec{ + UserPrompt: "extract", + ToolName: "extract_entities", + ResponseFormat: "json", + RetryOnMalformedJSON: true, + }) + if res.ErrorKind != "" { + t.Errorf("unexpected ErrorKind %q (fenced JSON should parse on first attempt)", res.ErrorKind) + } + m, _ := res.Parsed.(map[string]any) + if m["x"] != float64(1) { + t.Errorf("Parsed[x] = %v, want 1", m["x"]) + } +} diff --git a/llmmeta/main_test.go b/llmmeta/main_test.go new file mode 100644 index 0000000..2cb7eda --- /dev/null +++ b/llmmeta/main_test.go @@ -0,0 +1,21 @@ +package llmmeta + +import ( + "os" + "testing" + "time" + + "gitea.stevedudenhoeffer.com/steve/executus/model" +) + +// TestMain configures a minimal model tier table so the helper's +// model.ParseModelForContext("fast"/"standard") resolves. The actual LLM call +// is stubbed per-test via SetCompleteForTest, so these specs are only parsed +// (anthropic registers with an empty key and errors at call time, not parse). +func TestMain(m *testing.M) { + model.Configure(nil, map[string]string{ + "fast": "anthropic/claude-haiku-4-5", + "standard": "anthropic/claude-sonnet-4-6", + }, time.Minute) + os.Exit(m.Run()) +} diff --git a/model/bench.go b/model/bench.go new file mode 100644 index 0000000..c88c329 --- /dev/null +++ b/model/bench.go @@ -0,0 +1,97 @@ +// Package llms — bench.go: the mort-flavored facade over majordomo's +// health tracker for the `.failover` Discord commands and the failover +// web UI. +// +// Why a facade (vs exposing health.Tracker directly): the admin surfaces +// want the historical shape — a benched-only list with a manual/auto +// flag. majordomo's tracker treats manual benches (Bench) and automatic +// backoffs identically, so the manual marker is kept mort-side. +package model + +import ( + "sync" + "time" +) + +// BenchedModel is one currently-benched model for admin display. +type BenchedModel struct { + // Model is the "provider/model" target key. + Model string + // Until is the end of the bench window. + Until time.Time + // ConsecutiveFails is the failure count since the last success. + ConsecutiveFails int + // Manual reports the bench was placed by an operator (BenchModel) + // rather than the automatic failure threshold. + Manual bool +} + +var ( + manualMu sync.Mutex + manualBenches = map[string]time.Time{} +) + +// ListBenched returns the currently-benched models, manual and automatic, +// from the live health tracker. +func ListBenched() []BenchedModel { + now := time.Now() + pruneManual(now) + + var out []BenchedModel + for _, st := range Health().Snapshot() { + if !st.Until.After(now) { + continue + } + out = append(out, BenchedModel{ + Model: st.Key, + Until: st.Until, + ConsecutiveFails: st.ConsecutiveFailures, + Manual: isManual(st.Key, st.Until), + }) + } + return out +} + +// BenchModel manually benches a model spec until the given time. The +// chain executor skips benched targets until the window expires (or +// UnbenchModel clears it). +func BenchModel(model string, until time.Time) { + Health().Bench(model, until) + manualMu.Lock() + manualBenches[model] = until + manualMu.Unlock() +} + +// UnbenchModel clears the bench on a model. Returns true when the model +// was actually benched. +func UnbenchModel(model string) bool { + now := time.Now() + wasBenched := Health().BackedOffUntil(model).After(now) + Health().Unbench(model) + manualMu.Lock() + delete(manualBenches, model) + manualMu.Unlock() + return wasBenched +} + +// isManual reports whether the bench window for key matches a manual +// bench placed via BenchModel. An automatic backoff that outlives the +// manual window supersedes the marker. +func isManual(key string, until time.Time) bool { + manualMu.Lock() + defer manualMu.Unlock() + manualUntil, ok := manualBenches[key] + return ok && !until.After(manualUntil) +} + +// pruneManual drops expired manual markers so the map can't grow +// unbounded across a long uptime. +func pruneManual(now time.Time) { + manualMu.Lock() + defer manualMu.Unlock() + for k, until := range manualBenches { + if !until.After(now) { + delete(manualBenches, k) + } + } +} diff --git a/model/call.go b/model/call.go new file mode 100644 index 0000000..30bb964 --- /dev/null +++ b/model/call.go @@ -0,0 +1,415 @@ +package model + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "log/slog" + "runtime/debug" + "strings" + "time" + + majordomo "gitea.stevedudenhoeffer.com/steve/majordomo" + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "github.com/google/uuid" +) + +// CallResult captures the result of a single tool call execution. +type CallResult struct { + Name string + Arguments string + Result string + Error error +} + +// instrumentedModel decorates a parsed model so every successful Generate +// records token usage to the usage sink automatically. This is the +// single usage chokepoint: ANY call through a model from +// ParseModelRequest / ParseModelForContext is accounted, whether it goes +// through the helpers in this file, the agent loop, or a direct +// model.Generate at a call site. +// +// IMPORTANT: do not call RecordUsage on responses from a parsed model — +// that would double-count. RecordUsage exists for models obtained outside +// this package. +type instrumentedModel struct { + inner llm.Model +} + +func (m *instrumentedModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { + resp, err := m.inner.Generate(ctx, req, opts...) + if err == nil && resp != nil { + recordUsage(ctx, resp) + } + return resp, err +} + +func (m *instrumentedModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { + return m.inner.Stream(ctx, req, opts...) +} + +func (m *instrumentedModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } + +// CallAndExecute sends messages to the model with a toolbox, executes any +// tool calls, and returns the results. It performs a single round of +// generation + tool execution (no looping) — multi-step loops belong to +// the agent package. +func CallAndExecute(ctx context.Context, model llm.Model, systemPrompt string, toolbox *llm.Toolbox, messages []llm.Message, opts ...llm.Option) ([]CallResult, string, error) { + req := llm.Request{System: systemPrompt, Messages: messages} + + allOpts := make([]llm.Option, 0, len(opts)+1) + if toolbox != nil { + allOpts = append(allOpts, llm.WithToolbox(toolbox)) + } + allOpts = append(allOpts, opts...) + + startTime := time.Now() + resp, err := model.Generate(ctx, req, allOpts...) + if err != nil { + recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, nil, nil, startTime, err) + return nil, "", fmt.Errorf("completion failed: %w", err) + } + + if len(resp.ToolCalls) == 0 || toolbox == nil { + recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, nil, startTime, nil) + return nil, resp.Text(), nil + } + + var results []CallResult + for _, call := range resp.ToolCalls { + tr := toolbox.Execute(ctx, call) + cr := CallResult{ + Name: call.Name, + Arguments: string(call.Arguments), + Result: tr.Content, + } + if tr.IsError { + cr.Error = errors.New(tr.Content) + } + results = append(results, cr) + } + + recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, results, startTime, nil) + + return results, resp.Text(), nil +} + +// GenerateWith sends messages to the model with an optional system prompt and +// returns structured output parsed into T. T must be a struct. Uses +// majordomo's native structured output (response schema derived from T). +func GenerateWith[T any](ctx context.Context, model llm.Model, systemPrompt string, messages []llm.Message, opts ...llm.Option) (T, error) { + req := llm.Request{System: systemPrompt, Messages: messages} + + startTime := time.Now() + + // Capture the raw response so the trace span carries usage and the + // concrete serving model even though majordomo.Generate only returns T. + capture := &captureModel{inner: model} + result, err := majordomo.Generate[T](ctx, capture, req, opts...) + + resolvedModel := resolvedModelName(ctx, capture.resp) + + if tracingEnabled(ctx) { + span := Span{ + SpanID: uuid.New().String(), + TraceID: traceIDFromContext(ctx), + Model: resolvedModel, + SystemPrompt: systemPrompt, + Messages: marshalMessages(messages), + DurationMs: time.Since(startTime).Milliseconds(), + StartedAt: startTime, + CompletedAt: time.Now(), + CreatedAt: time.Now(), + } + if capture.resp != nil { + span.InputTokens = capture.resp.Usage.InputTokens + span.OutputTokens = capture.resp.Usage.OutputTokens + } + if err != nil { + span.Error = err.Error() + // Structured-output failure: log loudly so operators can chase + // down a regression (e.g. a model returning prose or fenced + // JSON the decoder rejects) from the trace span alone. The + // error string includes the failing field path on decode + // errors. + if isStructuredOutputParseError(err) { + slog.Warn("llms.GenerateWith: structured-output parse failure", + "model", resolvedModel, + "span_id", span.SpanID, + "trace_id", span.TraceID, + "err", err.Error(), + ) + } + } else { + b, _ := json.Marshal(result) + span.ResponseText = string(b) + } + traceSink.WriteSpan(span) + } else if err != nil && isStructuredOutputParseError(err) { + // Tracing disabled: slog.Warn is the only breadcrumb operators get. + slog.Warn("llms.GenerateWith: structured-output parse failure (no trace span)", + "model", resolvedModel, + "err", err.Error(), + ) + } + + return result, err +} + +// captureModel records the last successful response so wrappers that +// only see the decoded result (majordomo.Generate) can still attribute +// usage and tracing. +type captureModel struct { + inner llm.Model + resp *llm.Response +} + +func (m *captureModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { + resp, err := m.inner.Generate(ctx, req, opts...) + if err == nil { + m.resp = resp + } + return resp, err +} + +func (m *captureModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { + return m.inner.Stream(ctx, req, opts...) +} + +func (m *captureModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } + +// isStructuredOutputParseError reports whether err looks like a +// structured-output failure from majordomo.Generate — either the decode +// path ("decode structured response") or the empty-response path +// ("structured response from ... is empty"). Used to gate the loud +// slog.Warn so transport errors don't get tagged as parse failures. +func isStructuredOutputParseError(err error) bool { + if err == nil { + return false + } + s := err.Error() + return strings.Contains(s, "decode structured response") || + strings.Contains(s, "structured response from") +} + +// SimpleCall sends a single user message to the model with an optional system +// prompt and returns the text response. No tools involved. +func SimpleCall(ctx context.Context, model llm.Model, systemPrompt string, userMessage string, opts ...llm.Option) (string, error) { + msgs := []llm.Message{llm.UserText(userMessage)} + + startTime := time.Now() + resp, err := model.Generate(ctx, llm.Request{System: systemPrompt, Messages: msgs}, opts...) + if err != nil { + recordSpanFromWrapper(ctx, systemPrompt, msgs, nil, nil, nil, startTime, err) + return "", fmt.Errorf("completion failed: %w", err) + } + + recordSpanFromWrapper(ctx, systemPrompt, msgs, nil, resp, nil, startTime, nil) + + return resp.Text(), nil +} + +// RecordUsage records LLM token usage from a successful Generate response. +// +// ONLY call this for models obtained outside this package: models returned +// by ParseModelRequest / ParseModelForContext record usage automatically on +// every Generate, and calling RecordUsage on their responses double-counts. +func RecordUsage(ctx context.Context, resp llm.Response) { + recordUsage(ctx, &resp) +} + +// RecordSpan records a trace span for a direct model.Generate() call. +// Call this from modules that invoke model.Generate() directly when they +// want the call traced (usage is already recorded automatically for +// parsed models). +func RecordSpan(ctx context.Context, systemPrompt string, messages []llm.Message, toolbox *llm.Toolbox, resp *llm.Response, callResults []CallResult, startTime time.Time, callErr error) { + recordSpanFromWrapper(ctx, systemPrompt, messages, toolbox, resp, callResults, startTime, callErr) +} + +// recordUsage records token usage for one response. The model is +// attributed from the response itself when possible (resp.Model names +// the chain element that actually served the request — more precise than +// the requested spec), falling back to the context attribution set by +// ParseModelForContext. +func recordUsage(ctx context.Context, resp *llm.Response) { + if usageSink == nil || resp == nil { + return + } + u := resp.Usage + if u.InputTokens == 0 && u.OutputTokens == 0 { + return + } + model := resolvedModelName(ctx, resp) + if model == "unknown" || model == "" { + tool := toolFromContext(ctx) + if tool == "unknown" { + slog.Warn("model usage: recording with both unknown model and tool", + "user", userFromContext(ctx), "stack", string(debug.Stack())) + } else { + slog.Warn("model usage: recording with unknown model — caller should set model.WithModel or use model.ParseModelForContext", + "tool", tool, "user", userFromContext(ctx)) + } + } + usageSink.Record(ctx, model, u.InputTokens, u.OutputTokens, u.CacheReadTokens, u.CacheWriteTokens) +} + +// resolvedModelName picks the usage/trace attribution name: the serving +// model from the response when present ("provider/model" → "model"), +// else the context's requested model resolved through the tier table. +func resolvedModelName(ctx context.Context, resp *llm.Response) string { + if resp != nil && resp.Model != "" { + name := resp.Model + if idx := strings.Index(name, "/"); idx >= 0 { + name = name[idx+1:] + } + return name + } + return ResolveModelName(modelFromContext(ctx)) +} + +// tracingEnabled returns true if there's an active trace and tracing is enabled. +func tracingEnabled(ctx context.Context) bool { + if traceSink == nil { + return false + } + return traceIDFromContext(ctx) != "" +} + +// recordSpanFromWrapper records a trace span if tracing is active. +func recordSpanFromWrapper(ctx context.Context, systemPrompt string, messages []llm.Message, toolbox *llm.Toolbox, resp *llm.Response, callResults []CallResult, startTime time.Time, callErr error) { + if !tracingEnabled(ctx) { + return + } + + now := time.Now() + + span := Span{ + SpanID: uuid.New().String(), + TraceID: traceIDFromContext(ctx), + Model: resolvedModelName(ctx, resp), + SystemPrompt: systemPrompt, + Messages: marshalMessages(messages), + ToolDefinitions: marshalToolDefs(toolbox), + DurationMs: now.Sub(startTime).Milliseconds(), + StartedAt: startTime, + CompletedAt: now, + CreatedAt: now, + } + + if callErr != nil { + span.Error = callErr.Error() + } + + if resp != nil { + span.ResponseText = resp.Text() + span.InputTokens = resp.Usage.InputTokens + span.OutputTokens = resp.Usage.OutputTokens + if len(resp.ToolCalls) > 0 { + span.ResponseToolCalls = marshalToolCalls(resp.ToolCalls) + } + } + + if len(callResults) > 0 { + span.ToolResults = marshalCallResults(callResults) + } + + traceSink.WriteSpan(span) +} + +// --- Serialization helpers --- + +type jsonMessage struct { + Role string `json:"role"` + Text string `json:"text,omitempty"` + ToolCallID string `json:"tool_call_id,omitempty"` + ImageCount int `json:"image_count,omitempty"` +} + +func marshalMessages(msgs []llm.Message) string { + out := make([]jsonMessage, 0, len(msgs)) + for _, m := range msgs { + jm := jsonMessage{ + Role: string(m.Role), + Text: m.Text(), + } + for _, p := range m.Parts { + if _, ok := p.(llm.ImagePart); ok { + jm.ImageCount++ + } + } + if len(m.ToolResults) > 0 { + jm.ToolCallID = m.ToolResults[0].ID + } + out = append(out, jm) + } + b, _ := json.Marshal(out) + return string(b) +} + +type jsonToolCall struct { + ID string `json:"id"` + Name string `json:"name"` + Arguments string `json:"arguments"` +} + +func marshalToolCalls(calls []llm.ToolCall) string { + out := make([]jsonToolCall, 0, len(calls)) + for _, c := range calls { + out = append(out, jsonToolCall{ + ID: c.ID, + Name: c.Name, + Arguments: string(c.Arguments), + }) + } + b, _ := json.Marshal(out) + return string(b) +} + +type jsonCallResult struct { + Name string `json:"name"` + Arguments string `json:"arguments"` + Result string `json:"result"` + Error string `json:"error,omitempty"` +} + +func marshalCallResults(results []CallResult) string { + out := make([]jsonCallResult, 0, len(results)) + for _, r := range results { + jr := jsonCallResult{ + Name: r.Name, + Arguments: r.Arguments, + Result: r.Result, + } + if r.Error != nil { + jr.Error = r.Error.Error() + } + out = append(out, jr) + } + b, _ := json.Marshal(out) + return string(b) +} + +type jsonToolDef struct { + Name string `json:"name"` + Description string `json:"description"` +} + +func marshalToolDefs(tb *llm.Toolbox) string { + if tb == nil { + return "" + } + tools := tb.Tools() + if len(tools) == 0 { + return "" + } + out := make([]jsonToolDef, 0, len(tools)) + for _, t := range tools { + out = append(out, jsonToolDef{ + Name: t.Name, + Description: t.Description, + }) + } + b, _ := json.Marshal(out) + return string(b) +} diff --git a/model/cloud_sync.go b/model/cloud_sync.go new file mode 100644 index 0000000..ae28512 --- /dev/null +++ b/model/cloud_sync.go @@ -0,0 +1,453 @@ +// V15.4 — Ollama Cloud dynamic context-length sync. +// +// Why: the static map in context_limits.go has to be hand-maintained +// for every new Ollama Cloud model. Cloud ships new models monthly, +// and a missing entry silently disables compaction for runs on that +// model (compactionThresholdForModel returns 0 on MaxContextTokens +// miss). Dynamic sync removes the maintenance burden and means new +// cloud models work out-of-the-box. +// +// How: at boot, mort kicks off a CloudOllamaLimitCache.RefreshAll in a +// background goroutine. RefreshAll calls /api/tags to list every +// available cloud model, then concurrently calls /api/show for each +// to extract `.context_length` from the response's model_info +// map. The cache is consulted by the executor's +// compactionThresholdForModel via the cache-aware +// MaxContextTokensWithCache helper. +// +// Periodic refresh: a daily ticker re-runs RefreshAll so newly +// released models surface without a mort restart. The interval is +// intentionally not configurable — cloud model context lengths don't +// change for a given tag (only the tag pointer can move, e.g. :cloud +// → larger model), so daily is conservative. + +package model + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "strings" + "sync" + "time" +) + +// defaultCloudEndpoint is the public Ollama Cloud base URL. Override +// in tests via NewCloudOllamaLimitCache's endpoint arg. +const defaultCloudEndpoint = "https://ollama.com" + +// CloudOllamaLimitCache holds context-length values for Ollama Cloud +// models, populated dynamically via /api/tags + /api/show. Construct +// with NewCloudOllamaLimitCache. Safe for concurrent use. +// +// Empty when OLLAMA_API_KEY is unset — Refresh returns a clear error +// and the cache stays empty. Lookups return (0, false) and callers +// fall back to the static map / disabled compaction. +type CloudOllamaLimitCache struct { + mu sync.RWMutex + limit map[string]int + negative map[string]time.Time // model → fetch-failure time (for TTL) + + endpoint string + apiKey string + httpClient *http.Client + + // refreshConcurrency caps the number of concurrent /api/show calls + // during RefreshAll. Default 8 — enough to finish a ~50-model + // catalog in well under a minute without hammering Cloud. + refreshConcurrency int + + // negativeTTL is how long a /api/show miss is cached before we + // retry. Prevents hammering Cloud on a typo or recently-removed + // model. Default 10 minutes. + negativeTTL time.Duration +} + +// NewCloudOllamaLimitCache constructs a fresh cache. apiKey can be +// empty — RefreshAll then returns an error and the cache stays empty. +// endpoint defaults to https://ollama.com when empty. httpClient +// defaults to a 15s-timeout client. +func NewCloudOllamaLimitCache(endpoint, apiKey string, httpClient *http.Client) *CloudOllamaLimitCache { + if strings.TrimSpace(endpoint) == "" { + endpoint = defaultCloudEndpoint + } + endpoint = strings.TrimRight(endpoint, "/") + if httpClient == nil { + httpClient = &http.Client{Timeout: 15 * time.Second} + } + return &CloudOllamaLimitCache{ + limit: make(map[string]int), + negative: make(map[string]time.Time), + endpoint: endpoint, + apiKey: apiKey, + httpClient: httpClient, + refreshConcurrency: 8, + negativeTTL: 10 * time.Minute, + } +} + +// SetNegativeTTL overrides the negative-cache lifetime. Tests use this +// to control retry behaviour without sleeping. +func (c *CloudOllamaLimitCache) SetNegativeTTL(d time.Duration) { + if c == nil || d < 0 { + return + } + c.mu.Lock() + c.negativeTTL = d + c.mu.Unlock() +} + +// Lookup returns the cached context length for an Ollama Cloud model +// name (e.g. "qwen3.5:cloud", "qwen3-coder:480b"). Returns (0, false) +// on miss. Lookup never makes HTTP calls — it's the hot path consulted +// by the executor before every run. +// +// modelName accepts either the bare model:tag form or the prefixed +// "ollama-cloud/model:tag" form; the prefix is stripped. +func (c *CloudOllamaLimitCache) Lookup(modelName string) (int, bool) { + if c == nil { + return 0, false + } + key := stripCloudPrefix(modelName) + if key == "" { + return 0, false + } + c.mu.RLock() + defer c.mu.RUnlock() + v, ok := c.limit[key] + return v, ok +} + +// Size returns the number of cached entries. Useful for logging / +// health checks. +func (c *CloudOllamaLimitCache) Size() int { + if c == nil { + return 0 + } + c.mu.RLock() + defer c.mu.RUnlock() + return len(c.limit) +} + +// LookupOrFetch returns the cached context length OR, on miss, makes a +// single /api/show call to populate the cache. Negative results +// (model not found, /api/show returns no context_length) are cached +// for negativeTTL to prevent hammering Cloud on a typo. Returns +// (0, false) when the model is genuinely unknown and (size, true) on +// any successful resolve. +// +// Why this exists: Ollama Cloud's /api/tags lists canonical model +// names only (e.g. "qwen3.5:397b") but accepts aliases on /api/show +// (e.g. "qwen3.5:cloud" → same 397b model). The boot-time RefreshAll +// only sees the canonical names, so common aliases miss the cache. +// LookupOrFetch fills the gap. +// +// The cache is therefore self-healing: any unknown model gets one +// live /api/show call, the result lands in the cache, and subsequent +// runs hit immediately. Periodic RefreshAll overwrites everything +// with the canonical-name results but additionally-fetched aliases +// linger as positive entries. +func (c *CloudOllamaLimitCache) LookupOrFetch(ctx context.Context, modelName string) (int, bool) { + if c == nil { + return 0, false + } + key := stripCloudPrefix(modelName) + if key == "" { + return 0, false + } + // Fast path: positive hit. + c.mu.RLock() + if v, ok := c.limit[key]; ok { + c.mu.RUnlock() + return v, true + } + // Negative cache check. + if t, ok := c.negative[key]; ok && time.Since(t) < c.negativeTTL { + c.mu.RUnlock() + return 0, false + } + c.mu.RUnlock() + // No API key configured → can't fetch. Don't write a negative + // entry (when the key gets configured later we want the next call + // to re-try immediately). + if strings.TrimSpace(c.apiKey) == "" { + return 0, false + } + // Slow path: live /api/show. + n, err := c.fetchContextLength(ctx, key) + if err != nil || n <= 0 { + slog.Debug("cloud limit cache: lazy fetch miss", + "model", key, "err", err) + c.mu.Lock() + c.negative[key] = time.Now() + c.mu.Unlock() + return 0, false + } + c.set(key, n) + slog.Info("cloud limit cache: lazy fetch hit", "model", key, "context_length", n) + return n, true +} + +// set stores a context length. n <= 0 is a no-op. +func (c *CloudOllamaLimitCache) set(modelName string, n int) { + if c == nil || n <= 0 { + return + } + key := stripCloudPrefix(modelName) + if key == "" { + return + } + c.mu.Lock() + c.limit[key] = n + c.mu.Unlock() +} + +// RefreshAll queries /api/tags then concurrently calls /api/show for +// every listed model, populating the cache. Returns the number of +// models successfully cached and the first error encountered (a +// /api/tags failure aborts; individual /api/show failures are logged +// but don't abort the whole refresh). +// +// Safe to call repeatedly. Cache entries are overwritten with the +// fresh values; entries for models that have been removed from Cloud +// are NOT pruned (cheap to keep; pruning risks dropping an entry just +// before a run that needs it). +func (c *CloudOllamaLimitCache) RefreshAll(ctx context.Context) (int, error) { + if c == nil { + return 0, fmt.Errorf("cloud limit cache: nil receiver") + } + if strings.TrimSpace(c.apiKey) == "" { + return 0, fmt.Errorf("cloud limit cache: OLLAMA_API_KEY unset") + } + tags, err := c.fetchTags(ctx) + if err != nil { + return 0, fmt.Errorf("cloud limit cache: /api/tags: %w", err) + } + + concurrency := c.refreshConcurrency + if concurrency <= 0 { + concurrency = 8 + } + sem := make(chan struct{}, concurrency) + var wg sync.WaitGroup + var ( + mu sync.Mutex + success int + ) + for _, name := range tags { + name := name + wg.Add(1) + sem <- struct{}{} + go func() { + defer wg.Done() + defer func() { <-sem }() + ctxLen, ferr := c.fetchContextLength(ctx, name) + if ferr != nil { + slog.Debug("cloud limit cache: /api/show miss", + "model", name, "err", ferr) + return + } + c.set(name, ctxLen) + mu.Lock() + success++ + mu.Unlock() + }() + } + wg.Wait() + slog.Info("cloud limit cache: refresh complete", + "models_total", len(tags), "cached", success) + return success, nil +} + +// StartPeriodicRefresh runs RefreshAll once immediately, then on every +// interval tick. Cancellation via ctx stops the loop. Logs each +// outcome; never returns an error to the caller (this is a background +// task — failures are warnings, not show-stoppers). +// +// Typical usage: a goroutine spawned at mort boot. +// +// go cache.StartPeriodicRefresh(ctx, 24*time.Hour) +func (c *CloudOllamaLimitCache) StartPeriodicRefresh(ctx context.Context, interval time.Duration) { + if c == nil { + return + } + if interval <= 0 { + interval = 24 * time.Hour + } + doOne := func() { + n, err := c.RefreshAll(ctx) + if err != nil { + slog.Warn("cloud limit cache: refresh failed", + "err", err, "cached_size", c.Size()) + return + } + slog.Info("cloud limit cache: refreshed", + "newly_cached_or_updated", n, "cached_size", c.Size()) + } + doOne() + t := time.NewTicker(interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + doOne() + } + } +} + +// fetchTags calls GET /api/tags and returns the model names. +func (c *CloudOllamaLimitCache) fetchTags(ctx context.Context) ([]string, error) { + url := c.endpoint + "/api/tags" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, err + } + c.applyAuth(req) + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + if resp.StatusCode/100 != 2 { + return nil, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(body, 400)) + } + var parsed struct { + Models []struct { + Name string `json:"name"` + Model string `json:"model"` + } `json:"models"` + } + if err := json.Unmarshal(body, &parsed); err != nil { + return nil, fmt.Errorf("parse /api/tags: %w", err) + } + out := make([]string, 0, len(parsed.Models)) + for _, m := range parsed.Models { + name := m.Name + if name == "" { + name = m.Model + } + if name == "" { + continue + } + out = append(out, name) + } + return out, nil +} + +// fetchContextLength calls POST /api/show for a model and extracts +// the largest *.context_length value from model_info. Returns the +// length and nil on success; (0, err) on any failure. +// +// Why "largest" rather than family-keyed: the family field in the +// /api/show response is sometimes empty or doesn't match the +// model_info key prefix exactly (Ollama Cloud returns the +// architecture as the prefix, which usually but not always matches +// `family`). Scanning for any `*.context_length` is robust. +func (c *CloudOllamaLimitCache) fetchContextLength(ctx context.Context, modelName string) (int, error) { + url := c.endpoint + "/api/show" + body, _ := json.Marshal(map[string]string{"name": modelName}) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)) + if err != nil { + return 0, err + } + req.Header.Set("Content-Type", "application/json") + c.applyAuth(req) + resp, err := c.httpClient.Do(req) + if err != nil { + return 0, err + } + defer resp.Body.Close() + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return 0, err + } + if resp.StatusCode/100 != 2 { + return 0, fmt.Errorf("status %d: %s", resp.StatusCode, truncate(respBody, 400)) + } + n, err := parseContextLengthJSON(respBody) + if err != nil { + return 0, err + } + return n, nil +} + +// parseContextLengthJSON extracts the largest `*.context_length` int +// from an /api/show response body. Exported-ish (lowercase but tested +// in the same package) so the unit test can exercise it without +// spinning up an httptest server. +func parseContextLengthJSON(body []byte) (int, error) { + var parsed struct { + ModelInfo map[string]any `json:"model_info"` + } + if err := json.Unmarshal(body, &parsed); err != nil { + return 0, fmt.Errorf("parse: %w", err) + } + best := 0 + for k, v := range parsed.ModelInfo { + if !strings.HasSuffix(k, ".context_length") { + continue + } + n := toInt(v) + if n > best { + best = n + } + } + if best <= 0 { + return 0, fmt.Errorf("no context_length in model_info") + } + return best, nil +} + +// toInt coerces a JSON-decoded value to int. Handles float64 (the +// json default) and json.Number; returns 0 for anything else. +func toInt(v any) int { + switch x := v.(type) { + case float64: + return int(x) + case int: + return x + case int64: + return int(x) + case json.Number: + if n, err := x.Int64(); err == nil { + return int(n) + } + } + return 0 +} + +// applyAuth sets the Bearer token when an API key is configured. +func (c *CloudOllamaLimitCache) applyAuth(req *http.Request) { + if strings.TrimSpace(c.apiKey) == "" { + return + } + req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(c.apiKey)) +} + +// stripCloudPrefix strips an "ollama-cloud/" prefix (and surrounding +// whitespace). Returns the bare model:tag form. +func stripCloudPrefix(s string) string { + s = strings.TrimSpace(s) + if strings.HasPrefix(s, "ollama-cloud/") { + s = s[len("ollama-cloud/"):] + } + return s +} + +// truncate caps a byte slice for error messages. +func truncate(b []byte, n int) string { + if len(b) <= n { + return string(b) + } + return string(b[:n]) + "...(truncated)" +} diff --git a/model/context_limits.go b/model/context_limits.go new file mode 100644 index 0000000..c17b5a1 --- /dev/null +++ b/model/context_limits.go @@ -0,0 +1,224 @@ +// V15.2 — per-model context-window limits. +// +// Why: agents need to know when they're about to blow the model's +// max-input cap so they can compact stale tool results out of the +// message history. Pre-15.2 the agent loop had no awareness; a long +// research run that accumulated dozens of HTTP tool results would +// hit Ollama's HTTP 400 "prompt is too long" or Anthropic's similar +// error mid-run with no graceful degradation. +// +// Coverage: +// - Anthropic Claude 4.x (200K default; 1M when the model ID +// includes the "[1m]" suffix per llms.tier reload conventions) +// - OpenAI GPT-4.x / o-series (128K) +// - Gemini 2.x (1M-2M, model-specific) +// - Ollama Cloud (model-specific; hardcoded per known model) +// - Local Ollama: queries `/api/show` once at first use, caches +// +// Returns (0, false) for unknown models — callers should treat +// "unknown" as "don't budget" (the agent's existing iteration cap + +// timeout are the fallback safety nets). + +package model + +import ( + "context" + "strings" + "sync" +) + +// MaxContextTokens returns the model's max INPUT context-window size +// in tokens. Output / response tokens are NOT included — most models +// share input + output budget but cap them separately, and the practical +// concern is "how big can my prompt get before the model rejects". +// +// modelID accepts both the bare model name (`claude-sonnet-4-6`) and +// the prefixed form (`anthropic/claude-sonnet-4-6` or +// `ollama-cloud/qwen3-coder:480b`). The prefix is stripped before lookup. +// +// Returns (limit, true) on a known model; (0, false) otherwise. +// +// This function is pure (no I/O). For Ollama Cloud models that aren't +// in the static map, use MaxContextTokensWithCache which consults a +// CloudOllamaLimitCache populated at boot from /api/tags + /api/show. +func MaxContextTokens(modelID string) (int, bool) { + id := normalizeModelID(modelID) + if v, ok := staticContextLimits[id]; ok { + return v, true + } + // Anthropic 1M-context variant marker. Mort's llms tier system + // uses a `[1m]` suffix on the model ID (e.g. + // `claude-opus-4-7[1m]`) to opt into Anthropic's 1M beta context. + if strings.HasSuffix(id, "[1m]") { + return 1_000_000, true + } + // Local-ollama dynamic lookup is wired separately so it can + // query the daemon's /api/show endpoint. The static map covers + // known cloud models. + return 0, false +} + +// MaxContextTokensWithCache is the cache-aware variant of +// MaxContextTokens. It tries the static map first; on miss, if the +// model is an Ollama Cloud spec (the `ollama-cloud/` prefix), it +// consults the supplied CloudOllamaLimitCache. Pass nil cache for +// static-only behaviour (equivalent to MaxContextTokens). +// +// This function never makes HTTP calls — the cache must be +// pre-populated (typically via cache.RefreshAll at boot). Callers in +// the hot path can rely on a single map lookup per call. Prefer +// MaxContextTokensResolving when a context is available — it makes a +// single /api/show call to fill the cache on miss, which is essential +// for Cloud aliases that /api/tags doesn't enumerate (e.g. :cloud). +func MaxContextTokensWithCache(modelID string, cloud *CloudOllamaLimitCache) (int, bool) { + if v, ok := MaxContextTokens(modelID); ok { + return v, true + } + if cloud == nil { + return 0, false + } + // Only ollama-cloud/* models are eligible for the cache. + id := strings.TrimSpace(modelID) + if !strings.HasPrefix(id, "ollama-cloud/") { + // Also allow bare model:tag form when the caller has already + // stripped the prefix (some test paths). + if strings.Contains(id, "/") { + return 0, false + } + } + return cloud.Lookup(id) +} + +// MaxContextTokensResolving is the cache-aware variant that ALSO +// performs a live /api/show fetch on cache miss (with negative caching +// to prevent thrash). Use this in run-setup paths where one HTTP call +// per unseen model is acceptable — typically the skill executor's +// compaction threshold computation. The fetched result is cached for +// future calls, so subsequent runs hit the in-memory map. +// +// Falls back to the static-only path when the model isn't an +// ollama-cloud/* spec or cache is nil. ctx cancellation aborts the +// fetch and returns (0, false) without writing a negative entry. +func MaxContextTokensResolving(ctx context.Context, modelID string, cloud *CloudOllamaLimitCache) (int, bool) { + if v, ok := MaxContextTokens(modelID); ok { + return v, true + } + if cloud == nil { + return 0, false + } + id := strings.TrimSpace(modelID) + if !strings.HasPrefix(id, "ollama-cloud/") { + if strings.Contains(id, "/") { + return 0, false + } + } + return cloud.LookupOrFetch(ctx, id) +} + +// normalizeModelID strips provider prefix and reasoning suffix so a +// lookup keyed on the base name works regardless of caller form. +// +// Examples: +// - "anthropic/claude-sonnet-4-6" → "claude-sonnet-4-6" +// - "ollama-cloud/qwen3-coder:480b" → "qwen3-coder:480b" +// - "claude-opus-4-7:high" → "claude-opus-4-7" +func normalizeModelID(id string) string { + id = strings.TrimSpace(id) + if idx := strings.Index(id, "/"); idx >= 0 { + id = id[idx+1:] + } + // Strip :low/:medium/:high reasoning effort suffix used by some + // OpenAI / Anthropic clients. + for _, suffix := range []string{":low", ":medium", ":high"} { + if strings.HasSuffix(id, suffix) { + id = id[:len(id)-len(suffix)] + break + } + } + return id +} + +// staticContextLimits is the source of truth for known cloud models. +// Add new entries when adding a model to the llms tier system. +// +// CRITICAL: keep these in sync with the actual provider docs. A wrong +// number here causes EITHER premature compaction (too low, degrades +// agent quality unnecessarily) OR HTTP 400 mid-run (too high). The +// 410K-token failure on `qwen3-coder:480b` is the kind of bug a +// mistyped value would reintroduce. +var staticContextLimits = map[string]int{ + // Anthropic Claude 4.x — default 200K input. 1M variant via + // `[1m]` suffix handled in MaxContextTokens above. + "claude-opus-4-7": 200_000, + "claude-opus-4-6": 200_000, + "claude-opus-4-5": 200_000, + "claude-sonnet-4-6": 200_000, + "claude-sonnet-4-5": 200_000, + "claude-haiku-4-5": 200_000, + "claude-haiku-4-5-20251001": 200_000, + + // OpenAI GPT-4.x / o-series — 128K input. + "gpt-4o": 128_000, + "gpt-4o-mini": 128_000, + "gpt-4-turbo": 128_000, + "o1": 200_000, + "o1-mini": 128_000, + "o3-mini": 200_000, + "gpt-5": 400_000, + "gpt-5-mini": 400_000, + + // Gemini — varies dramatically by model. + "gemini-2.5-pro": 2_000_000, + "gemini-2.5-flash": 1_000_000, + "gemini-2.5-flash-lite": 1_000_000, + "gemini-1.5-pro": 2_000_000, + "gemini-1.5-flash": 1_000_000, + + // Ollama Cloud (turbo). Limits per https://ollama.com/cloud/models + // — verified against the Ollama API show output for each model. + // Update when Ollama publishes new models or extends contexts. + "qwen3-coder:480b": 262_144, // 262K — matches the v15.2 trace + "qwen3:235b": 262_144, + "qwen3:32b": 131_072, + "qwen2.5:72b": 131_072, + "gpt-oss:120b": 131_072, + "gpt-oss:20b": 131_072, + "deepseek-v3.1:671b": 131_072, + "glm-4.6:355b": 131_072, + "kimi-k2:1t": 262_144, + "llama4:scout": 10_000_000, // Llama 4 Scout claims 10M + "llama4:maverick": 1_000_000, +} + +// LocalOllamaLimitCache holds the resolved /api/show context_length per +// local-ollama model. Populated on first lookup; never invalidated +// (changing num_ctx requires an ollama restart anyway). Process-wide, +// no per-tenant scoping needed. +type LocalOllamaLimitCache struct { + mu sync.RWMutex + limit map[string]int +} + +// NewLocalOllamaLimitCache constructs a fresh cache. +func NewLocalOllamaLimitCache() *LocalOllamaLimitCache { + return &LocalOllamaLimitCache{limit: make(map[string]int)} +} + +// Get returns the cached limit or (0, false) when unseen. The caller +// is expected to follow up with a lookup against the live daemon. +func (c *LocalOllamaLimitCache) Get(model string) (int, bool) { + c.mu.RLock() + defer c.mu.RUnlock() + v, ok := c.limit[model] + return v, ok +} + +// Set records a resolved limit. Idempotent; no-op when value is <= 0. +func (c *LocalOllamaLimitCache) Set(model string, n int) { + if n <= 0 { + return + } + c.mu.Lock() + defer c.mu.Unlock() + c.limit[model] = n +} diff --git a/model/inversion_test.go b/model/inversion_test.go new file mode 100644 index 0000000..ea2c4b5 --- /dev/null +++ b/model/inversion_test.go @@ -0,0 +1,97 @@ +package model + +import ( + "testing" + "time" +) + +// mapSource is a tiny config.Source for tests: a key->value map, defaults +// returned for misses. +type mapSource map[string]string + +func (m mapSource) String(k, d string) string { + if v, ok := m[k]; ok { + return v + } + return d +} +func (m mapSource) Int(string, int) int { panic("unused") } +func (m mapSource) Float(string, float64) float64 { panic("unused") } +func (m mapSource) Bool(string, bool) bool { panic("unused") } + +// TestConfigureTierResolution covers the convar->config.Source inversion: the +// host supplies a tier table (names + fallbacks) and a live config source; the +// config value overrides the fallback, and an absent key falls back. +func TestConfigureTierResolution(t *testing.T) { + Configure( + mapSource{"model.tier.fast": "anthropic/claude-haiku-4-5"}, + map[string]string{"fast": "openai/gpt-4o-mini", "thinking": "anthropic/claude-opus-4-8"}, + time.Minute, + ) + defer Configure(nil, nil, 0) // reset package global + + if !IsTierName("fast") || !IsTierName("thinking") { + t.Fatal("configured tiers should be registered") + } + if IsTierName("nope") { + t.Fatal("unknown tier must not report as a tier") + } + if names := TierNames(); len(names) != 2 || names[0] != "fast" || names[1] != "thinking" { + t.Fatalf("TierNames = %v, want sorted [fast thinking]", names) + } + + // config value overrides the host fallback + if spec, _, ok := defaultResolver.Resolve("fast"); !ok || spec != "anthropic/claude-haiku-4-5" { + t.Fatalf("fast resolve = %q ok=%v; config should override fallback", spec, ok) + } + // fallback used when config has no override for the key + if spec, _, ok := defaultResolver.Resolve("thinking"); !ok || spec != "anthropic/claude-opus-4-8" { + t.Fatalf("thinking resolve = %q ok=%v; should use fallback", spec, ok) + } + // unknown tier + if _, _, ok := defaultResolver.Resolve("nope"); ok { + t.Fatal("Resolve of unknown tier should be ok=false") + } +} + +// TestReasoningSuffixOnTier verifies the reasoning-suffix dialect survives the +// move: a tier whose spec carries ":high" yields the bare spec + level "high". +func TestReasoningSuffixOnTier(t *testing.T) { + Configure(nil, map[string]string{"thinking": "anthropic/claude-opus-4-8:high"}, time.Minute) + defer Configure(nil, nil, 0) + + spec, level, ok := defaultResolver.Resolve("thinking") + if !ok { + t.Fatal("thinking should resolve") + } + if spec != "anthropic/claude-opus-4-8" { + t.Errorf("spec = %q, want suffix stripped", spec) + } + if level != "high" { + t.Errorf("reasoning level = %q, want high", level) + } +} + +func TestValidateTierValueRejectsNestedTier(t *testing.T) { + Configure(nil, map[string]string{"fast": "x/y"}, time.Minute) + defer Configure(nil, nil, 0) + + if err := ValidateTierValue("fast,a/b"); err == nil { + t.Error("a chain containing a tier alias must be rejected") + } + if err := ValidateTierValue("a/b,c/d"); err != nil { + t.Errorf("a chain of concrete specs must validate, got %v", err) + } +} + +// TestSinksDefaultNil verifies usage/trace recording is inert with no sinks +// installed (the light-host default). +func TestSinksDefaultNil(t *testing.T) { + SetUsageSink(nil) + SetTraceSink(nil) + if TraceSinkActive() { + t.Error("no trace sink should mean inactive") + } + // recordUsage must be a no-op (no panic) with a nil sink. + recordUsage(WithModel(t.Context(), "x"), nil) +} diff --git a/model/lane_mapping.go b/model/lane_mapping.go new file mode 100644 index 0000000..a5454e3 --- /dev/null +++ b/model/lane_mapping.go @@ -0,0 +1,91 @@ +// Package llms — lane_mapping.go: maps a model spec to a stable lane +// name. Pure data + a single function; no dependency on the registry, +// no provider wrapping. Kept separate from lane_transport.go so the +// mapping table can be committed and reviewed in isolation, and so +// admin / webui code that just wants to *display* lane assignments +// doesn't drag in the transport machinery. +// +// Why a fixed table: provider concurrency caps differ — Ollama Pro is +// 3 connections, Anthropic Claude has higher per-tier limits, etc. +// Each provider gets its own lane name so they can be configured +// independently via convars (lanes..max_concurrent). Lane names +// are user-facing (admin dashboard + convar key suffixes) and need to +// stay stable across deploys; an env-overridable map adds complexity +// for no current benefit. +// +// Test: lane_transport_test.go covers TestLaneFor_Mapping. +package model + +import "strings" + +// Lane name constants. Defined as exported strings so admin code (.skill +// admin set-lane ), webui dropdowns, and convar consumers +// share a single canonical spelling. +const ( + // LaneOllama covers ollama-cloud/* (and any future ollama/* local). + // The local ollama instance is on the same physical resource as + // the cloud account from mort's perspective — the connection cap + // should apply jointly. + LaneOllama = "ollama" + + // LaneAnthropicThinking is the lane for Anthropic models in + // extended-thinking mode. Separated from default because thinking + // requests hold connections longer and can starve faster lanes + // when multiplexed. + LaneAnthropicThinking = "anthropic-thinking" + + // LaneAnthropicDefault is the lane for non-thinking Anthropic + // requests (haiku, sonnet, opus without -thinking-). + LaneAnthropicDefault = "anthropic-default" + + // LaneM1 is the lane for m1/* models (foreman-style router + // pointing at a dedicated local instance). Separated from the + // ollama lane because m1 targets a distinct host with its own + // connection budget. + LaneM1 = "m1" + + // LaneLLMDefault is the catch-all lane for any provider/model + // combination not explicitly mapped above. + LaneLLMDefault = "llm-default" +) + +// LaneFor returns the lane name for the given model spec. Mapping: +// +// ollama-cloud/* → "ollama" (Pro account: 3 connections) +// anthropic/*-thinking-* → "anthropic-thinking" +// anthropic/* → "anthropic-default" +// (anything else) → "llm-default" +// +// Tier aliases (fast/standard/thinking) flow through this function as +// the resolver's expanded provider/model spec, so callers don't need +// to think about tier indirection. Empty input falls through to +// LaneLLMDefault rather than panicking — defensive against unset +// model specs in edge-case test wiring. +// +// Substring match for "-thinking-" keeps future Anthropic naming +// variations classified correctly without churning this table on +// every model release. +func LaneFor(modelSpec string) string { + s := strings.TrimSpace(modelSpec) + if strings.HasPrefix(s, "ollama-cloud/") { + return LaneOllama + } + if strings.HasPrefix(s, "anthropic/") { + if strings.Contains(s, "-thinking-") { + return LaneAnthropicThinking + } + return LaneAnthropicDefault + } + // Foreman instances are backed by Ollama and share its connection + // cap, so they route to the same lane. + if strings.HasPrefix(s, "foreman/") { + return LaneOllama + } + // m1/ is a foreman-style router pointing at a dedicated local + // instance with its own connection budget. Separate lane so its + // concurrency cap is independent of the shared ollama lane. + if strings.HasPrefix(s, "m1/") { + return LaneM1 + } + return LaneLLMDefault +} diff --git a/model/lane_transport.go b/model/lane_transport.go new file mode 100644 index 0000000..153040b --- /dev/null +++ b/model/lane_transport.go @@ -0,0 +1,373 @@ +// Package llms — lane_transport.go: the lane-aware decorator. Wraps an +// llm.Provider so every model it mints submits its Generate/Stream calls +// through the matching named lane's bounded worker pool (lane selection +// per lane_mapping.go), and stamps every returned error with per-call +// attribution (caller id, run id, prompt snapshot) for the failover log. +// +// Why intercept at the llm.Provider layer: majordomo's Provider and Model +// are small public interfaces, so the decorator slots between the chain +// executor and the real provider with no fork. Every chain attempt calls +// laneModel.Generate, which queues on the lane, runs the real call, and +// wraps failures with CallInfo — the ChainConfig.Observer (which receives +// no context) recovers the attribution from the error itself. +// +// Test: lane_transport_test.go covers mapping correctness, the +// concurrency-limiting behavior, and error attribution. +// lane_chatbot_test.go is the regression guard proving chatbot-path LLM +// calls actually go through the lane. +package model + +import ( + "context" + "errors" + "time" + + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "github.com/google/uuid" + + "gitea.stevedudenhoeffer.com/steve/executus/lane" +) + +// defaultLaneExecTimeout is the execution backstop applied inside a lane +// job once it leaves the queue: the caller's deadline is detached (queue +// wait must not consume the LLM execution budget) and replaced with this +// hard cap so a hung provider can't leak workers. +const defaultLaneExecTimeout = 5 * time.Minute + +// foremanModelTimeout is the hard per-call timeout for foreman targets — +// slow local LLMs that may block on model loads and upstream queues. +const foremanModelTimeout = 30 * time.Minute + +// foremanLaneExecTimeout is the lane execution backstop for foreman +// targets. Slightly above foremanModelTimeout so the model-level timeout +// (the documented contract) is the one that fires. +const foremanLaneExecTimeout = foremanModelTimeout + time.Minute + +// laneCallerKey is the context key for the per-call caller identity used +// for fair-share queueing. +type laneCallerKey struct{} + +// runIDKey is the context key for the per-call run id used for failover +// event attribution. +type runIDKey struct{} + +// ContextWithLaneCaller attaches a caller identity to ctx. The lane +// decorator reads this when constructing a Job so fair-share queueing +// can isolate heavy users, and snapshots it into error attribution for +// the failover log. +// +// Empty string is a no-op and lumps every empty-caller invocation into a +// single fair-share bucket; production callers should always populate it. +func ContextWithLaneCaller(ctx context.Context, callerID string) context.Context { + if callerID == "" { + return ctx + } + return context.WithValue(ctx, laneCallerKey{}, callerID) +} + +// LaneCallerFromContext returns the caller identity attached via +// ContextWithLaneCaller, or "" if none is set. +func LaneCallerFromContext(ctx context.Context) string { + s, _ := ctx.Value(laneCallerKey{}).(string) + return s +} + +// ContextWithRunID attaches a skill/agent run id to ctx. Snapshotted into +// error attribution so failover events can be correlated to runs. +func ContextWithRunID(ctx context.Context, runID string) context.Context { + if runID == "" { + return ctx + } + return context.WithValue(ctx, runIDKey{}, runID) +} + +// RunIDFromContext returns the run id attached via ContextWithRunID, or +// "" if none is set. +func RunIDFromContext(ctx context.Context) string { + s, _ := ctx.Value(runIDKey{}).(string) + return s +} + +// --------------------------------------------------------------------------- +// Error attribution +// --------------------------------------------------------------------------- + +// CallInfo is the per-call attribution snapshot the lane decorator stamps +// onto every error it returns. majordomo's ChainConfig.Observer receives +// a bare FailoverEvent (no context); the failover log recovers caller, +// run id, and the prompt chain from the event's error via +// CallInfoFromError. +type CallInfo struct { + // CallerID is the fair-share caller identity (ContextWithLaneCaller). + CallerID string + // RunID is the skill/agent run id (ContextWithRunID); "" if not threaded. + RunID string + // Messages is the request's message chain at call time, for the + // failover log's persist_prompts feature. + Messages []llm.Message +} + +// callInfoError carries CallInfo along an error chain without changing +// the error's message or classification (Unwrap preserves errors.Is/As). +type callInfoError struct { + inner error + info CallInfo +} + +func (e *callInfoError) Error() string { return e.inner.Error() } +func (e *callInfoError) Unwrap() error { return e.inner } + +// WithCallInfo stamps attribution onto err. nil err returns nil. +func WithCallInfo(err error, info CallInfo) error { + if err == nil { + return nil + } + return &callInfoError{inner: err, info: info} +} + +// CallInfoFromError extracts the attribution stamped by the lane +// decorator (or WithCallInfo), if any. +func CallInfoFromError(err error) (CallInfo, bool) { + var cie *callInfoError + if errors.As(err, &cie) { + return cie.info, true + } + return CallInfo{}, false +} + +// --------------------------------------------------------------------------- +// Lane decoration +// --------------------------------------------------------------------------- + +// LaneRegistry is the narrow surface the lane decorator needs from +// pkg/lane.Registry. Defined as an interface so tests can substitute a +// fake registry without spinning up a real one. +type LaneRegistry interface { + GetOrCreate(ctx context.Context, name string) lane.Lane +} + +// laneProvider decorates an llm.Provider so every model it mints routes +// calls through the lane named by LaneFor(provider/model). With a nil +// registry the queueing is skipped but error attribution still applies. +type laneProvider struct { + inner llm.Provider + registry LaneRegistry + execTimeout time.Duration +} + +// WrapProviderForLane returns a provider whose models submit each +// Generate/Stream call through the lane named by LaneFor(name/model) in +// the registry, and stamp CallInfo attribution onto every error. +// +// A nil registry disables queueing (calls pass straight through) but the +// decoration — and with it error attribution — remains, so failover +// logging works in lane-less deployments and tests. +func WrapProviderForLane(inner llm.Provider, registry LaneRegistry) llm.Provider { + return wrapProviderForLane(inner, registry, defaultLaneExecTimeout) +} + +func wrapProviderForLane(inner llm.Provider, registry LaneRegistry, execTimeout time.Duration) llm.Provider { + if inner == nil { + return nil + } + if execTimeout <= 0 { + execTimeout = defaultLaneExecTimeout + } + return &laneProvider{inner: inner, registry: registry, execTimeout: execTimeout} +} + +func (p *laneProvider) Name() string { return p.inner.Name() } + +func (p *laneProvider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) { + m, err := p.inner.Model(id, opts...) + if err != nil { + return nil, err + } + return &laneModel{ + inner: m, + registry: p.registry, + laneName: LaneFor(p.inner.Name() + "/" + id), + execTimeout: p.execTimeout, + }, nil +} + +// laneModel routes one model's calls through its lane and stamps error +// attribution. The lane name is resolved once at Model() time — the +// provider name and model id are both known there, unlike legacy gollm where +// the request had to be inspected per call. +type laneModel struct { + inner llm.Model + registry LaneRegistry + laneName string + execTimeout time.Duration +} + +func (m *laneModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } + +// laneJob adapts an in-flight call to the lane.Job interface. The result +// is captured into the struct and read after SubmitWait returns. +type laneJob struct { + id string + callerID string + run func(ctx context.Context) error +} + +func (j *laneJob) ID() string { return j.id } +func (j *laneJob) CallerID() string { return j.callerID } +func (j *laneJob) Priority() int { return 0 } +func (j *laneJob) Run(ctx context.Context) error { return j.run(ctx) } + +func (m *laneModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { + // Fold options now so the job closure and the attribution snapshot + // both see the final request. + req = req.Apply(opts...) + info := CallInfo{ + CallerID: LaneCallerFromContext(ctx), + RunID: RunIDFromContext(ctx), + Messages: req.Messages, + } + + resp, err := m.submit(ctx, func(execCtx context.Context) (*llm.Response, error) { + return m.inner.Generate(execCtx, req) + }) + if err != nil { + return resp, WithCallInfo(err, info) + } + return resp, nil +} + +func (m *laneModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { + req = req.Apply(opts...) + info := CallInfo{ + CallerID: LaneCallerFromContext(ctx), + RunID: RunIDFromContext(ctx), + Messages: req.Messages, + } + + l := m.lane(ctx) + if l == nil { + s, err := m.inner.Stream(ctx, req) + if err != nil { + return nil, WithCallInfo(err, info) + } + return s, nil + } + + // Streams hold their lane slot only while ESTABLISHING the stream — + // holding it for the full consumption would deadlock a slow consumer + // against the pool. The caller's ctx is used as-is (no deadline + // detach): severing cancellation from a long-lived stream would leak + // connections. + var ( + stream llm.Stream + serr error + ) + job := &laneJob{ + id: uuid.New().String(), + callerID: info.CallerID, + run: func(context.Context) error { + stream, serr = m.inner.Stream(ctx, req) + return serr + }, + } + if err := l.SubmitWait(ctx, job); err != nil { + return nil, WithCallInfo(err, info) + } + if serr != nil { + return nil, WithCallInfo(serr, info) + } + return stream, nil +} + +// lane resolves the lane for this model, or nil when queueing is +// disabled (nil registry, or a registry that declines the name). +func (m *laneModel) lane(ctx context.Context) lane.Lane { + if m.registry == nil { + return nil + } + return m.registry.GetOrCreate(ctx, m.laneName) +} + +// submit runs fn through the lane (or directly when queueing is off). +// +// Inside a lane job the caller's deadline is detached so queue wait does +// not consume the execution budget — ctx VALUES (usage attribution, +// trace ids) are preserved, only cancellation/deadline are severed — and +// an execTimeout backstop prevents runaway calls. Queue-phase +// cancellation still works: SubmitWait waits on the original ctx, so a +// caller that gives up while queued exits immediately. +func (m *laneModel) submit(ctx context.Context, fn func(context.Context) (*llm.Response, error)) (*llm.Response, error) { + l := m.lane(ctx) + if l == nil { + return fn(ctx) + } + var ( + resp *llm.Response + err error + ) + job := &laneJob{ + id: uuid.New().String(), + callerID: LaneCallerFromContext(ctx), + run: func(context.Context) error { + execCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), m.execTimeout) + defer cancel() + resp, err = fn(execCtx) + // Returning err lets the lane's pool propagate it to + // SubmitWait; the captured err is what we surface. + return err + }, + } + if serr := l.SubmitWait(ctx, job); serr != nil && err == nil { + return nil, serr + } + return resp, err +} + +// --------------------------------------------------------------------------- +// Model timeout decoration (foreman) +// --------------------------------------------------------------------------- + +// timeoutProvider wraps a provider so every minted model enforces a hard +// per-call deadline on Generate. Used for foreman targets (slow local +// LLMs). Stream is passed through: a wall-clock deadline on a long-lived +// stream would sever it mid-consumption. +type timeoutProvider struct { + inner llm.Provider + timeout time.Duration +} + +// withModelTimeout decorates p so its models' Generate calls carry a +// hard timeout. +func withModelTimeout(p llm.Provider, d time.Duration) llm.Provider { + if p == nil || d <= 0 { + return p + } + return &timeoutProvider{inner: p, timeout: d} +} + +func (p *timeoutProvider) Name() string { return p.inner.Name() } + +func (p *timeoutProvider) Model(id string, opts ...llm.ModelOption) (llm.Model, error) { + m, err := p.inner.Model(id, opts...) + if err != nil { + return nil, err + } + return &timeoutModel{inner: m, timeout: p.timeout}, nil +} + +type timeoutModel struct { + inner llm.Model + timeout time.Duration +} + +func (m *timeoutModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } + +func (m *timeoutModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { + ctx, cancel := context.WithTimeout(ctx, m.timeout) + defer cancel() + return m.inner.Generate(ctx, req, opts...) +} + +func (m *timeoutModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { + return m.inner.Stream(ctx, req, opts...) +} diff --git a/model/llms.go b/model/llms.go new file mode 100644 index 0000000..cf49c6c --- /dev/null +++ b/model/llms.go @@ -0,0 +1,477 @@ +// Package model is executus's config-driven model-access layer over majordomo: it owns the +// package-level *majordomo.Registry (providers with mort's env keys, +// OpenAI-compat presets, lane-aware decoration, the DB-backed tier +// resolver, legacy shortcut aliases, the foreman timeout decorator, and +// failover/health wiring), plus the mort-facing call helpers +// (ParseModelRequest / ParseModelForContext / GenerateWith / +// CallAndExecute / SimpleCall) and usage/trace recording. +// +// The ":low/:medium/:high" reasoning-suffix dialect is an executus convenience: +// majordomo treats model ids as verbatim, so this package strips the +// suffix from specs and tier values and re-applies it per request via +// llm.WithReasoningEffort on a wrapping Model. +package model + +import ( + "context" + "fmt" + "os" + "strings" + "sync" + "time" + + majordomo "gitea.stevedudenhoeffer.com/steve/majordomo" + "gitea.stevedudenhoeffer.com/steve/majordomo/health" + "gitea.stevedudenhoeffer.com/steve/majordomo/llm" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/anthropic" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/google" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/ollama" + "gitea.stevedudenhoeffer.com/steve/majordomo/provider/openai" +) + +// Usage and trace recording live in sink.go: SetUsageSink / SetTraceSink +// install the host seams, and ParseModelForContext stamps the model name on +// the context (via WithModel) for attribution. + +// --------------------------------------------------------------------------- +// Package registry +// --------------------------------------------------------------------------- + +// buildConfig carries the knobs Wire feeds into buildRegistry. The zero +// value yields a lane-less registry with majordomo's default failover +// behavior — the bootstrap state tests and pre-Wire code paths run on. +type buildConfig struct { + lanes LaneRegistry + + // maxRetries maps the llms.failover.max_retries convar onto + // ChainConfig.TransientRetries. <= 0 keeps majordomo's default (1). + maxRetries int + + // cooldown maps the llms.failover.cooldown_seconds convar onto + // health.Config.BaseCooldown. <= 0 keeps the mort default (300s). + // Note majordomo grows the cooldown exponentially from this base; + // MaxCooldown is set to max(cooldown, 5m) so the operator dial + // dominates (a 10m base never gets capped below itself). + cooldown time.Duration + + // observer receives one event per failover decision (failed attempt, + // bench, benched-skip). Typically failoverlog.NewObserver(...). + observer func(majordomo.FailoverEvent) +} + +// defaultFailoverCooldown matches the historical llms.failover.cooldown_seconds +// convar default (300s). +const defaultFailoverCooldown = 300 * time.Second + +var ( + registryMu sync.RWMutex + registry = buildRegistry(buildConfig{}) +) + +// Registry returns the current package-level majordomo registry. Most +// callers should use ParseModelRequest / ParseModelForContext instead; +// the registry itself is exposed for admin surfaces (health/bench) and +// for tests that need to substitute providers. +func Registry() *majordomo.Registry { + registryMu.RLock() + defer registryMu.RUnlock() + return registry +} + +// Health returns the health tracker of the current registry — the live +// source of truth for benched models. Used by the `.failover` commands +// and the failover web UI (see ListBenched/BenchModel/UnbenchModel for +// the mort-flavored facade). +func Health() *health.Tracker { + return Registry().Health() +} + +// setRegistry swaps the package registry. Bench/backoff state of the old +// registry is discarded — Wire is a boot-time operation. +func setRegistry(r *majordomo.Registry) { + registryMu.Lock() + defer registryMu.Unlock() + registry = r +} + +// buildRegistry constructs a fully-wired majordomo registry: +// +// - health/chain config from the failover convars (via cfg), +// - mort's providers under their nonstandard env keys (OPENAI_KEY, +// GOOGLE_GEMINI_API_KEY, ...), every one lane-decorated, +// - OpenAI-compat presets (deepseek, moonshot+kimi, xai+grok, groq), +// - scheme factories for LLM_* env DSNs re-registered so DSN-defined +// providers (m1, arbitrary foreman targets) are lane-decorated too, +// with foreman additionally getting the 30-minute model timeout, +// - the legacy shortcut aliases, and +// - the delegating tier resolver (reads defaultResolver at Resolve +// time, so Init() can swap in the DB-backed resolver later). +func buildRegistry(cfg buildConfig) *majordomo.Registry { + cooldown := cfg.cooldown + if cooldown <= 0 { + cooldown = defaultFailoverCooldown + } + maxCooldown := cooldown + if maxCooldown < 5*time.Minute { + maxCooldown = 5 * time.Minute + } + + r := majordomo.New( + // Env DSNs are loaded manually below, AFTER the scheme factories + // are overridden — New()'s eager scan would otherwise build + // LLM_*-defined providers with the stock (un-decorated) factories. + majordomo.WithoutEnvProviders(), + majordomo.WithHealthConfig(health.Config{ + BaseCooldown: cooldown, + MaxCooldown: maxCooldown, + }), + majordomo.WithChainConfig(majordomo.ChainConfig{ + TransientRetries: cfg.maxRetries, + // legacy gollm failed over on request-specific errors (400/413/422) + // without benching; majordomo fails fast on permanent errors by + // default. AdvanceOnPermanent preserves the availability-first + // behavior mort's executors rely on. + AdvanceOnPermanent: true, + Observer: cfg.observer, + }), + ) + + wrap := func(p llm.Provider) llm.Provider { + return wrapProviderForLane(p, cfg.lanes, defaultLaneExecTimeout) + } + + // Core providers with mort's env keys. + r.RegisterProvider(wrap(openai.New( + openai.WithAPIKey(os.Getenv("OPENAI_KEY")), + ))) + r.RegisterProvider(wrap(anthropic.New( + anthropic.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY")), + ))) + r.RegisterProvider(wrap(google.New( + google.WithAPIKey(os.Getenv("GOOGLE_GEMINI_API_KEY")), + ))) + r.RegisterProvider(wrap(localOllamaProvider())) + // ollama.Cloud reads OLLAMA_API_KEY itself; with the key unset the + // provider still registers and errors clearly at call time (parity + // with the previous behavior). + r.RegisterProvider(wrap(ollama.Cloud())) + + // OpenAI-compatible presets. Base URLs mirror legacy gollm's defaults. + for _, preset := range []struct { + name, baseURL, envKey string + }{ + {"deepseek", "https://api.deepseek.com/v1", "DEEPSEEK_API_KEY"}, + {"moonshot", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, + {"kimi", "https://api.moonshot.ai/v1", "MOONSHOT_API_KEY"}, // alias provider for moonshot + {"xai", "https://api.x.ai/v1", "XAI_API_KEY"}, + {"grok", "https://api.x.ai/v1", "XAI_API_KEY"}, // alias provider for xai + {"groq", "https://api.groq.com/openai/v1", "GROQ_API_KEY"}, + } { + r.RegisterProvider(wrap(openai.New( + openai.WithName(preset.name), + openai.WithBaseURL(preset.baseURL), + openai.WithAPIKey(os.Getenv(preset.envKey)), + ))) + } + + // Scheme factories for LLM_* env DSNs. Re-registered so DSN-defined + // providers go through the lane decorator like the built-ins. + // + // foreman targets are slow local LLMs (large model loads, queued + // behind other requests), so their models additionally get a hard + // 30-minute timeout and a matching lane execution backstop — the + // default 5-minute lane backstop would strangle them. + r.RegisterScheme("foreman", func(name string, dsn majordomo.DSN) (llm.Provider, error) { + p := ollama.Foreman(dsn.BaseURL(), dsn.Token, ollama.WithName(name)) + return wrapProviderForLane( + withModelTimeout(p, foremanModelTimeout), + cfg.lanes, + foremanLaneExecTimeout, + ), nil + }) + laneScheme := func(factory majordomo.SchemeFactory) majordomo.SchemeFactory { + return func(name string, dsn majordomo.DSN) (llm.Provider, error) { + p, err := factory(name, dsn) + if err != nil { + return nil, err + } + return wrap(p), nil + } + } + ollamaScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { + return ollama.New( + ollama.WithName(name), + ollama.WithBaseURL(dsn.BaseURL()), + ollama.WithToken(dsn.Token), + ), nil + }) + r.RegisterScheme("ollama", ollamaScheme) + r.RegisterScheme("ollama-cloud", ollamaScheme) + r.RegisterScheme("openai", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { + return openai.New( + openai.WithName(name), + openai.WithBaseURL(dsn.BaseURL()), + openai.WithAPIKey(dsn.Token), + ), nil + })) + r.RegisterScheme("anthropic", laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { + return anthropic.New( + anthropic.WithName(name), + anthropic.WithBaseURL(dsn.BaseURL()), + anthropic.WithAPIKey(dsn.Token), + ), nil + })) + googleScheme := laneScheme(func(name string, dsn majordomo.DSN) (llm.Provider, error) { + return google.New( + google.WithName(name), + google.WithBaseURL(dsn.BaseURL()), + google.WithAPIKey(dsn.Token), + ), nil + }) + r.RegisterScheme("google", googleScheme) + r.RegisterScheme("gemini", googleScheme) + + // Eager LLM_* env scan, now with the decorated scheme factories in + // place. Malformed entries are recorded per-name and surface on use. + env := make(map[string]string) + for _, kv := range os.Environ() { + if k, v, ok := strings.Cut(kv, "="); ok { + env[k] = v + } + } + _ = r.LoadEnv(env) + + // Legacy shortcut aliases (sonnet, haiku, ...). Same strings as the + // historical table; kept in sync with legacyAliasSpecs below. + for name, spec := range legacyAliasSpecs { + r.RegisterAlias(name, spec) + } + + // Tier resolver: a delegating closure so Init() and test helpers can + // swap defaultResolver without rebuilding the registry. The resolver + // returns specs with the legacy reasoning suffixes already stripped + // (per chain element); the tier's default reasoning level is applied + // by ParseModelRequest, not here. + r.RegisterResolver(majordomo.ResolverFunc(func(name string) (string, bool) { + res := defaultResolver + if res == nil { + return "", false + } + spec, _, ok := res.Resolve(name) + return spec, ok + })) + + return r +} + +// localOllamaProvider builds the local Ollama provider, honoring +// OLLAMA_BASE_URL when set (mort's historical env var; ollama.Local +// itself honors OLLAMA_HOST). +func localOllamaProvider() llm.Provider { + if url := os.Getenv("OLLAMA_BASE_URL"); url != "" { + return ollama.Local(ollama.WithBaseURL(url)) + } + return ollama.Local() +} + +// --------------------------------------------------------------------------- +// Spec parsing +// --------------------------------------------------------------------------- + +// ParseModelRequest resolves a model request string to a ready-to-use Model. +// It handles, in order: +// +// - empty spec → tier "fast" +// - the legacy ":low/:medium/:high" reasoning suffix, stripped per chain +// element (ollama tags like ":30b" or ":cloud" are preserved); the +// level is applied to every call via llm.WithReasoningEffort +// - tier aliases (DB-backed convars; a tier value's own suffix becomes +// the default level when the caller didn't supply one) +// - legacy shortcut aliases (sonnet, haiku, opus, ...) +// - provider/model lookup and LLM_* env-DSN fallback (majordomo) +// - comma-separated failover chains with health-tracked bench/backoff +// +// The returned Model is instrumented: token usage from every successful +// Generate is recorded to the package usage recorder automatically. Do +// NOT additionally call RecordUsage on responses from a parsed model. +func ParseModelRequest(spec string) (majordomo.Model, error) { + spec = strings.TrimSpace(spec) + if spec == "" { + spec = "fast" + } + + clean, level := splitReasoningSpec(spec) + + // Tier default reasoning: when the (suffix-free) spec is exactly a + // tier name and the caller didn't ask for a level, the tier value's + // own suffix (e.g. "anthropic/claude-opus-4-6:high") applies. + if level == "" && defaultResolver != nil { + if _, tierLevel, ok := defaultResolver.Resolve(clean); ok { + level = tierLevel + } + } + + m, err := Registry().Parse(clean) + if err != nil { + return nil, fmt.Errorf("model %q: %w", spec, err) + } + if level != "" { + m = &reasoningModel{inner: m, level: level} + } + return &instrumentedModel{inner: m}, nil +} + +// ParseModelForContext combines ParseModelRequest with llmusage.WithModel so +// that the resolved model name is recorded in the context for usage tracking. +// Prefer this over bare ParseModelRequest in all new code. +func ParseModelForContext(ctx context.Context, req string) (context.Context, majordomo.Model, error) { + model, err := ParseModelRequest(req) + if err != nil { + return ctx, nil, err + } + ctx = WithModel(ctx, ResolveModelName(req)) + return ctx, model, nil +} + +// reasoningModel applies a default reasoning effort to every request that +// doesn't carry one already. Mort's legacy ":low/:medium/:high" suffix +// dialect resolves to this wrapper because majordomo treats model ids as +// verbatim (no suffix stripping). +type reasoningModel struct { + inner llm.Model + level string +} + +func (m *reasoningModel) Generate(ctx context.Context, req llm.Request, opts ...llm.Option) (*llm.Response, error) { + req = req.Apply(opts...) + if req.ReasoningEffort == "" { + req.ReasoningEffort = m.level + } + return m.inner.Generate(ctx, req) +} + +func (m *reasoningModel) Stream(ctx context.Context, req llm.Request, opts ...llm.Option) (llm.Stream, error) { + req = req.Apply(opts...) + if req.ReasoningEffort == "" { + req.ReasoningEffort = m.level + } + return m.inner.Stream(ctx, req) +} + +func (m *reasoningModel) Capabilities() llm.Capabilities { return m.inner.Capabilities() } + +// --------------------------------------------------------------------------- +// Reasoning-suffix dialect +// --------------------------------------------------------------------------- + +// reasoningLevels is the set of recognized legacy suffix values. +var reasoningLevels = map[string]bool{"low": true, "medium": true, "high": true} + +// splitReasoning peels an optional ":low" / ":medium" / ":high" suffix off +// a single model request string. Returns the input unchanged and "" when no +// recognized level is present, so non-reasoning suffixes (ollama tags like +// ":30b" or ":q4_K_M", date stamps) flow through untouched. +func splitReasoning(s string) (string, string) { + idx := strings.LastIndex(s, ":") + if idx < 0 { + return s, "" + } + if lvl := s[idx+1:]; reasoningLevels[lvl] { + return s[:idx], lvl + } + return s, "" +} + +// splitReasoningSpec strips the legacy reasoning suffix from every element +// of a (possibly comma-separated) spec. The returned level is the first +// non-empty per-element level — majordomo chains carry one request-level +// reasoning effort, not one per target, so the head element's preference +// wins. Elements without a suffix are unchanged. +func splitReasoningSpec(spec string) (string, string) { + if !strings.Contains(spec, ",") { + return splitReasoning(strings.TrimSpace(spec)) + } + parts := strings.Split(spec, ",") + level := "" + for i, p := range parts { + s, l := splitReasoning(strings.TrimSpace(p)) + parts[i] = s + if level == "" { + level = l + } + } + return strings.Join(parts, ","), level +} + +// --------------------------------------------------------------------------- +// Usage-attribution name resolution +// --------------------------------------------------------------------------- + +// ResolveModelName returns the model portion of a request string, stripping +// any reasoning suffix and resolving tier aliases. The result is used for +// usage attribution (keyed on model name, not provider or reasoning level). +func ResolveModelName(req string) string { + // Strip any reasoning-level suffix before resolving — the level is a + // per-request setting, not part of the model identity. + req, _ = splitReasoning(req) + + // Tier expansion: when the request is a tier alias, fold it through the + // resolver and return the model portion of its current convar value. The + // empty string is treated as "fast" for compatibility with callers that + // pre-resolution defaulted to fast. + if defaultResolver != nil { + key := req + if key == "" { + key = "fast" + } + if spec, _, ok := defaultResolver.Resolve(key); ok && spec != "" { + // A tier may resolve to a comma-separated failover chain. Attribute + // usage to the first (preferred) entry's model name rather than the + // whole chain string. + if i := strings.IndexByte(spec, ','); i >= 0 { + spec = strings.TrimSpace(spec[:i]) + } + if idx := strings.Index(spec, "/"); idx >= 0 { + return spec[idx+1:] + } + return spec + } + } + + // For non-tier requests, return the model portion after the slash. + // Static aliases are NOT expanded here beyond the legacy table below: + // callers that went through ParseModelRequest already carry the + // concrete spec. + if idx := strings.Index(req, "/"); idx >= 0 { + return req[idx+1:] + } + + // Legacy shortcut fallback: callers that pass bare names like "sonnet" + // to ResolveModelName (without going through ParseModelRequest) still + // need the concrete model name for usage keys. + if spec, ok := legacyAliasSpecs[req]; ok { + if idx := strings.Index(spec, "/"); idx >= 0 { + return spec[idx+1:] + } + return spec + } + + return req +} + +// legacyAliasSpecs maps legacy shortcut names to their full provider/model +// spec. Registered with the registry as static aliases AND consulted by +// ResolveModelName for bare-name usage attribution. +var legacyAliasSpecs = map[string]string{ + "openai": "openai/gpt-4o-mini", + "gpt-4": "openai/gpt-4", + "gpt-4o": "openai/gpt-4o", + "gpt-4o-mini": "openai/gpt-4o-mini", + "sonnet": "anthropic/claude-sonnet-4-6", + "sonnet-4.5": "anthropic/claude-sonnet-4-5-20250929", + "haiku": "anthropic/claude-haiku-4-5-20251001", + "opus": "anthropic/claude-opus-4-6", + "gemini": "google/gemini-2.0-flash", + "gemini-flash": "google/gemini-2.0-flash", + "gemini-pro": "google/gemini-2.0-pro", +} diff --git a/model/sink.go b/model/sink.go new file mode 100644 index 0000000..47575e2 --- /dev/null +++ b/model/sink.go @@ -0,0 +1,131 @@ +package model + +import ( + "context" + "time" +) + +// This file is executus's inversion of mort's llmusage / llmtrace coupling. +// The model package owns the MECHANISM (instrument every parsed model's +// Generate, attribute by serving model, emit a span when a trace is active); +// WHERE usage/traces land is a host seam. A host registers a UsageSink and/or +// a TraceSink; both are optional (nil = off), so a light host records nothing. + +// --- Usage --- + +// UsageSink receives one record per successful Generate through a model parsed +// by this package (ParseModelRequest / ParseModelForContext). Implement it to +// meter or bill; the token detail mirrors majordomo's Response.Usage. +type UsageSink interface { + Record(ctx context.Context, model string, inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens int) +} + +var usageSink UsageSink + +// SetUsageSink installs the usage sink (nil disables usage recording). Call at +// startup before model calls. +func SetUsageSink(s UsageSink) { usageSink = s } + +// --- Trace --- + +// Span is one traced model call. The host's TraceSink persists it however it +// likes (a DB row, a log line, an OTel span). String fields carrying structured +// data (Messages, ToolDefinitions, ...) are pre-marshalled JSON. +type Span struct { + SpanID string + TraceID string + Model string + + SystemPrompt string + Messages string + ToolDefinitions string + ResponseText string + ResponseToolCalls string + ToolResults string + Error string + + InputTokens int + OutputTokens int + DurationMs int64 + + StartedAt time.Time + CompletedAt time.Time + CreatedAt time.Time +} + +// TraceSink receives a Span for each traced call (one is emitted only when a +// trace id is present on the context — see WithTraceID). +type TraceSink interface { + WriteSpan(span Span) +} + +var traceSink TraceSink + +// SetTraceSink installs the trace sink (nil disables tracing). +func SetTraceSink(s TraceSink) { traceSink = s } + +// TraceSinkActive reports whether a trace sink is installed. +func TraceSinkActive() bool { return traceSink != nil } + +// --- Context attribution --- +// +// ParseModelForContext stamps the requested model onto the context so usage +// from a response that doesn't name its serving model can still be attributed. +// A host's tracing/usage middleware stamps a trace id and optional caller/tool +// for diagnostics. All reads are nil/empty-safe. + +type ( + ctxKeyModel struct{} + ctxKeyTrace struct{} + ctxKeyTool struct{} + ctxKeyUser struct{} +) + +// WithModel attributes subsequent usage on ctx to the given model name. +func WithModel(ctx context.Context, model string) context.Context { + return context.WithValue(ctx, ctxKeyModel{}, model) +} + +func modelFromContext(ctx context.Context) string { + if v, ok := ctx.Value(ctxKeyModel{}).(string); ok { + return v + } + return "" +} + +// WithTraceID marks ctx as belonging to a trace; a TraceSink (if installed) +// then receives a Span per call. An empty id (or no id) disables tracing. +func WithTraceID(ctx context.Context, id string) context.Context { + return context.WithValue(ctx, ctxKeyTrace{}, id) +} + +func traceIDFromContext(ctx context.Context) string { + if v, ok := ctx.Value(ctxKeyTrace{}).(string); ok { + return v + } + return "" +} + +// WithUsageTool / WithUsageUser attach optional attribution used only in the +// "unknown model" diagnostic warning. Default "unknown". +func WithUsageTool(ctx context.Context, tool string) context.Context { + return context.WithValue(ctx, ctxKeyTool{}, tool) +} + +func toolFromContext(ctx context.Context) string { + if v, ok := ctx.Value(ctxKeyTool{}).(string); ok && v != "" { + return v + } + return "unknown" +} + +func WithUsageUser(ctx context.Context, user string) context.Context { + return context.WithValue(ctx, ctxKeyUser{}, user) +} + +func userFromContext(ctx context.Context) string { + if v, ok := ctx.Value(ctxKeyUser{}).(string); ok && v != "" { + return v + } + return "unknown" +} diff --git a/model/tiers.go b/model/tiers.go new file mode 100644 index 0000000..73bd54f --- /dev/null +++ b/model/tiers.go @@ -0,0 +1,162 @@ +package model + +import ( + "fmt" + "sort" + "strings" + "sync" + "time" + + "gitea.stevedudenhoeffer.com/steve/executus/config" +) + +// tierResolver expands tier aliases (e.g. "fast", "thinking", "agent-working") +// into a concrete model spec or a comma-separated failover chain. The set of +// tier names and their FALLBACK specs are host-supplied (a map passed at +// Configure time); the live value of each tier is read from a config.Source +// under the key "model.tier.", so a host whose config backend mutates at +// runtime (mort's convar) re-targets tiers without a restart, while a static +// host (gadfly's env) just gets the fallback. A small in-process cache (TTL +// from "model.tier.cache_ttl_seconds", default 30s) saves config round-trips on +// the hot path; ReloadTiers clears it. +// +// This is executus's inversion of mort's convar-bound resolver: the MECHANISM +// (tier lookup, reasoning-suffix dialect, chain validation, cache) is generic; +// the tier MAP content (which tiers exist + their default specs) is host config. +type tierResolver struct { + cfg config.Source + defaults map[string]string // tier name -> fallback spec + ttl time.Duration + mu sync.RWMutex + cache map[string]tierEntry + now func() time.Time // overridable for tests +} + +type tierEntry struct { + spec string + reasoning string + expires time.Time +} + +const tierConfigPrefix = "model.tier." + +// NewTierResolver builds a resolver over cfg with the given tier defaults +// (name -> fallback spec). cfg may be nil (the fallbacks are then always used). +// ttl<=0 reads "model.tier.cache_ttl_seconds" (default 30s). +func NewTierResolver(cfg config.Source, defaults map[string]string, ttl time.Duration) *tierResolver { + if ttl <= 0 { + ttl = time.Duration(config.Int(cfg, tierConfigPrefix+"cache_ttl_seconds", 30)) * time.Second + } + if ttl <= 0 { + ttl = 30 * time.Second + } + cp := make(map[string]string, len(defaults)) + for k, v := range defaults { + cp[k] = v + } + return &tierResolver{ + cfg: cfg, + defaults: cp, + ttl: ttl, + cache: make(map[string]tierEntry), + now: time.Now, + } +} + +func (r *tierResolver) has(name string) bool { + _, ok := r.defaults[name] + return ok +} + +func (r *tierResolver) names() []string { + out := make([]string, 0, len(r.defaults)) + for k := range r.defaults { + out = append(out, k) + } + sort.Strings(out) + return out +} + +// Resolve returns the current model spec and default reasoning level for a tier +// name. ok=false if name is not a registered tier. Legacy reasoning suffixes +// (":low/:medium/:high") are stripped per chain element; the first non-empty +// level becomes the tier's default reasoning level (ollama tags like ":cloud" +// pass through). The live value is read from config with the host-supplied +// fallback; an empty resolved value yields ok=true with an empty spec +// (ParseModelRequest surfaces a clear error in that path). +func (r *tierResolver) Resolve(name string) (string, string, bool) { + if !r.has(name) { + return "", "", false + } + now := r.now() + + r.mu.RLock() + if e, hit := r.cache[name]; hit && now.Before(e.expires) { + r.mu.RUnlock() + return e.spec, e.reasoning, true + } + r.mu.RUnlock() + + r.mu.Lock() + defer r.mu.Unlock() + if e, hit := r.cache[name]; hit && now.Before(e.expires) { + return e.spec, e.reasoning, true + } + + raw := strings.TrimSpace(config.String(r.cfg, tierConfigPrefix+name, r.defaults[name])) + spec, level := splitReasoningSpec(raw) + r.cache[name] = tierEntry{spec: spec, reasoning: level, expires: now.Add(r.ttl)} + return spec, level, true +} + +// Reload clears the cache so the next Resolve fetches fresh from config. +func (r *tierResolver) Reload() { + r.mu.Lock() + defer r.mu.Unlock() + r.cache = make(map[string]tierEntry) +} + +// --- package-level resolver + facade --- + +// defaultResolver is initialized as a package-level var (not in init()) so it +// is ready before any other file's init runs — buildRegistry's delegating +// resolver closure reads it at Resolve time. It starts with no tiers; a host +// installs its tier table via Configure. +var defaultResolver = NewTierResolver(nil, nil, 0) + +// Configure installs the host's tier table. cfg is the live config source +// (nil = fallbacks only); defaults maps each tier name to its fallback spec; +// ttl<=0 uses the config'd / 30s default. The package registry's delegating +// resolver reads defaultResolver at Resolve time, so swapping it here is +// sufficient — no registry rebuild needed. +func Configure(cfg config.Source, defaults map[string]string, ttl time.Duration) { + defaultResolver = NewTierResolver(cfg, defaults, ttl) +} + +// TierNames returns the registered tier alias names (sorted). Exported so UI +// layers can populate tier dropdowns without hardcoding. +func TierNames() []string { return defaultResolver.names() } + +// IsTierName reports whether s is a registered tier alias. +func IsTierName(s string) bool { return defaultResolver.has(s) } + +// ReloadTiers clears the package resolver's cache so the next request resolves +// freshly from config. +func ReloadTiers() { defaultResolver.Reload() } + +// ValidateTierValue returns an error if value cannot be used as a tier spec — +// specifically, when a chain entry is itself a tier name (which would form a +// resolution loop). Chain entries must be concrete provider/model specs. +func ValidateTierValue(value string) error { + for _, part := range strings.Split(value, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + spec, _ := splitReasoning(part) + if IsTierName(spec) { + return fmt.Errorf("tier value %q contains tier alias %q (chains must use concrete provider/model specs, not nested tiers)", value, spec) + } + } + return nil +} diff --git a/model/wiring.go b/model/wiring.go new file mode 100644 index 0000000..b880eda --- /dev/null +++ b/model/wiring.go @@ -0,0 +1,110 @@ +// Package llms — wiring.go: the production boot hook that rebuilds the +// package registry with the lane registry, the failover convars, and the +// failover-event observer. +// +// Why a dedicated helper (vs spreading registry construction through +// mort.go): the chatbot regression test in lane_chatbot_test.go and the +// production boot path must call the SAME wiring code. Historically +// mort.go skipped the lane wiring entirely (lanes were defined but never +// installed — 30+ skill_runs in production with 0 skill_queue_jobs rows); +// concentrating the install here means a regression in one wires fails +// the test for the other. +package model + +import ( + "context" + "log/slog" + "time" + + majordomo "gitea.stevedudenhoeffer.com/steve/majordomo" +) + +// WireOptions configures Wire. The zero value rebuilds the registry with +// no lanes and default failover behavior. +type WireOptions struct { + // Lanes is the lane registry every provider is decorated with. nil + // disables lane queueing (calls pass straight through) but keeps + // error attribution for the failover log. + Lanes LaneRegistry + + // FailoverMaxRetries maps the llms.failover.max_retries convar onto + // majordomo's ChainConfig.TransientRetries (same-target retries after + // a transient error). <= 0 keeps majordomo's default (1). + FailoverMaxRetries int + + // FailoverCooldown maps the llms.failover.cooldown_seconds convar + // onto health.Config.BaseCooldown. majordomo grows the cooldown + // exponentially from this base per consecutive bench; the cap is + // max(FailoverCooldown, 5m) so the operator's dial dominates. + // <= 0 keeps the mort default (300s). + FailoverCooldown time.Duration + + // FailoverObserver receives one event per failover decision (failed + // attempt, bench, benched-skip). Wire it to failoverlog.NewObserver. + // Attribution (caller/run/prompts) rides on the event's error — see + // CallInfoFromError. + FailoverObserver func(majordomo.FailoverEvent) +} + +// Wire rebuilds the package registry from opts and installs it. Call once +// at boot, after the lane registry and the failover convars exist (and +// after Init for DB-backed tiers — though Init and Wire are order- +// independent: the tier resolver is consulted through a delegating +// indirection). +// +// Rebuilding discards in-memory health/bench state — Wire is a boot-time +// operation, not a runtime toggle. +// +// When Lanes is non-nil, the well-known lanes (KnownLanes) are eagerly +// registered so admin dashboards have baseline state from the moment mort +// starts instead of "no lanes registered" until the first LLM call. +// +// Returns the installed registry for inspection (tests, health surfaces). +func Wire(ctx context.Context, opts WireOptions) *majordomo.Registry { + r := buildRegistry(buildConfig{ + lanes: opts.Lanes, + maxRetries: opts.FailoverMaxRetries, + cooldown: opts.FailoverCooldown, + observer: opts.FailoverObserver, + }) + setRegistry(r) + + if opts.Lanes != nil { + names := KnownLanes() + for _, name := range names { + opts.Lanes.GetOrCreate(ctx, name) + } + slog.Info("llms: wired lane-aware registry", "lanes", len(names)) + } else { + slog.Warn("llms: Wire called without a lane registry — lane queueing is inert") + } + return r +} + +// KnownLanes returns the well-known lane names the LLM transport resolves +// to. Eager-registering these at boot gives admin dashboards +// (`/skills/admin/queues`, `.skill admin queue`) a baseline view from the +// moment mort starts — without this, the dashboard reads "no lanes +// registered" until the first chatbot/skill call materialises the lane +// via lazy GetOrCreate. +// +// Why this list (and not "every lane name ever"): these are the ones +// LaneFor in lane_mapping.go can produce for a real model spec. Future +// non-LLM lanes (e.g. a future image-generation lane) should be eagerly +// registered by their owning subsystem, not here. +// +// LaneSkillDefault is included even though it isn't an LLM-routing +// lane: skills run through it via skillexec.WithLaneRegistry, and the +// skills admin dashboard needs to see it from boot. +// +// Test: wiring_test.go::TestKnownLanes_NonEmpty. +func KnownLanes() []string { + return []string{ + LaneOllama, + LaneAnthropicThinking, + LaneAnthropicDefault, + LaneM1, + LaneLLMDefault, + "skill-default", + } +}