sync of changes

This commit is contained in:
Steve Dudenhoeffer 2024-11-09 19:50:14 -05:00
parent cc7b03c614
commit a83d5f9822
9 changed files with 491 additions and 95 deletions

View File

@ -1,15 +1,33 @@
package main
import (
"fmt"
"answer/pkg/answer"
"answer/pkg/cache"
"answer/pkg/search"
"context"
gollm "gitea.stevedudenhoeffer.com/steve/go-llm"
"github.com/urfave/cli"
"log/slog"
"os"
"strings"
"time"
)
// getKey resolves the API key to use. An explicitly supplied key always wins;
// when key is empty the value of the environment variable named by env is
// returned instead (which is itself empty when the variable is unset).
func getKey(key string, env string) string {
	if key == "" {
		return os.Getenv(env)
	}
	return key
}
func main() {
ctx := context.Background()
// Usage: go run cmd/answer.go question...
// - flags:
// --model=[model string such as openai/gpt-4o, anthropic/claude..., google/gemini-1.5. Default: openai/gpt-4o]
// --search-provider=[search provider string such as google, duckduckgo. Default: google]
// --cache-provider=[cache provider string such as memory, redis, file. Default: memory]
var app = cli.App{
Name: "answer",
@ -17,20 +35,123 @@ func main() {
Version: "0.1",
Description: "",
Flags: []cli.Flag{
&cli.StringFlag{
Name: "model",
Value: "openai/gpt-4o",
Usage: "model to use for answering the question, syntax: provider/model such as openai/gpt-4o",
},
&cli.StringFlag{
Name: "llm-key",
Value: "",
Usage: "key for the llm model (if empty, will use env var of PROVIDER_API_KEY, such as OPENAI_API_KEY)",
},
&cli.StringFlag{
Name: "search-provider",
Value: "google",
Usage: "search provider to use for searching the web",
},
&cli.StringFlag{
Name: "cache-provider",
Value: "memory",
Usage: "cache provider to use for caching search results",
},
},
// Action answers the question formed by joining all positional arguments.
// It wires up the cache, the search provider and the LLM selected by the
// flags, runs answer.Answer and logs every returned answer. With no
// arguments it prints the application help instead.
//
// NOTE: the "memory" (the flag default) and "redis" cache providers are not
// implemented and panic; only "file" is currently usable.
Action: func(c *cli.Context) error {
	// if there is no question to answer, print usage
	if c.NArg() == 0 {
		return cli.ShowAppHelp(c)
	}

	var question answer.Question

	// get the question
	fmt.Println("Head: ", c.Args().First())
	fmt.Println("Tail: ", c.Args().Tail())
	question.Question = strings.Join(c.Args(), " ")

	switch c.String("cache-provider") {
	case "memory":
		panic("not implemented")
	case "redis":
		panic("not implemented")
	case "file":
		question.Cache = &cache.Directory{
			BaseFolder: "cache",
			MaxLife:    1 * 24 * time.Hour,
		}
	default:
		panic("unknown cache provider")
	}

	if question.Cache == nil {
		panic("cache is nil")
	}

	// wrap the cache in a hasher
	question.Cache = cache.ShaWrapper{
		Cache: question.Cache,
	}

	switch c.String("search-provider") {
	case "google":
		question.Search = search.Google{Cache: question.Cache}
	default:
		panic("unknown search provider")
	}

	var llm gollm.LLM

	// The model flag has the form provider/model, e.g. openai/gpt-4o.
	model := c.String("model")
	parts := strings.Split(model, "/")
	if len(parts) != 2 {
		panic("invalid model, expected: provider/model (such as openai/gpt-4o)")
	}

	// An explicit --llm-key wins; otherwise fall back to the provider's
	// conventional PROVIDER_API_KEY environment variable.
	switch parts[0] {
	case "openai":
		llm = gollm.OpenAI(getKey(c.String("llm-key"), "OPENAI_API_KEY"))
	case "anthropic":
		// Was "ANTHROPI_API_KEY", which never matched the real variable.
		llm = gollm.Anthropic(getKey(c.String("llm-key"), "ANTHROPIC_API_KEY"))
	case "google":
		llm = gollm.Google(getKey(c.String("llm-key"), "GOOGLE_API_KEY"))
	default:
		panic("unknown model provider")
	}

	m, err := llm.ModelVersion(parts[1])
	if err != nil {
		panic(err)
	}
	question.Model = m

	answers, err := answer.Answer(ctx, question)
	if err != nil {
		panic(err)
	}

	for i, a := range answers {
		slog.Info("answer", "index", i, "answer", a)
	}

	return nil
},
}
app.Run()
err := app.Run(os.Args)
if err != nil {
slog.Error("Error: ", err)
}
}

25
go.mod
View File

@ -4,6 +4,14 @@ go 1.23.2
replace gitea.stevedudenhoeffer.com/steve/go-llm => ../go-llm
require (
gitea.stevedudenhoeffer.com/steve/go-llm v0.0.0-20241031152103-f603010dee49
github.com/advancedlogic/GoOse v0.0.0-20231203033844-ae6b36caf275
github.com/playwright-community/playwright-go v0.4702.0
github.com/rocketlaunchr/google-search v1.1.6
github.com/urfave/cli v1.22.16
)
require (
cloud.google.com/go v0.115.0 // indirect
cloud.google.com/go/ai v0.8.0 // indirect
@ -11,16 +19,22 @@ require (
cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
cloud.google.com/go/compute/metadata v0.3.0 // indirect
cloud.google.com/go/longrunning v0.5.7 // indirect
gitea.stevedudenhoeffer.com/steve/go-llm v0.0.0-20241031152103-f603010dee49 // indirect
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/antchfx/htmlquery v1.3.0 // indirect
github.com/antchfx/xmlquery v1.3.15 // indirect
github.com/antchfx/xpath v1.2.4 // indirect
github.com/araddon/dateparse v0.0.0-20180729174819-cfd92a431d0e // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
github.com/fatih/set v0.2.1 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-resty/resty/v2 v2.0.0 // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/gocolly/colly/v2 v2.1.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
@ -30,21 +44,26 @@ require (
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/liushuangls/go-anthropic/v2 v2.8.0 // indirect
github.com/rocketlaunchr/google-search v1.1.6 // indirect
github.com/mattn/go-runewidth v0.0.3 // indirect
github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84 // indirect
github.com/pkg/errors v0.8.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/sashabaranov/go-openai v1.31.0 // indirect
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
github.com/urfave/cli v1.22.16 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect
go.opentelemetry.io/otel v1.26.0 // indirect
go.opentelemetry.io/otel/metric v1.26.0 // indirect
go.opentelemetry.io/otel/trace v1.26.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/crypto v0.24.0 // indirect
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
golang.org/x/sync v0.9.0 // indirect

View File

@ -2,14 +2,13 @@ package answer
import (
"answer/pkg/cache"
"answer/pkg/extractor"
"answer/pkg/search"
"context"
"errors"
"fmt"
go_llm "gitea.stevedudenhoeffer.com/steve/go-llm"
"io"
gollm "gitea.stevedudenhoeffer.com/steve/go-llm"
"log/slog"
"net/http"
"net/url"
"strings"
)
@ -22,7 +21,7 @@ type Question struct {
// Question is the question to answer
Question string
Model go_llm.ChatCompletion
Model gollm.ChatCompletion
Search search.Search
@ -58,11 +57,11 @@ type Result struct {
Error error
}
func fanExecuteToolCalls(ctx context.Context, toolBox *go_llm.ToolBox, calls []go_llm.ToolCall) []Result {
func fanExecuteToolCalls(ctx context.Context, toolBox *gollm.ToolBox, calls []gollm.ToolCall) []Result {
var results []Result
var resultsOutput = make(chan Result, len(calls))
fnCall := func(call go_llm.ToolCall) Result {
fnCall := func(call gollm.ToolCall) Result {
str, err := toolBox.Execute(ctx, call)
if err != nil {
return Result{
@ -76,7 +75,7 @@ func fanExecuteToolCalls(ctx context.Context, toolBox *go_llm.ToolBox, calls []g
}
for _, call := range calls {
go func(call go_llm.ToolCall) {
go func(call gollm.ToolCall) {
resultsOutput <- fnCall(call)
}(call)
}
@ -97,7 +96,7 @@ type article struct {
Body string
}
func extractArticle(ctx context.Context, u *url.URL) (res article, err error) {
func extractArticle(ctx context.Context, c cache.Cache, u *url.URL) (res article, err error) {
defer func() {
e := recover()
@ -110,49 +109,38 @@ func extractArticle(ctx context.Context, u *url.URL) (res article, err error) {
}
}()
req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil)
extractors := extractor.MultiExtractor(
extractor.CacheExtractor{
Cache: c,
Tag: "goose",
Extractor: extractor.GooseExtractor{},
},
extractor.CacheExtractor{
Cache: c,
Tag: "playwright",
Extractor: extractor.PlaywrightExtractor{},
},
)
a, err := extractors.Extract(ctx, u.String())
if err != nil {
return res, fmt.Errorf("error creating request: %w", err)
return article{
URL: "",
Title: "",
Body: "",
}, err
}
resp, err := c.cl.Do(req)
if err != nil {
return res, fmt.Errorf("error getting response: %w", err)
}
defer func(Body io.ReadCloser) {
err := Body.Close()
if err != nil {
slog.Error("error closing body", "error", err)
}
}(resp.Body)
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", fmt.Errorf("bad response: %d: %s", resp.StatusCode, resp.Status)
}
b, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("error reading body: %w", err)
}
g := goose.New()
article, err := g.ExtractFromRawHTML(string(b), target)
if err != nil {
return "", fmt.Errorf("error extracting article: %w", err)
}
return article.CleanedText, nil
panic("not implemented")
return article{}, nil
return article{
URL: a.URL,
Title: a.Title,
Body: a.Body,
}, nil
}
func doesTextAnswerQuestion(ctx context.Context, q Question, text string) (string, error) {
fnAnswer := go_llm.NewFunction(
fnAnswer := gollm.NewFunction(
"answer",
"The answer from the given text that answers the question.",
func(ctx context.Context, args struct {
@ -161,29 +149,29 @@ func doesTextAnswerQuestion(ctx context.Context, q Question, text string) (strin
return args.Answer, nil
})
fnNoAnswer := go_llm.NewFunction(
fnNoAnswer := gollm.NewFunction(
"no_answer",
"Indicate that the text does not answer the question.",
func(ctx context.Context, args struct{}) (string, error) {
return "", nil
})
req := go_llm.Request{
Messages: []go_llm.Message{
req := gollm.Request{
Messages: []gollm.Message{
{
Role: go_llm.RoleSystem,
Role: gollm.RoleSystem,
Text: "Evaluate the given text to see if it answers the question from the user. The text is as follows:",
},
{
Role: go_llm.RoleSystem,
Role: gollm.RoleSystem,
Text: text,
},
{
Role: go_llm.RoleUser,
Role: gollm.RoleUser,
Text: q.Question,
},
},
Toolbox: go_llm.NewToolBox(fnAnswer, fnNoAnswer),
Toolbox: gollm.NewToolBox(fnAnswer, fnNoAnswer),
}
res, err := q.Model.ChatComplete(ctx, req)
@ -224,7 +212,7 @@ func functionSearch(ctx context.Context, q Question, searchTerm string) (string,
continue
}
a, err := extractArticle(ctx, u)
a, err := extractArticle(ctx, q.Cache, u)
if err != nil {
continue
@ -248,7 +236,7 @@ func functionSearch(ctx context.Context, q Question, searchTerm string) (string,
}
func functionThink(ctx context.Context, q Question) (string, error) {
fnAnswer := go_llm.NewFunction(
fnAnswer := gollm.NewFunction(
"answer",
"Answer the question.",
func(ctx context.Context, args struct {
@ -258,18 +246,18 @@ func functionThink(ctx context.Context, q Question) (string, error) {
})
var temp float32 = 0.8
req := go_llm.Request{
Messages: []go_llm.Message{
req := gollm.Request{
Messages: []gollm.Message{
{
Role: go_llm.RoleSystem,
Role: gollm.RoleSystem,
Text: "Answer the given question as accurately and concisely as possible using the answer function.",
},
{
Role: go_llm.RoleUser,
Role: gollm.RoleUser,
Text: q.Question,
},
},
Toolbox: go_llm.NewToolBox(fnAnswer),
Toolbox: gollm.NewToolBox(fnAnswer),
Temperature: &temp,
}
@ -291,7 +279,7 @@ func functionThink(ctx context.Context, q Question) (string, error) {
}
func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
fnSearch := go_llm.NewFunction(
fnSearch := gollm.NewFunction(
"search",
"Search the web for an answer to a question. You can call this function up to "+fmt.Sprint(o.MaxSearches)+" times.",
func(ctx context.Context, args struct {
@ -304,7 +292,7 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
return functionSearch(ctx, q2, args.SearchQuery)
})
fnThink := go_llm.NewFunction(
fnThink := gollm.NewFunction(
"think",
"Think about a question. This is useful for breaking down complex questions into smaller parts that are easier to answer.",
func(ctx context.Context, args struct {
@ -316,7 +304,7 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
return functionThink(ctx, q2)
})
fnAnswer := go_llm.NewFunction(
fnAnswer := gollm.NewFunction(
"answer",
"You definitively answer a question, if you call this it means you know the answer and do not need to search for it or use any other function to find it",
func(ctx context.Context, args struct {
@ -325,7 +313,7 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
return args.Answer, nil
})
var funcs = []*go_llm.Function{fnAnswer}
var funcs = []*gollm.Function{fnAnswer}
if o.MaxSearches > 0 {
funcs = append(funcs, fnSearch)
@ -337,18 +325,18 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
var temp float32 = 0.8
req := go_llm.Request{
Messages: []go_llm.Message{
req := gollm.Request{
Messages: []gollm.Message{
{
Role: go_llm.RoleSystem,
Role: gollm.RoleSystem,
Text: "You are being asked to answer a question. You must respond with a function. You can answer it if you know the answer, or if some functions exist you can use those to help you find the answer.",
},
{
Role: go_llm.RoleUser,
Role: gollm.RoleUser,
Text: q.Question,
},
},
Toolbox: go_llm.NewToolBox(funcs...),
Toolbox: gollm.NewToolBox(funcs...),
Temperature: &temp,
}
@ -366,29 +354,29 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
res.Choices = res.Choices[:o.MaxSearches]
}
var answers []QuestionAnswer
choicesOutput := make(chan QuestionAnswer, len(res.Choices))
var answers Answers
choicesOutput := make(chan string, len(res.Choices))
for _, choice := range res.Choices {
fnChoice := func(choice go_llm.ResponseChoice) QuestionAnswer {
var calls []CallResult
var callsOutput = make(chan CallResult, len(choice.Calls))
fnCall := func(call go_llm.ToolCall) CallResult {
fnChoice := func(choice gollm.ResponseChoice) string {
var calls []Result
var callsOutput = make(chan Result, len(choice.Calls))
fnCall := func(call gollm.ToolCall) Result {
str, err := req.Toolbox.Execute(ctx, call)
if err != nil {
return CallResult{
return Result{
Error: err,
}
}
return CallResult{
return Result{
Result: str,
}
}
for _, call := range choice.Calls {
go func(call go_llm.ToolCall) {
go func(call gollm.ToolCall) {
callsOutput <- fnCall(call)
}(call)
}
@ -402,8 +390,12 @@ func (o Options) Answer(ctx context.Context, q Question) (Answers, error) {
}
answers = append(answers, fnChoice(choice))
}
return answers, nil
}
func Answer(ctx context.Context, q Question) (Answers, error) {

51
pkg/cache/cache.go vendored
View File

@ -1,6 +1,16 @@
package cache
import "io"
import (
"crypto/sha256"
"errors"
"fmt"
"io"
)
// ErrNotFound is the sentinel error returned when the requested key has no
// entry in the cache.
var ErrNotFound = errors.New("key not found")
type Cache interface {
Get(key string, writer io.Writer) error
@ -13,3 +23,42 @@ type Cache interface {
Delete(key string) error
}
// ShaWrapper decorates another Cache: every key is replaced by the
// hex-encoded SHA-256 digest of the key before the call is forwarded to the
// wrapped Cache. Values are passed through untouched.
type ShaWrapper struct {
	// Cache is the underlying cache that receives the hashed keys.
	Cache Cache
}

// hash returns the lowercase hexadecimal SHA-256 digest of key.
func (sw ShaWrapper) hash(key string) string {
	digest := sha256.Sum256([]byte(key))
	return fmt.Sprintf("%x", digest[:])
}

// Get forwards to the wrapped cache's Get using the hashed key.
func (sw ShaWrapper) Get(key string, writer io.Writer) error {
	hashed := sw.hash(key)
	return sw.Cache.Get(hashed, writer)
}

// GetString forwards to the wrapped cache's GetString using the hashed key.
func (sw ShaWrapper) GetString(key string) (string, error) {
	hashed := sw.hash(key)
	return sw.Cache.GetString(hashed)
}

// GetJSON forwards to the wrapped cache's GetJSON using the hashed key.
func (sw ShaWrapper) GetJSON(key string, value any) error {
	hashed := sw.hash(key)
	return sw.Cache.GetJSON(hashed, value)
}

// Set forwards to the wrapped cache's Set using the hashed key.
func (sw ShaWrapper) Set(key string, value io.Reader) error {
	hashed := sw.hash(key)
	return sw.Cache.Set(hashed, value)
}

// SetJSON forwards to the wrapped cache's SetJSON using the hashed key.
func (sw ShaWrapper) SetJSON(key string, value any) error {
	hashed := sw.hash(key)
	return sw.Cache.SetJSON(hashed, value)
}

// SetString forwards to the wrapped cache's SetString using the hashed key.
func (sw ShaWrapper) SetString(key string, value string) error {
	hashed := sw.hash(key)
	return sw.Cache.SetString(hashed, value)
}

// Delete forwards to the wrapped cache's Delete using the hashed key.
func (sw ShaWrapper) Delete(key string) error {
	hashed := sw.hash(key)
	return sw.Cache.Delete(hashed)
}

View File

@ -14,8 +14,7 @@ import (
type Directory struct {
BaseFolder string
MaxLife time.Duration
lock sync.Mutex
lock sync.Mutex
}
var _ Cache = &Directory{}
@ -76,7 +75,16 @@ func (d *Directory) AutoCleanupRoutine(ctx context.Context) error {
// openFile opens the file backing the cache entry for key for reading.
//
// A missing file is reported as ErrNotFound so callers can tell a cache miss
// apart from a genuine I/O failure; any other error from os.Open is returned
// unchanged. On success the caller owns the returned file and must close it.
func (d *Directory) openFile(key string) (*os.File, error) {
	f, err := os.Open(d.GetPath(key))
	if err != nil {
		if os.IsNotExist(err) {
			return nil, ErrNotFound
		}
		return nil, err
	}

	return f, nil
}
func (d *Directory) Set(key string, value io.Reader) error {

View File

@ -0,0 +1,87 @@
package extractor
import (
"answer/pkg/cache"
"context"
"errors"
)
// ErrFailedToExtract is the sentinel error for an unsuccessful extraction.
// The multi-extractor returns it when none of its extractors succeeded and no
// other error was recorded, and it skips (without recording) any extractor
// whose error matches it.
var ErrFailedToExtract = errors.New("failed to extract")
// Article is the result of extracting the contents of a URL.
type Article struct {
// URL is the address the article was extracted from.
URL string
// Title is the title of the extracted page.
Title string
// Body is the extracted text content of the page.
Body string
}
// Extractor is an interface of systems that can extract the contents of a URL
// as an Article.
type Extractor interface {
// Extract extracts the contents found at url and returns them as an Article,
// or a non-nil error when the extraction did not succeed.
Extract(ctx context.Context, url string) (Article, error)
}
// multiExtractor chains several extractors and tries them in order.
type multiExtractor struct {
	extractors []Extractor
}

// Compile-time check that multiExtractor satisfies Extractor.
var _ Extractor = multiExtractor{}

// Extract runs each configured extractor in order and returns the first
// article that is produced without error.
//
// An extractor failing with ErrFailedToExtract is skipped without its error
// being recorded; every other error is collected. When no extractor
// succeeds, the collected errors are returned joined together, or
// ErrFailedToExtract when nothing was collected.
func (m multiExtractor) Extract(ctx context.Context, url string) (Article, error) {
	var failures []error

	for i := range m.extractors {
		got, err := m.extractors[i].Extract(ctx, url)
		switch {
		case err == nil:
			return got, nil
		case errors.Is(err, ErrFailedToExtract):
			// Soft failure: move on to the next extractor.
		default:
			failures = append(failures, err)
		}
	}

	if len(failures) == 0 {
		return Article{}, ErrFailedToExtract
	}
	return Article{}, errors.Join(failures...)
}

// MultiExtractor builds an Extractor that tries the given extractors in the
// order supplied.
func MultiExtractor(e ...Extractor) Extractor {
	return multiExtractor{extractors: e}
}
// CacheExtractor wraps another Extractor with a read-through cache: articles
// are looked up in Cache first and only extracted (and then stored) on a
// miss.
type CacheExtractor struct {
	// Cache stores extracted articles as JSON.
	Cache cache.Cache
	// Tag namespaces the cache keys so different extractors do not share
	// entries for the same URL. An empty Tag means "defaultextractor".
	Tag string
	// Extractor performs the actual extraction on a cache miss.
	Extractor Extractor
}

// Compile-time check that CacheExtractor satisfies Extractor.
var _ Extractor = CacheExtractor{}

// Extract returns the cached article for url when one can be read from the
// cache; otherwise it delegates to the wrapped Extractor and stores the
// result under the key "<tag>:<url>".
//
// Errors from the wrapped Extractor are returned unchanged. Cache failures
// never fail the call: a read error is treated as a miss and a write error is
// ignored, because the cache is only an optimisation.
func (c CacheExtractor) Extract(ctx context.Context, url string) (Article, error) {
	tag := c.Tag
	if tag == "" {
		// No trailing colon here: the separator is added when building the
		// key, so the old "defaultextractor:" produced "defaultextractor::url".
		tag = "defaultextractor"
	}
	key := tag + ":" + url

	var cached Article
	if err := c.Cache.GetJSON(key, &cached); err == nil {
		return cached, nil
	}

	article, err := c.Extractor.Extract(ctx, url)
	if err != nil {
		return Article{}, err
	}

	// Best effort, matching how search results are cached: a failed cache
	// write must not throw away an article that was extracted successfully.
	_ = c.Cache.SetJSON(key, article)

	return article, nil
}

25
pkg/extractor/goose.go Normal file
View File

@ -0,0 +1,25 @@
package extractor
import (
"context"
goose "github.com/advancedlogic/GoOse"
)
// GooseExtractor is an Extractor backed by the GoOse article extraction
// library.
type GooseExtractor struct{}

// Extract fetches url through GoOse and maps the result onto an Article:
// Title comes from the GoOse title and Body from its cleaned text. The
// context is not used by this implementation. On failure the GoOse error is
// returned together with an Article carrying only the URL.
func (GooseExtractor) Extract(ctx context.Context, url string) (Article, error) {
	g := goose.New()

	parsed, err := g.ExtractFromURL(url)
	if err != nil {
		return Article{URL: url}, err
	}

	return Article{
		URL:   url,
		Title: parsed.Title,
		Body:  parsed.CleanedText,
	}, nil
}

View File

@ -0,0 +1,81 @@
package extractor
import (
"context"
"fmt"
"github.com/playwright-community/playwright-go"
"os"
)
// PlaywrightExtractor is an Extractor that loads the page in a Chromium
// browser launched through Playwright and pulls the article text out with
// Readability.js.
type PlaywrightExtractor struct {
}
// Compile-time check that PlaywrightExtractor satisfies Extractor.
var _ Extractor = PlaywrightExtractor{}
// getReadabilityJS loads the Readability script from the file
// "readability.js", resolved relative to the current working directory, and
// returns its contents. On a read failure it returns the empty string and the
// error from os.ReadFile.
func getReadabilityJS() (script string, err error) {
	var raw []byte
	if raw, err = os.ReadFile("readability.js"); err != nil {
		return "", err
	}
	script = string(raw)
	return script, nil
}
// Extract loads url in a Chromium browser started through Playwright, injects
// Readability.js into the page and returns the text content Readability
// parses out, together with the page title.
//
// The context is not used by this implementation. On any failure the
// returned Article carries only the URL and the error describes the step that
// failed (the underlying error is wrapped). An error is also returned when
// Readability yields no string content for the page.
func (p PlaywrightExtractor) Extract(_ context.Context, url string) (Article, error) {
	article := Article{
		URL: url,
	}

	// Load the script before touching Playwright: if it is missing there is
	// no point in starting a browser and navigating only to fail afterwards.
	readabilityJS, err := getReadabilityJS()
	if err != nil {
		return article, fmt.Errorf("loading readability.js: %w", err)
	}

	pw, err := playwright.Run()
	if err != nil {
		return article, fmt.Errorf("starting playwright: %w", err)
	}
	defer pw.Stop()

	browser, err := pw.Chromium.Launch()
	if err != nil {
		return article, fmt.Errorf("launching chromium: %w", err)
	}
	defer browser.Close()

	page, err := browser.NewPage()
	if err != nil {
		return article, fmt.Errorf("opening page: %w", err)
	}
	defer page.Close()

	_, err = page.Goto(url)
	if err != nil {
		return article, fmt.Errorf("navigating to %q: %w", url, err)
	}

	// Inject Readability.js
	_, err = page.AddScriptTag(playwright.PageAddScriptTagOptions{
		Content: &readabilityJS,
	})
	if err != nil {
		return article, fmt.Errorf("injecting readability.js: %w", err)
	}

	// Run Readability and get the article content
	content, err := page.Evaluate(`() => {
            let article = new Readability(document).parse();
            return article ? article.textContent : null;
        }`)
	if err != nil {
		return article, fmt.Errorf("running readability: %w", err)
	}

	// The script returns null when Readability cannot parse the page, in
	// which case content is not a string.
	text, ok := content.(string)
	if !ok {
		return article, fmt.Errorf("failed to convert content to string")
	}

	article.Body = text
	// The title is best effort: a failure to read it leaves Title empty
	// rather than failing an otherwise successful extraction.
	article.Title, _ = page.Title()

	return article, nil
}

View File

@ -1,18 +1,30 @@
package search
import (
"answer/pkg/cache"
"context"
googlesearch "github.com/rocketlaunchr/google-search"
"sort"
)
// Google is a Search implementation that queries Google through the
// googlesearch package and caches the results.
type Google struct {
// Cache stores search results as JSON under the key "google:" + search.
Cache cache.Cache
}
// Compile-time check that Google satisfies Search.
var _ Search = Google{}
func (Google) Search(ctx context.Context, search string) ([]Result, error) {
res, err := googlesearch.Search(ctx, search, googlesearch.SearchOptions{
func (g Google) Search(ctx context.Context, search string) ([]Result, error) {
var res []Result
key := "google:" + search
err := g.Cache.GetJSON(key, &res)
if err == nil {
return res, nil
}
results, err := googlesearch.Search(ctx, search, googlesearch.SearchOptions{
CountryCode: "",
LanguageCode: "",
Limit: 0,
@ -27,18 +39,20 @@ func (Google) Search(ctx context.Context, search string) ([]Result, error) {
return nil, err
}
var results []Result
// just in case, sort the res by rank, as the api does not mention it is sorted
sort.Slice(res, func(i, j int) bool {
return res[i].Rank < res[j].Rank
return results[i].Rank < results[j].Rank
})
for _, r := range res {
results = append(results, Result{
for _, r := range results {
res = append(res, Result{
Title: r.Title,
URL: r.URL,
Description: r.Description,
})
}
_ = g.Cache.SetJSON(key, res)
return res, nil
}