Files
answer/pkg/answer/search.go
Steve Dudenhoeffer 693ac4e6a7 Add core implementation for AI-powered question answering
Introduce multiple agents, tools, and utilities for processing, extracting, and answering user-provided questions using LLMs and external data. Key features include knowledge processing, question splitting, search term generation, and contextual knowledge handling.
2025-03-21 11:10:48 -04:00

654 lines
16 KiB
Go

package answer
import (
"encoding/json"
"fmt"
"log/slog"
"net/url"
"slices"
"strings"
"time"
"gitea.stevedudenhoeffer.com/steve/answer/pkg/agent"
"gitea.stevedudenhoeffer.com/steve/go-extractor"
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
"gitea.stevedudenhoeffer.com/steve/answer/pkg/search"
gollm "gitea.stevedudenhoeffer.com/steve/go-llm"
)
// Hard limits applied to a single search by functionSearch and functionSearch2.
const (
	// kMaxReads is the number of pages the 'read' tool may fetch before it
	// starts refusing and is dropped from the toolbox.
	kMaxReads = 10

	// kMaxLoadMore is the number of times the 'next_page' tool may load
	// additional search results.
	kMaxLoadMore = 3

	// kMaxLoops bounds how many model turns the search loop runs before it
	// stops without an answer.
	kMaxLoops = 10
)
// searchResults pairs the answer extracted from a single page with the URL of
// that page. It is serialised as {"url": "...", "answer": "..."}.
type searchResults struct {
	Url    string `json:"url"`
	Answer string `json:"answer"`
}

// String renders s as its JSON encoding.
//
// The (string, error) signature means this method does not satisfy
// fmt.Stringer; the error is whatever json.Marshal reported.
func (s searchResults) String() (string, error) {
	encoded, marshalErr := json.Marshal(s)
	if marshalErr == nil {
		return string(encoded), nil
	}
	return "", marshalErr
}
// pickResult asks the model attached to q which of results should be read next.
//
// With no results it returns (nil, nil); with exactly one it returns that
// result without consulting the model. Otherwise every result is sent to the
// model as a JSON-encoded user message and the model is offered two tools:
// "pick" (choose a URL) and "no_pick" (decline all of them). Only the first
// tool call of the first choice is executed.
//
// A nil result with a nil error means nothing was chosen: the model returned no
// choices or no tool calls, called no_pick, named a URL that is not in results,
// or produced a tool call that could not be executed. A non-nil error is
// returned when a result cannot be encoded or the chat completion fails.
//
// The returned pointer refers to an element of results.
func pickResult(ctx *gollm.Context, results []search.Result, q Question) (*search.Result, error) {
	switch len(results) {
	case 0:
		// nothing to choose from
		return nil, nil
	case 1:
		// a single candidate needs no model round-trip
		return &results[0], nil
	}

	var (
		pick    *search.Result
		refused bool
	)

	// Tool the model calls to select a result by URL.
	fnPick := gollm.NewFunction(
		"pick",
		"The search result to read next.",
		func(ctx *gollm.Context, args struct {
			URL string `description:"the url to read next"`
		}) (string, error) {
			// Index into the slice rather than taking the address of the
			// range variable, so the pointer refers to the caller's element.
			for i := range results {
				if results[i].URL == args.URL {
					pick = &results[i]
					break
				}
			}
			return "", nil
		})

	// Tool the model calls to reject every result.
	fnNoPick := gollm.NewFunction(
		"no_pick",
		"Indicate that there are no results worth reading.",
		func(ctx *gollm.Context, args struct {
			Ignored string `description:"ignored, just here to make sure the function is called. Fill with anything."`
		}) (string, error) {
			refused = true
			return "", nil
		})

	req := gollm.Request{
		Messages: []gollm.Message{
			{
				Role: gollm.RoleSystem,
				Text: `You are being given results from a web search. Please select the result you would like to read next to answer the question. Try to pick the most reputable and relevant result.
The results will be in the JSON format of: {"Url": "https://url.here", "Title": "Title Of Search", "Description": "description here"}`,
			},
			{
				Role: gollm.RoleSystem,
				Text: "The question you are trying to answer is: " + q.Question,
			},
		},
		Toolbox: gollm.NewToolBox(fnPick, fnNoPick),
	}

	// One user message per candidate result.
	for _, r := range results {
		b, err := json.Marshal(r)
		if err != nil {
			return nil, fmt.Errorf("marshaling search result: %w", err)
		}
		req.Messages = append(req.Messages, gollm.Message{
			Role: gollm.RoleUser,
			Text: string(b),
		})
	}

	res, err := q.Model.ChatComplete(ctx, req)
	if err != nil {
		return nil, err
	}
	if len(res.Choices) == 0 || len(res.Choices[0].Calls) == 0 {
		return nil, nil
	}

	// A tool call that cannot be executed is treated as "no pick" rather than
	// a hard failure, but it is no longer swallowed silently.
	if _, err := req.Toolbox.Execute(ctx, res.Choices[0].Calls[0]); err != nil {
		slog.Warn("could not execute pick tool call", "error", err)
	}

	if refused {
		return nil, nil
	}
	return pick, nil
}
// internalSearch runs searchTerm through q.Search and reads the returned pages
// one at a time until one of them answers q.
//
// While more than one candidate remains, pickResult asks the model which page
// to read next; the last remaining candidate is read without asking. A chosen
// candidate is removed from the pool before it is read, so a page whose URL is
// empty or unparsable, that cannot be extracted, or that does not answer the
// question is never offered again and every iteration shrinks the pool.
//
// Returns:
//   - the URL and answer of the first page for which doesTextAnswerQuestion
//     yields a non-empty answer;
//   - a searchResults with Url "not-found" and a nil error when the search is
//     empty, pickResult selects nothing, or no page answers the question;
//   - a non-nil error only when the search itself or pickResult fails.
func internalSearch(ctx *gollm.Context, q Question, searchTerm string) (searchResults, error) {
	slog.Info("searching", "search", searchTerm, "question", q)

	results, err := q.Search.Search(ctx, searchTerm)
	if err != nil {
		return searchResults{}, err
	}
	if len(results) == 0 {
		return searchResults{Url: "not-found", Answer: "no search results found"}, nil
	}

	for len(results) > 0 {
		var picked search.Result

		if len(results) == 1 {
			picked = results[0]
			results = results[1:]
		} else {
			pick, err := pickResult(ctx, results, q)
			slog.Info("picked result", "result", pick, "error", err)
			if err != nil {
				return searchResults{}, err
			}
			if pick == nil {
				break
			}

			// Copy the choice, then drop it (and any duplicates of its URL)
			// from the pool. Without this the same result could be picked
			// again after a failed read and the loop would never terminate.
			picked = *pick
			remaining := make([]search.Result, 0, len(results))
			for _, r := range results {
				if r.URL != picked.URL {
					remaining = append(remaining, r)
				}
			}
			if len(remaining) == len(results) {
				// The pick did not come from the pool; stop rather than spin.
				break
			}
			results = remaining
		}

		trimmed := strings.TrimSpace(picked.URL)
		if trimmed == "" {
			slog.Warn("skipping search result with empty url")
			continue
		}

		slog.Info("extracting article", "url", trimmed)

		u, err := url.Parse(trimmed)
		if err != nil {
			slog.Warn("skipping search result with unparsable url", "url", trimmed, "error", err)
			continue
		}

		a, err := extractArticle(ctx, q.Cache, u)
		if err != nil {
			slog.Warn("skipping search result that could not be extracted", "url", trimmed, "error", err)
			continue
		}

		slog.Info("extracted article", "url", a.URL, "title", a.Title, "body", a.Body)

		// Pages without both a title and a body are treated as unreadable.
		if a.Title == "" || a.Body == "" {
			continue
		}

		answer, err := doesTextAnswerQuestion(ctx, q, a.Body)
		if err != nil {
			slog.Error("error checking if text answers question", "question", q.Question, "error", err)
			continue
		}
		if answer != "" {
			return searchResults{Url: u.String(), Answer: answer}, nil
		}
	}

	return searchResults{Url: "not-found", Answer: "no searched results answered"}, nil
}
// searchResults2 is the final outcome of an agent-driven search: the answer
// text and the source URLs reported alongside it. It is serialised as
// {"answer": "...", "urls": [...]}; a nil Urls slice encodes as null.
type searchResults2 struct {
	Answer string   `json:"answer"`
	Urls   []string `json:"urls"`
}

// String renders r as its JSON encoding.
//
// The (string, error) signature means this method does not satisfy
// fmt.Stringer; the error is whatever json.Marshal reported.
func (r searchResults2) String() (string, error) {
	encoded, marshalErr := json.Marshal(r)
	if marshalErr == nil {
		return string(encoded), nil
	}
	return "", marshalErr
}
// functionSearch2 answers q by letting an agent drive a DuckDuckGo results page.
//
// The browser is taken from ctx under the key "browser"; the search is opened
// with safe search off and region "us-en". The initial results are given to the
// agent as JSON user messages, after which the agent is run for at most
// kMaxLoops turns with these tools:
//
//   - read:      extract a page and report whether it answers the question
//     (offered until kMaxReads pages have been read);
//   - next_page: load more results and add the ones not seen before
//     (offered until it has been used kMaxLoadMore times);
//   - answer:    record the final answer and its source URLs, ending the loop;
//   - give_up:   end the loop without an answer.
//
// Returns the recorded answer with a nil error when the agent called 'answer'
// with a non-empty answer. Returns an error when the browser is missing from
// ctx, the search page cannot be opened, the agent fails, the agent gave up, or
// the loop ended without a non-empty answer.
func functionSearch2(ctx *gollm.Context, q Question, searchTerm string) (searchResults2, error) {
	var res searchResults2

	browser, ok := ctx.Value("browser").(extractor.Browser)
	if !ok {
		return searchResults2{}, fmt.Errorf("browser not found in context")
	}

	cfg := duckduckgo.Config{
		SafeSearch: duckduckgo.SafeSearchOff,
		Region:     "us-en",
	}

	page, err := cfg.OpenSearch(ctx, browser, searchTerm)
	if err != nil {
		return searchResults2{}, fmt.Errorf("failed to open search page: %w", err)
	}
	// Register the close only once the page is known to have opened.
	defer deferClose(page)

	var (
		totalNextPage int
		totalRead     int

		// answered and gaveUp are kept separate so that a successful 'answer'
		// call is not reported to the caller as having given up.
		answered bool
		gaveUp   bool

		// oldResults are all the results already shown to the agent, so that
		// when more are loaded the ones seen before can be filtered out.
		oldResults []duckduckgo.Result
	)

	// filterResults drops results lacking a title or description and results
	// that are already in oldResults.
	filterResults := func(results []duckduckgo.Result) []duckduckgo.Result {
		var fresh []duckduckgo.Result
		for _, r := range results {
			if r.Title == "" || r.Description == "" {
				continue
			}
			if slices.Contains(oldResults, r) {
				continue
			}
			fresh = append(fresh, r)
		}
		return fresh
	}

	a := agent.NewAgent(gollm.Request{
		Messages: []gollm.Message{
			{
				Role: gollm.RoleSystem,
				Text: `You are trying to answer a question by reading pages from a search engine.
Use 'read' to read a page. You can only read 10 pages total, so try to only pick high quality pages. Results of a read will be in the format of {"url": "https://url.here", "answer": "answer here"}.
Additionally, you can use 'next_page' to load more results. You can only use next_page 3 times total.
You can read multiple pages at once, or read one page and continue to the next page if you need more information.
But if you are confident in your answer, you can use 'answer' to provide the answer.
Or you can use 'give_up' to indicate that you cannot find an answer and give up.`,
			},
			{
				Role: gollm.RoleSystem,
				Text: "The question you are trying to answer is: " + q.Question,
			},
			{
				Role: gollm.RoleSystem,
				Text: "The search terms you used were: " + searchTerm,
			},
			{
				Role: gollm.RoleSystem,
				Text: `The search results will be provided by the user in json format of: {"url": "https://url.here", "title": "Title Of Page", "description": "description here"}`,
			},
		},
	})
	a.Model = q.Model

	// addMessages hands each result to the agent as one JSON user message.
	addMessages := func(results []duckduckgo.Result) {
		type resultMessage struct {
			Url   string `json:"url"`
			Title string `json:"title"`
			Desc  string `json:"description"`
		}

		for _, r := range results {
			// Marshal error ignored: the value is a struct of three strings.
			b, _ := json.Marshal(&resultMessage{Url: r.URL, Title: r.Title, Desc: r.Description})
			a.AddMessage(gollm.Message{
				Role: gollm.RoleUser,
				Text: string(b),
			})
		}
	}

	fnRead := gollm.NewFunction(
		"read",
		`Read a page from the search results. The results will be in the JSON format of: {"url": "https://url.here", "answer": "answer here"}`,
		func(ctx *gollm.Context, args struct {
			URL string `description:"the url to read"`
		}) (string, error) {
			slog.Info("read", "url", args.URL)

			if totalRead >= kMaxReads {
				return "you have read the maximum number of pages", nil
			}
			// The attempt counts against the budget even if it fails below.
			totalRead++

			u, err := url.Parse(args.URL)
			if err != nil {
				return "", fmt.Errorf("failed to parse url: %w", err)
			}

			article, err := extractArticle(ctx, q.Cache, u)
			if err != nil {
				return "", fmt.Errorf("failed to extract article: %w", err)
			}
			// Logged only after the error check so a failed extraction is
			// never dereferenced.
			slog.Info("extracted article", "url", article.URL, "title", article.Title, "body", article.Body)

			if article.Title == "" || article.Body == "" {
				return "couldn't read the page", nil
			}

			answer, err := doesTextAnswerQuestion(ctx, q, article.Body)
			if err != nil {
				return "", fmt.Errorf("failed to check if text answers question: %w", err)
			}

			return searchResults{Url: u.String(), Answer: answer}.String()
		})

	fnNextPage := gollm.NewFunction(
		"next_page",
		"Load more results from the search engine.",
		func(ctx *gollm.Context, args struct {
			Ignored string `description:"ignored, just here to make sure the function is called. Fill with anything."`
		}) (string, error) {
			if totalNextPage >= kMaxLoadMore {
				return "you have loaded the maximum number of pages", nil
			}
			totalNextPage++

			if err := page.LoadMore(); err != nil {
				return "", fmt.Errorf("failed to load more results: %w", err)
			}

			// presumably gives the page time to render the newly loaded
			// results before they are scraped — TODO confirm
			time.Sleep(4 * time.Second)

			// only add the new results here...
			fresh := filterResults(page.GetResults())
			oldResults = append(oldResults, fresh...)
			addMessages(fresh)

			return "ok", nil
		})

	fnAnswer := gollm.NewFunction(
		"answer",
		"Provide the answer to the question.",
		func(ctx *gollm.Context, args struct {
			Answer  string   `description:"the answer to the question"`
			Sources []string `description:"the urls of sources used to find the answer"`
		}) (string, error) {
			res.Answer = args.Answer
			res.Urls = args.Sources
			answered = true
			return "ok", nil
		})

	fnGiveUp := gollm.NewFunction(
		"give_up",
		"Indicate that you cannot find an answer and give up.",
		func(ctx *gollm.Context, args struct {
			Ignored string `description:"ignored, just here to make sure the function is called. Fill with anything."`
		}) (string, error) {
			gaveUp = true
			return "ok", nil
		})

	// do initial load of results
	initial := filterResults(page.GetResults())
	oldResults = append(oldResults, initial...)
	addMessages(initial)

	for i := 0; i < kMaxLoops && !answered && !gaveUp; i++ {
		// figure out the allowed tools, based on the remaining budgets
		tools := []*gollm.Function{
			fnAnswer,
			fnGiveUp,
		}
		if totalRead < kMaxReads {
			tools = append(tools, fnRead)
		}
		if totalNextPage < kMaxLoadMore {
			tools = append(tools, fnNextPage)
		}

		a.ToolBox = gollm.NewToolBox(tools...)

		err = a.Execute(ctx, gollm.Message{Role: gollm.RoleSystem, Text: "Now evaluate if the text answers the question, and use a function to either provide the answer or read more pages."})
		if err != nil {
			return searchResults2{}, fmt.Errorf("failed to run agent: %w", err)
		}
	}

	// A recorded, non-empty answer wins over everything else.
	if answered && res.Answer != "" {
		return res, nil
	}
	if gaveUp {
		return res, fmt.Errorf("gave up: no relevant results found")
	}
	return res, fmt.Errorf("no answer found")
}
// functionSearch answers q by letting the model drive a DuckDuckGo results
// page through direct chat completions on q.Model.
//
// The browser is taken from ctx under the key "browser"; the search is opened
// with safe search off and region "us-en". The initial results are appended to
// the request as JSON user messages. The request is then sent for at most
// kMaxLoops turns, executing the first tool call of the first choice each
// turn. The tools are:
//
//   - read:      extract a page and report whether it answers the question
//     (offered until kMaxReads pages have been read);
//   - next_page: load more results and append the ones not seen before
//     (offered until it has been used kMaxLoadMore times);
//   - answer:    record the final answer and its source URLs, ending the loop;
//   - give_up:   end the loop without an answer.
//
// The loop also ends when the model returns no choices or no tool calls.
//
// Returns the recorded answer with a nil error when the model called 'answer'
// with a non-empty answer. Returns an error when the browser is missing from
// ctx, the search page cannot be opened, a completion or tool call fails, the
// model gave up, or the loop ended without a non-empty answer.
//
// NOTE(review): the value returned by the executed tool (for example the JSON
// produced by 'read') is discarded and nothing but 'next_page' appends to
// req.Messages, so the model does not appear to see tool output on its next
// turn — confirm whether Toolbox.Execute records it elsewhere. Likewise only
// Calls[0] is executed even though the prompt allows reading several pages at
// once.
func functionSearch(ctx *gollm.Context, q Question, searchTerm string) (searchResults2, error) {
	var res searchResults2

	browser, ok := ctx.Value("browser").(extractor.Browser)
	if !ok {
		return searchResults2{}, fmt.Errorf("browser not found in context")
	}

	cfg := duckduckgo.Config{
		SafeSearch: duckduckgo.SafeSearchOff,
		Region:     "us-en",
	}

	page, err := cfg.OpenSearch(ctx, browser, searchTerm)
	if err != nil {
		return searchResults2{}, fmt.Errorf("failed to open search page: %w", err)
	}
	// Register the close only once the page is known to have opened.
	defer deferClose(page)

	var (
		totalNextPage int
		totalRead     int

		// answered and gaveUp are kept separate so that a successful 'answer'
		// call is not reported to the caller as having given up.
		answered bool
		gaveUp   bool

		// oldResults are all the results already shown to the model, so that
		// when more are loaded the ones seen before can be filtered out.
		oldResults []duckduckgo.Result
	)

	// filterResults drops results lacking a title or description and results
	// that are already in oldResults.
	filterResults := func(results []duckduckgo.Result) []duckduckgo.Result {
		var fresh []duckduckgo.Result
		for _, r := range results {
			if r.Title == "" || r.Description == "" {
				continue
			}
			if slices.Contains(oldResults, r) {
				continue
			}
			fresh = append(fresh, r)
		}
		return fresh
	}

	req := gollm.Request{
		Messages: []gollm.Message{
			{
				Role: gollm.RoleSystem,
				Text: `You are trying to answer a question by reading pages from a search engine.
Use 'read' to read a page. You can only read 10 pages total, so try to only pick high quality pages.
Additionally, you can use 'next_page' to load more results. You can only use next_page 3 times total.
You can read multiple pages at once, or read one page and continue to the next page if you need more information.
But if you are confident in your answer, you can use 'answer' to provide the answer.
Or you can use 'give_up' to indicate that you cannot find an answer and give up.`,
			},
			{
				Role: gollm.RoleSystem,
				Text: "The question you are trying to answer is: " + q.Question,
			},
			{
				Role: gollm.RoleSystem,
				Text: "The search terms you used were: " + searchTerm,
			},
			{
				Role: gollm.RoleSystem,
				Text: `The search results will be provided by the user in json format of: {"url": "https://url.here", "title": "Title Of Page", "description": "description here"}`,
			},
		},
	}

	// addMessages appends each result to the request as one JSON user message.
	addMessages := func(results []duckduckgo.Result) {
		type resultMessage struct {
			Url   string `json:"url"`
			Title string `json:"title"`
			Desc  string `json:"description"`
		}

		for _, r := range results {
			// Marshal error ignored: the value is a struct of three strings.
			b, _ := json.Marshal(&resultMessage{Url: r.URL, Title: r.Title, Desc: r.Description})
			req.Messages = append(req.Messages, gollm.Message{
				Role: gollm.RoleUser,
				Text: string(b),
			})
		}
	}

	fnRead := gollm.NewFunction(
		"read",
		`Read a page from the search results. The results will be in the JSON format of: {"url": "https://url.here", "answer": "answer here"}`,
		func(ctx *gollm.Context, args struct {
			URL string `description:"the url to read"`
		}) (string, error) {
			if totalRead >= kMaxReads {
				return "you have read the maximum number of pages", nil
			}
			// The attempt counts against the budget even if it fails below.
			totalRead++

			u, err := url.Parse(args.URL)
			if err != nil {
				return "", fmt.Errorf("failed to parse url: %w", err)
			}

			article, err := extractArticle(ctx, q.Cache, u)
			if err != nil {
				return "", fmt.Errorf("failed to extract article: %w", err)
			}

			if article.Title == "" || article.Body == "" {
				return "couldn't read the page", nil
			}

			answer, err := doesTextAnswerQuestion(ctx, q, article.Body)
			if err != nil {
				return "", fmt.Errorf("failed to check if text answers question: %w", err)
			}

			return searchResults{Url: u.String(), Answer: answer}.String()
		})

	fnNextPage := gollm.NewFunction(
		"next_page",
		"Load more results from the search engine.",
		func(ctx *gollm.Context, args struct {
			Ignored string `description:"ignored, just here to make sure the function is called. Fill with anything."`
		}) (string, error) {
			if totalNextPage >= kMaxLoadMore {
				return "you have loaded the maximum number of pages", nil
			}
			totalNextPage++

			if err := page.LoadMore(); err != nil {
				return "", fmt.Errorf("failed to load more results: %w", err)
			}

			// presumably gives the page time to render the newly loaded
			// results before they are scraped — TODO confirm
			time.Sleep(4 * time.Second)

			// only add the new results here...
			fresh := filterResults(page.GetResults())
			oldResults = append(oldResults, fresh...)
			addMessages(fresh)

			return "ok", nil
		})

	fnAnswer := gollm.NewFunction(
		"answer",
		"Provide the answer to the question.",
		func(ctx *gollm.Context, args struct {
			Answer  string   `description:"the answer to the question"`
			Sources []string `description:"the urls of sources used to find the answer"`
		}) (string, error) {
			res.Answer = args.Answer
			res.Urls = args.Sources
			answered = true
			return "ok", nil
		})

	fnGiveUp := gollm.NewFunction(
		"give_up",
		"Indicate that you cannot find an answer and give up.",
		func(ctx *gollm.Context, args struct {
			Ignored string `description:"ignored, just here to make sure the function is called. Fill with anything."`
		}) (string, error) {
			gaveUp = true
			return "ok", nil
		})

	// do initial load of results
	initial := filterResults(page.GetResults())
	oldResults = append(oldResults, initial...)
	addMessages(initial)

	for i := 0; i < kMaxLoops && !answered && !gaveUp; i++ {
		// figure out the allowed tools, based on the remaining budgets
		tools := []*gollm.Function{
			fnAnswer,
			fnGiveUp,
		}
		if totalRead < kMaxReads {
			tools = append(tools, fnRead)
		}
		if totalNextPage < kMaxLoadMore {
			tools = append(tools, fnNextPage)
		}

		req.Toolbox = gollm.NewToolBox(tools...)

		// Named resp so the accumulated result in res is not shadowed.
		resp, err := q.Model.ChatComplete(ctx, req)
		if err != nil {
			return searchResults2{}, fmt.Errorf("failed to chat complete: %w", err)
		}
		if len(resp.Choices) == 0 || len(resp.Choices[0].Calls) == 0 {
			break
		}

		if _, err = req.Toolbox.Execute(ctx, resp.Choices[0].Calls[0]); err != nil {
			return searchResults2{}, fmt.Errorf("failed to execute: %w", err)
		}
	}

	// A recorded, non-empty answer wins over everything else.
	if answered && res.Answer != "" {
		return res, nil
	}
	if gaveUp {
		return res, fmt.Errorf("gave up: no relevant results found")
	}
	return res, fmt.Errorf("no answer found")
}