Removed redundant fields and callbacks in the search agent while introducing concurrent processing for reading search results. Updated logic to enhance readability and modularity, including capped reads and streamlined interaction with search results. Adjusted dependencies and related usage to align with the refactored design.
259 lines
7.0 KiB
Go
259 lines
7.0 KiB
Go
package searcher
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"gitea.stevedudenhoeffer.com/steve/answer/pkg/agents/reader"
|
|
"gitea.stevedudenhoeffer.com/steve/answer/pkg/agents/shared"
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor"
|
|
"gitea.stevedudenhoeffer.com/steve/go-extractor/sites/duckduckgo"
|
|
gollm "gitea.stevedudenhoeffer.com/steve/go-llm"
|
|
)
|
|
|
|
// Result describes the outcome of trying to answer a question: the answer
// itself, where it came from, and whatever part of the question is still open.
type Result struct {
	// Answer is the answer to the question that was asked.
	Answer string

	// Sources is a list of sources that were used to find the answer.
	Sources []string

	// Remaining is the remaining part(s) of the question that was not answered.
	Remaining string
}
|
|
|
|
type Agent struct {
|
|
// Model is the chat completion model to use
|
|
Model gollm.ChatCompletion
|
|
|
|
OnDone func(ctx context.Context, knowledge shared.Knowledge) error
|
|
|
|
// MaxReads is the maximum number of pages that can be read by the agent. Unlimited if <= 0.
|
|
MaxReads int
|
|
|
|
ContextualInformation []string
|
|
|
|
AllowConcurrent bool
|
|
}
|
|
|
|
// Search will search duckduckgo for the given question, and then read the results to figure out the answer.
|
|
// searchQuery is the query that you want to search for, e.g. "what is the capital of France site:reddit.com"
|
|
// question is the question that you are trying to answer when reading the search results.
|
|
// If the context contains a "browser" key that is an extractor.Browser, it will use that browser to search, otherwise a
|
|
// new one will be created and used for the life of this search and then closed.
|
|
func (a Agent) Search(ctx context.Context, searchQuery string, question string) (shared.Knowledge, error) {
|
|
var knowledge = shared.Knowledge{
|
|
OriginalQuestions: []string{question},
|
|
RemainingQuestions: []string{question},
|
|
}
|
|
|
|
browser, ok := ctx.Value("browser").(extractor.Browser)
|
|
if !ok {
|
|
b, err := extractor.NewPlayWrightBrowser(extractor.PlayWrightBrowserOptions{})
|
|
if err != nil {
|
|
return knowledge, err
|
|
}
|
|
|
|
defer deferClose(browser)
|
|
ctx = context.WithValue(ctx, "browser", b)
|
|
browser = b
|
|
}
|
|
|
|
cfg := duckduckgo.Config{
|
|
SafeSearch: duckduckgo.SafeSearchOff,
|
|
Region: "us-en",
|
|
}
|
|
|
|
page, err := cfg.OpenSearch(ctx, browser, searchQuery)
|
|
defer deferClose(page)
|
|
if err != nil {
|
|
return knowledge, err
|
|
}
|
|
|
|
var searchResults []duckduckgo.Result
|
|
|
|
// filterResults will remove any search results that are in oldSearchResults, or are empty
|
|
filterResults := func(in []duckduckgo.Result) []duckduckgo.Result {
|
|
var res []duckduckgo.Result
|
|
for _, r := range in {
|
|
if r.URL == "" {
|
|
continue
|
|
}
|
|
|
|
res = append(res, r)
|
|
}
|
|
|
|
return res
|
|
}
|
|
|
|
_ = page.LoadMore()
|
|
time.Sleep(2 * time.Second)
|
|
|
|
searchResults = filterResults(page.GetResults())
|
|
|
|
var toRead = make(chan int, a.MaxReads)
|
|
|
|
fnReadSearchResult := gollm.NewFunction("read",
|
|
"read the search result and see if it answers the question",
|
|
func(c *gollm.Context, arg struct {
|
|
Num int `description:"The # of the search result to read."`
|
|
}) (string, error) {
|
|
toRead <- arg.Num - 1
|
|
return "ok", nil
|
|
})
|
|
|
|
readSource := func(ctx context.Context, src duckduckgo.Result) (shared.Knowledge, error) {
|
|
r := reader.Agent{
|
|
Model: a.Model,
|
|
ContextualInformation: a.ContextualInformation,
|
|
}
|
|
|
|
u, err := url.Parse(src.URL)
|
|
if err != nil {
|
|
return shared.Knowledge{}, err
|
|
}
|
|
|
|
slog.Info("reading search result", "url", u)
|
|
response, err := r.Read(ctx, question, u)
|
|
if err != nil {
|
|
return shared.Knowledge{}, err
|
|
}
|
|
|
|
return response, nil
|
|
}
|
|
|
|
tools := gollm.NewToolBox(fnReadSearchResult)
|
|
var req = gollm.Request{
|
|
Toolbox: tools,
|
|
}
|
|
|
|
req.Messages = append(req.Messages, gollm.Message{
|
|
Role: gollm.RoleSystem,
|
|
Text: `You are searching DuckDuckGo for the answer to the question that will be posed by the user. The search results will be provided in system messages in the format of: #. "https://url.here" - "Title of Page" - "Description here". For instance:
|
|
1. "https://example.com" - "Example Title" - "This is an example description."
|
|
2. "https://example2.com" - "Example Title 2" - "This is an example description 2."`,
|
|
})
|
|
|
|
if a.MaxReads == 0 {
|
|
a.MaxReads = 100
|
|
}
|
|
req.Messages = append(req.Messages, gollm.Message{
|
|
Role: gollm.RoleSystem,
|
|
Text: fmt.Sprintf(`You can read a search result by using the function "read_search_result" with the # of the page to read, it will attempt to read the page, and then an LLM will read the page and see if it answers the question.
|
|
can call read_search_result multiple times, up to %d times. All sources you read will be evaulated to see if they answer the question in full or at least in part.`, a.MaxReads),
|
|
})
|
|
|
|
if len(a.ContextualInformation) > 0 {
|
|
req.Messages = append(req.Messages, gollm.Message{
|
|
Role: gollm.RoleSystem,
|
|
Text: "Some contextual information you should be aware of: " + strings.Join(a.ContextualInformation, "\n"),
|
|
})
|
|
}
|
|
|
|
searches := ""
|
|
for i, r := range searchResults {
|
|
if i > 0 {
|
|
searches += "\n"
|
|
}
|
|
|
|
searches += fmt.Sprintf("%d. %q - %q - %q", i+1, r.URL, r.Title, r.Description)
|
|
}
|
|
|
|
req.Messages = append(req.Messages, gollm.Message{
|
|
Role: gollm.RoleSystem,
|
|
Text: "Search results are:\n" + searches,
|
|
})
|
|
|
|
results, err := a.Model.ChatComplete(ctx, req)
|
|
if err != nil {
|
|
return knowledge, err
|
|
}
|
|
|
|
if len(results.Choices) == 0 {
|
|
return knowledge, fmt.Errorf("no choices were returned")
|
|
}
|
|
|
|
choice := results.Choices[0]
|
|
|
|
// enforce the maximum number of reads
|
|
calls := choice.Calls
|
|
if len(calls) > a.MaxReads {
|
|
slog.Warn("too many calls, trimming to max", "len", len(calls), "max", a.MaxReads)
|
|
calls = calls[:a.MaxReads]
|
|
}
|
|
|
|
_, err = tools.ExecuteCallbacks(gollm.NewContext(ctx, req, &choice, nil), choice.Calls, nil, nil)
|
|
if err != nil {
|
|
return knowledge, err
|
|
}
|
|
|
|
close(toRead)
|
|
|
|
// make sure there are no duplicates
|
|
var uniques = map[int]struct{}{}
|
|
|
|
for i := range toRead {
|
|
uniques[i] = struct{}{}
|
|
}
|
|
|
|
var sources []duckduckgo.Result
|
|
|
|
for k := range uniques {
|
|
if k < 0 || k >= len(searchResults) {
|
|
slog.Warn("search result index out of range", "index", k, "len", len(searchResults))
|
|
|
|
continue
|
|
}
|
|
sources = append(sources, searchResults[k])
|
|
}
|
|
|
|
type result struct {
|
|
Knowledge shared.Knowledge
|
|
Err error
|
|
}
|
|
|
|
var gainedKnowledge = make(chan result, len(sources))
|
|
|
|
wg := sync.WaitGroup{}
|
|
for _, v := range sources {
|
|
wg.Add(1)
|
|
go func() {
|
|
res, err := readSource(ctx, v)
|
|
slog.Info("read search result", "url", v.URL, "err", err)
|
|
gainedKnowledge <- result{Knowledge: res, Err: err}
|
|
wg.Done()
|
|
}()
|
|
}
|
|
|
|
slog.Info("reading search results", "len", len(sources))
|
|
wg.Wait()
|
|
|
|
close(gainedKnowledge)
|
|
|
|
slog.Info("done reading search results", "len", len(gainedKnowledge))
|
|
|
|
for r := range gainedKnowledge {
|
|
if r.Err != nil {
|
|
slog.Info("error reading search result", "err", r.Err)
|
|
continue
|
|
}
|
|
|
|
knowledge.Knowledge = append(knowledge.Knowledge, r.Knowledge.Knowledge...)
|
|
knowledge.RemainingQuestions = append(knowledge.RemainingQuestions, r.Knowledge.RemainingQuestions...)
|
|
}
|
|
|
|
if a.OnDone != nil {
|
|
err := a.OnDone(ctx, knowledge)
|
|
if err != nil {
|
|
return knowledge, err
|
|
}
|
|
}
|
|
|
|
return knowledge, nil
|
|
}
|