From a86b370f6901bb6356e05da840ddac7ea744d9b9 Mon Sep 17 00:00:00 2001 From: partisan Date: Sun, 9 Jun 2024 21:44:49 +0200 Subject: [PATCH] fix for pages --- run.sh | 2 +- text-duckduckgo.go | 12 +++- text-google.go | 62 ++++++---------- text-librex.go | 6 +- text.go | 174 +++++++++++++++++---------------------------- 5 files changed, 99 insertions(+), 157 deletions(-) diff --git a/run.sh b/run.sh index 9b6d4d8..aa4f4f3 100755 --- a/run.sh +++ b/run.sh @@ -1,3 +1,3 @@ #!/bin/bash -go run main.go images.go imageproxy.go video.go map.go text.go text-searchxng.go text-librex.go text-google.go cache.go forums.go files.go files-torrentgalaxy.go files-thepiratebay.go agent.go --debug \ No newline at end of file +go run main.go images.go imageproxy.go video.go map.go text.go text-searchxng.go text-librex.go text-google.go cache.go forums.go files.go files-torrentgalaxy.go files-thepiratebay.go agent.go \ No newline at end of file diff --git a/text-duckduckgo.go b/text-duckduckgo.go index d003895..56d098f 100644 --- a/text-duckduckgo.go +++ b/text-duckduckgo.go @@ -11,9 +11,9 @@ import ( "github.com/PuerkitoBio/goquery" ) -func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, error) { +func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { var results []TextSearchResult - searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query)) + searchURL := buildDuckDuckGoSearchURL(query, page) resp, err := http.Get(searchURL) if err != nil { @@ -56,3 +56,11 @@ func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, return results, nil } + +func buildDuckDuckGoSearchURL(query string, page int) string { + startParam := "" + if page > 1 { + startParam = fmt.Sprintf("&s=%d", (page-1)*10) + } + return fmt.Sprintf("https://duckduckgo.com/html/?q=%s%s", url.QueryEscape(query), startParam) +} \ No newline at end of file diff --git a/text-google.go b/text-google.go index c69c5ba..971c407 100644 --- a/text-google.go +++ b/text-google.go @@ -11,57 +11,37 @@ import ( "github.com/chromedp/chromedp" ) -// type TextSearchResult struct { -// URL string -// Header string -// Description string -// } +func PerformGoogleTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { + opts := append(chromedp.DefaultExecAllocatorOptions[:], + chromedp.DisableGPU, + chromedp.NoDefaultBrowserCheck, + chromedp.NoFirstRun, + chromedp.Flag("disable-javascript", true), + ) + ctx, cancel := chromedp.NewExecAllocator(context.Background(), opts...) + defer cancel() -// func main() { -// // Example usage -// results, err := PerformGoogleTextSearch("golang", "off", "lang_en", 2) -// if err != nil { -// log.Fatalf("Error performing search: %v", err) -// } - -// for _, result := range results { -// fmt.Printf("URL: %s\nHeader: %s\nDescription: %s\n", result.URL, result.Header, result.Description) -// } -// } - -func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSearchResult, error) { - ctx, cancel := chromedp.NewContext(context.Background()) + ctx, cancel = chromedp.NewContext(ctx) defer cancel() var results []TextSearchResult - searchURL := buildSearchURL(query, safe, lang, 1, 10) - + searchURL := buildSearchURL(query, safe, lang, page, 10) + var pageSource string err := chromedp.Run(ctx, chromedp.Navigate(searchURL), + chromedp.Sleep(2*time.Second), + chromedp.OuterHTML("html", &pageSource), ) - if err != nil { - return nil, fmt.Errorf("failed to navigate to search URL: %v", err) + return nil, fmt.Errorf("failed to retrieve page source: %v", err) } - for page := 1; page <= numPages; page++ { - var pageSource string - err := chromedp.Run(ctx, - chromedp.Sleep(2*time.Second), - chromedp.OuterHTML("html", &pageSource), - chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil), - ) - if err != nil { - return nil, fmt.Errorf("failed to retrieve page source: %v", err) - } - - newResults, err := parseResults(pageSource) - if err != nil { - return nil, fmt.Errorf("error parsing results: %v", err) - } - results = append(results, newResults...) + newResults, err := parseResults(pageSource) + if err != nil { + return nil, fmt.Errorf("error parsing results: %v", err) } + results = append(results, newResults...) return results, nil } @@ -77,7 +57,9 @@ func buildSearchURL(query, safe, lang string, page, resultsPerPage int) string { langParam = "&lr=" + lang } - return fmt.Sprintf("https://www.google.com/search?q=%s%s%s", url.QueryEscape(query), safeParam, langParam) + startParam := fmt.Sprintf("&start=%d", (page-1)*resultsPerPage) + + return fmt.Sprintf("https://www.google.com/search?q=%s%s%s%s", url.QueryEscape(query), safeParam, langParam, startParam) } func parseResults(pageSource string) ([]TextSearchResult, error) { diff --git a/text-librex.go b/text-librex.go index 450f20d..526d7e8 100644 --- a/text-librex.go +++ b/text-librex.go @@ -20,7 +20,7 @@ type LibreXResponse []LibreXResult func PerformLibreXTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { // LibreX uses page starting from 0 - searchURL := fmt.Sprintf("https://%s/api.php?q=%s&p=%d&t=0", LIBREX_DOMAIN, url.QueryEscape(query), page-1) + searchURL := fmt.Sprintf("https://%s/api.php?q=%s&p=%d&t=0", LIBREX_DOMAIN, url.QueryEscape(query), page) // User Agent generation userAgent, err := GetUserAgent("librex-text-search") @@ -63,10 +63,6 @@ func PerformLibreXTextSearch(query, safe, lang string, page int) ([]TextSearchRe Source: "LibreX", } - if debugMode { - log.Printf("LibreX result: %+v\n", result) - } - results = append(results, result) } diff --git a/text.go b/text.go index 6d7c727..ecf5d73 100644 --- a/text.go +++ b/text.go @@ -1,44 +1,59 @@ package main import ( - "flag" "fmt" "html/template" "log" + "math/rand" "net/http" - "sort" "sync" "time" ) var ( - debugMode bool + debugMode bool + searchEngines []searchEngine + searchEngineLock sync.Mutex ) +type searchEngine struct { + Name string + Func func(string, string, string, int) ([]TextSearchResult, error) + Weight int +} + func init() { - flag.BoolVar(&debugMode, "debug", false, "enable debug mode") - flag.Parse() + debugMode = false + + searchEngines = []searchEngine{ + {Name: "Google", Func: PerformGoogleTextSearch, Weight: 1}, + {Name: "LibreX", Func: PerformLibreXTextSearch, Weight: 2}, + // {Name: "DuckDuckGo", Func: PerformDuckDuckGoTextSearch, Weight: 3}, // DuckDuckGo timeouts too fast and search results are trash + // {Name: "SearXNG", Func: PerformSearXNGTextSearch, Weight: 2}, // Uncomment when implemented + } + + rand.Seed(time.Now().UnixNano()) } func HandleTextSearch(w http.ResponseWriter, query, safe, lang string, page int) { startTime := time.Now() - const resultsPerPage = 10 cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"} - combinedResults := getTextResultsFromCacheOrFetch(cacheKey, query, safe, lang, page, resultsPerPage) + combinedResults := getTextResultsFromCacheOrFetch(cacheKey, query, safe, lang, page) hasPrevPage := page > 1 - hasNextPage := len(combinedResults) == resultsPerPage + hasNextPage := len(combinedResults) > 0 displayResults(w, combinedResults, query, lang, time.Since(startTime).Seconds(), page, hasPrevPage, hasNextPage) - // Always check and cache the next page if not enough results - if hasNextPage { - go cacheNextPageIfNotCached(query, safe, lang, page+1, resultsPerPage) + // Prefetch next and previous pages + go prefetchPage(query, safe, lang, page+1) + if hasPrevPage { + go prefetchPage(query, safe, lang, page-1) } } -func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, page, resultsPerPage int) []TextSearchResult { +func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, page int) []TextSearchResult { cacheChan := make(chan []SearchResult) var combinedResults []TextSearchResult @@ -56,7 +71,7 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, select { case results := <-cacheChan: if results == nil { - combinedResults = fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) + combinedResults = fetchTextResults(query, safe, lang, page) resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) } else { textResults, _, _ := convertToSpecificResults(results) @@ -64,129 +79,70 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, } case <-time.After(2 * time.Second): log.Println("Cache check timeout") - combinedResults = fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) + combinedResults = fetchTextResults(query, safe, lang, page) resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) } return combinedResults } -func cacheNextPageIfNotCached(query, safe, lang string, page, resultsPerPage int) { +func prefetchPage(query, safe, lang string, page int) { cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"} if _, exists := resultsCache.Get(cacheKey); !exists { - log.Printf("Next page %d not cached, caching now...", page) - nextPageResults := fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) - resultsCache.Set(cacheKey, convertToSearchResults(nextPageResults)) + log.Printf("Page %d not cached, caching now...", page) + pageResults := fetchTextResults(query, safe, lang, page) + resultsCache.Set(cacheKey, convertToSearchResults(pageResults)) } else { - log.Printf("Next page %d already cached", page) + log.Printf("Page %d already cached", page) } } -func fetchTextResultsUntilFull(query, safe, lang string, targetPage, resultsPerPage int) []TextSearchResult { - var combinedResults []TextSearchResult - currentPage := 1 - resultsNeeded := targetPage * resultsPerPage +func fetchTextResults(query, safe, lang string, page int) []TextSearchResult { + engine := selectSearchEngine() + log.Printf("Using search engine: %s", engine.Name) - for len(combinedResults) < resultsNeeded { - cacheKey := CacheKey{Query: query, Page: targetPage, Safe: safe == "true", Lang: lang, Type: "text"} - cachedResults, exists := resultsCache.Get(cacheKey) - if exists { - textResults, _, _ := convertToSpecificResults(cachedResults) - combinedResults = append(combinedResults, textResults...) - } else { - results := fetchAndCacheTextResults(query, safe, lang, currentPage, resultsPerPage) - if len(results) == 0 { - break - } - combinedResults = append(combinedResults, results...) - resultsCache.Set(cacheKey, convertToSearchResults(results)) - } - - currentPage++ - - // Stop fetching if we have enough results for the target page and the next page - if len(combinedResults) >= resultsNeeded+resultsPerPage { - break - } + results, err := engine.Func(query, safe, lang, page) + if err != nil { + log.Printf("Error performing search with %s: %v", engine.Name, err) + return nil } - startIndex := (targetPage - 1) * resultsPerPage - endIndex := startIndex + resultsPerPage - - if startIndex >= len(combinedResults) { - return []TextSearchResult{} - } - if endIndex > len(combinedResults) { - endIndex = len(combinedResults) - } - - return combinedResults[startIndex:endIndex] + return results } -func fetchAndCacheTextResults(query, safe, lang string, page, resultsPerPage int) []TextSearchResult { - var combinedResults []TextSearchResult - var wg sync.WaitGroup - var mu sync.Mutex +func selectSearchEngine() searchEngine { + searchEngineLock.Lock() + defer searchEngineLock.Unlock() - resultsChan := make(chan []TextSearchResult) - - searchFuncs := []struct { - Func func(string, string, string, int) ([]TextSearchResult, error) - Source string - }{ - {PerformGoogleTextSearch, "Google"}, - // {PerformLibreXTextSearch, "LibreX"}, - // {PerformSearXNGTextSearch, "SearXNG"}, + totalWeight := 0 + for _, engine := range searchEngines { + totalWeight += engine.Weight } - wg.Add(len(searchFuncs)) - - for _, searchFunc := range searchFuncs { - go func(searchFunc func(string, string, string, int) ([]TextSearchResult, error), source string) { - defer wg.Done() - results, err := searchFunc(query, safe, lang, page) - if err == nil { - for i := range results { - results[i].Source = source + randValue := rand.Intn(totalWeight) + for _, engine := range searchEngines { + if randValue < engine.Weight { + // Adjust weights for load balancing + for i := range searchEngines { + if searchEngines[i].Name == engine.Name { + searchEngines[i].Weight = max(1, searchEngines[i].Weight-1) + } else { + searchEngines[i].Weight++ } - resultsChan <- results - } else { - log.Printf("Error performing search from %s: %v", source, err) } - }(searchFunc.Func, searchFunc.Source) + return engine + } + randValue -= engine.Weight } - go func() { - wg.Wait() - close(resultsChan) - }() - - for results := range resultsChan { - mu.Lock() - combinedResults = append(combinedResults, results...) - mu.Unlock() - } - - sort.SliceStable(combinedResults, func(i, j int) bool { - return sourceOrder(combinedResults[i].Source) < sourceOrder(combinedResults[j].Source) - }) - - log.Printf("Fetched %d results for page %d", len(combinedResults), page) - - return combinedResults + return searchEngines[0] // fallback to the first engine } -func sourceOrder(source string) int { - switch source { - case "Google": - return 1 - case "LibreX": - return 2 - case "SearchXNG": - return 3 - default: - return 4 +func max(a, b int) int { + if a > b { + return a } + return b } func displayResults(w http.ResponseWriter, results []TextSearchResult, query, lang string, elapsed float64, page int, hasPrevPage, hasNextPage bool) {