Fix pagination for text search results

This commit is contained in:
partisan 2024-06-09 21:44:49 +02:00
parent 6885983576
commit a86b370f69
5 changed files with 99 additions and 157 deletions

2
run.sh
View file

@ -1,3 +1,3 @@
#!/bin/bash #!/bin/bash
go run main.go images.go imageproxy.go video.go map.go text.go text-searchxng.go text-librex.go text-google.go cache.go forums.go files.go files-torrentgalaxy.go files-thepiratebay.go agent.go --debug go run main.go images.go imageproxy.go video.go map.go text.go text-searchxng.go text-librex.go text-google.go cache.go forums.go files.go files-torrentgalaxy.go files-thepiratebay.go agent.go

View file

@ -11,9 +11,9 @@ import (
"github.com/PuerkitoBio/goquery" "github.com/PuerkitoBio/goquery"
) )
func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, error) { func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
var results []TextSearchResult var results []TextSearchResult
searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query)) searchURL := buildDuckDuckGoSearchURL(query, page)
resp, err := http.Get(searchURL) resp, err := http.Get(searchURL)
if err != nil { if err != nil {
@ -56,3 +56,11 @@ func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult,
return results, nil return results, nil
} }
// buildDuckDuckGoSearchURL returns the DuckDuckGo HTML-endpoint URL for query.
//
// The query is URL-escaped. For pages after the first, an "s" parameter of
// (page-1)*10 is appended; for page 1 or lower the URL carries only the query.
func buildDuckDuckGoSearchURL(query string, page int) string {
	searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query))
	if page <= 1 {
		// First page (or an out-of-range page number): no "s" parameter.
		return searchURL
	}
	return fmt.Sprintf("%s&s=%d", searchURL, (page-1)*10)
}

View file

@ -11,46 +11,27 @@ import (
"github.com/chromedp/chromedp" "github.com/chromedp/chromedp"
) )
// type TextSearchResult struct { func PerformGoogleTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
// URL string opts := append(chromedp.DefaultExecAllocatorOptions[:],
// Header string chromedp.DisableGPU,
// Description string chromedp.NoDefaultBrowserCheck,
// } chromedp.NoFirstRun,
chromedp.Flag("disable-javascript", true),
)
ctx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
defer cancel()
// func main() { ctx, cancel = chromedp.NewContext(ctx)
// // Example usage
// results, err := PerformGoogleTextSearch("golang", "off", "lang_en", 2)
// if err != nil {
// log.Fatalf("Error performing search: %v", err)
// }
// for _, result := range results {
// fmt.Printf("URL: %s\nHeader: %s\nDescription: %s\n", result.URL, result.Header, result.Description)
// }
// }
func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSearchResult, error) {
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel() defer cancel()
var results []TextSearchResult var results []TextSearchResult
searchURL := buildSearchURL(query, safe, lang, 1, 10) searchURL := buildSearchURL(query, safe, lang, page, 10)
err := chromedp.Run(ctx,
chromedp.Navigate(searchURL),
)
if err != nil {
return nil, fmt.Errorf("failed to navigate to search URL: %v", err)
}
for page := 1; page <= numPages; page++ {
var pageSource string var pageSource string
err := chromedp.Run(ctx, err := chromedp.Run(ctx,
chromedp.Navigate(searchURL),
chromedp.Sleep(2*time.Second), chromedp.Sleep(2*time.Second),
chromedp.OuterHTML("html", &pageSource), chromedp.OuterHTML("html", &pageSource),
chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
) )
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to retrieve page source: %v", err) return nil, fmt.Errorf("failed to retrieve page source: %v", err)
@ -61,7 +42,6 @@ func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSear
return nil, fmt.Errorf("error parsing results: %v", err) return nil, fmt.Errorf("error parsing results: %v", err)
} }
results = append(results, newResults...) results = append(results, newResults...)
}
return results, nil return results, nil
} }
@ -77,7 +57,9 @@ func buildSearchURL(query, safe, lang string, page, resultsPerPage int) string {
langParam = "&lr=" + lang langParam = "&lr=" + lang
} }
return fmt.Sprintf("https://www.google.com/search?q=%s%s%s", url.QueryEscape(query), safeParam, langParam) startParam := fmt.Sprintf("&start=%d", (page-1)*resultsPerPage)
return fmt.Sprintf("https://www.google.com/search?q=%s%s%s%s", url.QueryEscape(query), safeParam, langParam, startParam)
} }
func parseResults(pageSource string) ([]TextSearchResult, error) { func parseResults(pageSource string) ([]TextSearchResult, error) {

View file

@ -20,7 +20,7 @@ type LibreXResponse []LibreXResult
func PerformLibreXTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) { func PerformLibreXTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
// LibreX uses page starting from 0 // LibreX uses page starting from 0
searchURL := fmt.Sprintf("https://%s/api.php?q=%s&p=%d&t=0", LIBREX_DOMAIN, url.QueryEscape(query), page-1) searchURL := fmt.Sprintf("https://%s/api.php?q=%s&p=%d&t=0", LIBREX_DOMAIN, url.QueryEscape(query), page)
// User Agent generation // User Agent generation
userAgent, err := GetUserAgent("librex-text-search") userAgent, err := GetUserAgent("librex-text-search")
@ -63,10 +63,6 @@ func PerformLibreXTextSearch(query, safe, lang string, page int) ([]TextSearchRe
Source: "LibreX", Source: "LibreX",
} }
if debugMode {
log.Printf("LibreX result: %+v\n", result)
}
results = append(results, result) results = append(results, result)
} }

182
text.go
View file

@ -1,44 +1,59 @@
package main package main
import ( import (
"flag"
"fmt" "fmt"
"html/template" "html/template"
"log" "log"
"math/rand"
"net/http" "net/http"
"sort"
"sync" "sync"
"time" "time"
) )
var ( var (
debugMode bool debugMode bool
searchEngines []searchEngine
searchEngineLock sync.Mutex
) )
// searchEngine describes one text-search backend that selectSearchEngine can
// pick from the package-level searchEngines list.
type searchEngine struct {
// Name identifies the engine; it is written to the log when the engine is
// used and is compared against when weights are adjusted.
Name string
// Func performs the search. The engines registered in init are declared
// with the parameters (query, safe, lang string, page int).
Func func(string, string, string, int) ([]TextSearchResult, error)
// Weight is this engine's share of the total weight in the random pick
// made by selectSearchEngine, which also rewrites it after each selection.
Weight int
}
func init() { func init() {
flag.BoolVar(&debugMode, "debug", false, "enable debug mode") debugMode = false
flag.Parse()
searchEngines = []searchEngine{
{Name: "Google", Func: PerformGoogleTextSearch, Weight: 1},
{Name: "LibreX", Func: PerformLibreXTextSearch, Weight: 2},
// {Name: "DuckDuckGo", Func: PerformDuckDuckGoTextSearch, Weight: 3}, // DuckDuckGo timeouts too fast and search results are trash
// {Name: "SearXNG", Func: PerformSearXNGTextSearch, Weight: 2}, // Uncomment when implemented
}
rand.Seed(time.Now().UnixNano())
} }
func HandleTextSearch(w http.ResponseWriter, query, safe, lang string, page int) { func HandleTextSearch(w http.ResponseWriter, query, safe, lang string, page int) {
startTime := time.Now() startTime := time.Now()
const resultsPerPage = 10
cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"} cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"}
combinedResults := getTextResultsFromCacheOrFetch(cacheKey, query, safe, lang, page, resultsPerPage) combinedResults := getTextResultsFromCacheOrFetch(cacheKey, query, safe, lang, page)
hasPrevPage := page > 1 hasPrevPage := page > 1
hasNextPage := len(combinedResults) == resultsPerPage hasNextPage := len(combinedResults) > 0
displayResults(w, combinedResults, query, lang, time.Since(startTime).Seconds(), page, hasPrevPage, hasNextPage) displayResults(w, combinedResults, query, lang, time.Since(startTime).Seconds(), page, hasPrevPage, hasNextPage)
// Always check and cache the next page if not enough results // Prefetch next and previous pages
if hasNextPage { go prefetchPage(query, safe, lang, page+1)
go cacheNextPageIfNotCached(query, safe, lang, page+1, resultsPerPage) if hasPrevPage {
go prefetchPage(query, safe, lang, page-1)
} }
} }
func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, page, resultsPerPage int) []TextSearchResult { func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, page int) []TextSearchResult {
cacheChan := make(chan []SearchResult) cacheChan := make(chan []SearchResult)
var combinedResults []TextSearchResult var combinedResults []TextSearchResult
@ -56,7 +71,7 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string,
select { select {
case results := <-cacheChan: case results := <-cacheChan:
if results == nil { if results == nil {
combinedResults = fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) combinedResults = fetchTextResults(query, safe, lang, page)
resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) resultsCache.Set(cacheKey, convertToSearchResults(combinedResults))
} else { } else {
textResults, _, _ := convertToSpecificResults(results) textResults, _, _ := convertToSpecificResults(results)
@ -64,129 +79,70 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string,
} }
case <-time.After(2 * time.Second): case <-time.After(2 * time.Second):
log.Println("Cache check timeout") log.Println("Cache check timeout")
combinedResults = fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) combinedResults = fetchTextResults(query, safe, lang, page)
resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) resultsCache.Set(cacheKey, convertToSearchResults(combinedResults))
} }
return combinedResults return combinedResults
} }
func cacheNextPageIfNotCached(query, safe, lang string, page, resultsPerPage int) { func prefetchPage(query, safe, lang string, page int) {
cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"} cacheKey := CacheKey{Query: query, Page: page, Safe: safe == "true", Lang: lang, Type: "text"}
if _, exists := resultsCache.Get(cacheKey); !exists { if _, exists := resultsCache.Get(cacheKey); !exists {
log.Printf("Next page %d not cached, caching now...", page) log.Printf("Page %d not cached, caching now...", page)
nextPageResults := fetchTextResultsUntilFull(query, safe, lang, page, resultsPerPage) pageResults := fetchTextResults(query, safe, lang, page)
resultsCache.Set(cacheKey, convertToSearchResults(nextPageResults)) resultsCache.Set(cacheKey, convertToSearchResults(pageResults))
} else { } else {
log.Printf("Next page %d already cached", page) log.Printf("Page %d already cached", page)
} }
} }
func fetchTextResultsUntilFull(query, safe, lang string, targetPage, resultsPerPage int) []TextSearchResult { func fetchTextResults(query, safe, lang string, page int) []TextSearchResult {
var combinedResults []TextSearchResult engine := selectSearchEngine()
currentPage := 1 log.Printf("Using search engine: %s", engine.Name)
resultsNeeded := targetPage * resultsPerPage
for len(combinedResults) < resultsNeeded { results, err := engine.Func(query, safe, lang, page)
cacheKey := CacheKey{Query: query, Page: targetPage, Safe: safe == "true", Lang: lang, Type: "text"} if err != nil {
cachedResults, exists := resultsCache.Get(cacheKey) log.Printf("Error performing search with %s: %v", engine.Name, err)
if exists { return nil
textResults, _, _ := convertToSpecificResults(cachedResults) }
combinedResults = append(combinedResults, textResults...)
return results
}
func selectSearchEngine() searchEngine {
searchEngineLock.Lock()
defer searchEngineLock.Unlock()
totalWeight := 0
for _, engine := range searchEngines {
totalWeight += engine.Weight
}
randValue := rand.Intn(totalWeight)
for _, engine := range searchEngines {
if randValue < engine.Weight {
// Adjust weights for load balancing
for i := range searchEngines {
if searchEngines[i].Name == engine.Name {
searchEngines[i].Weight = max(1, searchEngines[i].Weight-1)
} else { } else {
results := fetchAndCacheTextResults(query, safe, lang, currentPage, resultsPerPage) searchEngines[i].Weight++
if len(results) == 0 {
break
}
combinedResults = append(combinedResults, results...)
resultsCache.Set(cacheKey, convertToSearchResults(results))
}
currentPage++
// Stop fetching if we have enough results for the target page and the next page
if len(combinedResults) >= resultsNeeded+resultsPerPage {
break
} }
} }
return engine
startIndex := (targetPage - 1) * resultsPerPage
endIndex := startIndex + resultsPerPage
if startIndex >= len(combinedResults) {
return []TextSearchResult{}
} }
if endIndex > len(combinedResults) { randValue -= engine.Weight
endIndex = len(combinedResults)
} }
return combinedResults[startIndex:endIndex] return searchEngines[0] // fallback to the first engine
} }
func fetchAndCacheTextResults(query, safe, lang string, page, resultsPerPage int) []TextSearchResult { func max(a, b int) int {
var combinedResults []TextSearchResult if a > b {
var wg sync.WaitGroup return a
var mu sync.Mutex
resultsChan := make(chan []TextSearchResult)
searchFuncs := []struct {
Func func(string, string, string, int) ([]TextSearchResult, error)
Source string
}{
{PerformGoogleTextSearch, "Google"},
// {PerformLibreXTextSearch, "LibreX"},
// {PerformSearXNGTextSearch, "SearXNG"},
}
wg.Add(len(searchFuncs))
for _, searchFunc := range searchFuncs {
go func(searchFunc func(string, string, string, int) ([]TextSearchResult, error), source string) {
defer wg.Done()
results, err := searchFunc(query, safe, lang, page)
if err == nil {
for i := range results {
results[i].Source = source
}
resultsChan <- results
} else {
log.Printf("Error performing search from %s: %v", source, err)
}
}(searchFunc.Func, searchFunc.Source)
}
go func() {
wg.Wait()
close(resultsChan)
}()
for results := range resultsChan {
mu.Lock()
combinedResults = append(combinedResults, results...)
mu.Unlock()
}
sort.SliceStable(combinedResults, func(i, j int) bool {
return sourceOrder(combinedResults[i].Source) < sourceOrder(combinedResults[j].Source)
})
log.Printf("Fetched %d results for page %d", len(combinedResults), page)
return combinedResults
}
func sourceOrder(source string) int {
switch source {
case "Google":
return 1
case "LibreX":
return 2
case "SearchXNG":
return 3
default:
return 4
} }
return b
} }
func displayResults(w http.ResponseWriter, results []TextSearchResult, query, lang string, elapsed float64, page int, hasPrevPage, hasNextPage bool) { func displayResults(w http.ResponseWriter, results []TextSearchResult, query, lang string, elapsed float64, page int, hasPrevPage, hasNextPage bool) {