wip, revert changes
This commit is contained in:
parent
7d1d2cba67
commit
6885983576
7 changed files with 103 additions and 144 deletions
|
@ -128,10 +128,10 @@ func isInstanceValid(instance SearXInstance) bool {
|
|||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
instance, err := getRandomSearXInstance()
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to get a SearX instance: %v", err)
|
||||
}
|
||||
fmt.Printf("Selected SearX instance: %s\n", instance.URL)
|
||||
}
|
||||
// func main() {
|
||||
// instance, err := getRandomSearXInstance()
|
||||
// if err != nil {
|
||||
// log.Fatalf("Failed to get a SearX instance: %v", err)
|
||||
// }
|
||||
// fmt.Printf("Selected SearX instance: %s\n", instance.URL)
|
||||
// }
|
||||
|
|
12
go.mod
12
go.mod
|
@ -2,8 +2,18 @@ module searchengine
|
|||
|
||||
go 1.18
|
||||
|
||||
require github.com/PuerkitoBio/goquery v1.9.1 // direct
|
||||
|
||||
require (
|
||||
github.com/PuerkitoBio/goquery v1.9.1 // direct
|
||||
github.com/andybalholm/cascadia v1.3.2 // indirect
|
||||
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 // indirect
|
||||
github.com/chromedp/chromedp v0.9.5 // indirect
|
||||
github.com/chromedp/sysutil v1.0.0 // indirect
|
||||
github.com/gobwas/httphead v0.1.0 // indirect
|
||||
github.com/gobwas/pool v0.2.1 // indirect
|
||||
github.com/gobwas/ws v1.3.2 // indirect
|
||||
github.com/josharian/intern v1.0.0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
golang.org/x/net v0.21.0 // indirect
|
||||
golang.org/x/sys v0.17.0 // indirect
|
||||
)
|
||||
|
|
22
go.sum
22
go.sum
|
@ -2,6 +2,24 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP
|
|||
github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
|
||||
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
|
||||
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
|
||||
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732 h1:XYUCaZrW8ckGWlCRJKCSoh/iFwlpX316a8yY9IFEzv8=
|
||||
github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
|
||||
github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg=
|
||||
github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y=
|
||||
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
|
||||
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
|
||||
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
|
||||
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
|
||||
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
|
||||
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
|
||||
github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q=
|
||||
github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
|
||||
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
|
||||
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
|
||||
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
|
||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
|
@ -23,7 +41,11 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc
|
|||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
// text-duckduckgo.go
|
||||
package main
|
||||
|
||||
import (
|
||||
|
@ -6,66 +7,15 @@ import (
|
|||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
const (
|
||||
resultsPerPage = 10
|
||||
)
|
||||
|
||||
func getVQD(query string) (string, error) {
|
||||
queryURL := fmt.Sprintf("https://duckduckgo.com/?q=%s", url.QueryEscape(query))
|
||||
resp, err := http.Get(queryURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to fetch vqd: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("loading HTML document: %v", err)
|
||||
}
|
||||
|
||||
var vqd string
|
||||
doc.Find("script").Each(func(i int, s *goquery.Selection) {
|
||||
text := s.Text()
|
||||
if strings.Contains(text, "vqd=\"") {
|
||||
start := strings.Index(text, "vqd=\"") + 5
|
||||
end := strings.Index(text[start:], "\"")
|
||||
vqd = text[start : start+end]
|
||||
}
|
||||
})
|
||||
|
||||
if vqd == "" {
|
||||
return "", fmt.Errorf("vqd not found")
|
||||
}
|
||||
|
||||
return vqd, nil
|
||||
}
|
||||
|
||||
func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
|
||||
func PerformDuckDuckGoTextSearch(query, safe, lang string) ([]TextSearchResult, error) {
|
||||
var results []TextSearchResult
|
||||
searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s", url.QueryEscape(query))
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
|
||||
vqd, err := getVQD(query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get vqd: %v", err)
|
||||
}
|
||||
|
||||
searchURL := fmt.Sprintf("https://duckduckgo.com/html/?q=%s&kl=%s&safe=%s&s=%d&vqd=%s",
|
||||
url.QueryEscape(query), lang, safe, (page-1)*resultsPerPage, vqd)
|
||||
|
||||
req, err := http.NewRequest("GET", searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %v", err)
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
resp, err := http.Get(searchURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("making request: %v", err)
|
||||
}
|
||||
|
@ -94,34 +44,15 @@ func PerformDuckDuckGoTextSearch(query, safe, lang string, page int) ([]TextSear
|
|||
URL: uddg,
|
||||
Header: strings.TrimSpace(header),
|
||||
Description: strings.TrimSpace(description),
|
||||
Source: "DuckDuckGo",
|
||||
}
|
||||
results = append(results, result)
|
||||
if debugMode {
|
||||
log.Printf("Processed DuckDuckGo result: %+v\n", result)
|
||||
}
|
||||
} else {
|
||||
if debugMode {
|
||||
log.Printf("Missing 'uddg' parameter in URL: %s\n", rawURL)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if debugMode {
|
||||
log.Printf("Error parsing URL: %s, error: %v\n", rawURL, err)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if debugMode {
|
||||
log.Printf("Missing 'href' attribute in result anchor tag\n")
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
if len(results) == 0 {
|
||||
if debugMode {
|
||||
log.Println("No results found from DuckDuckGo")
|
||||
}
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
|
105
text-google.go
105
text-google.go
|
@ -1,61 +1,66 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/chromedp/chromedp"
|
||||
)
|
||||
|
||||
func PerformGoogleTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
|
||||
const resultsPerPage = 10
|
||||
// type TextSearchResult struct {
|
||||
// URL string
|
||||
// Header string
|
||||
// Description string
|
||||
// }
|
||||
|
||||
// func main() {
|
||||
// // Example usage
|
||||
// results, err := PerformGoogleTextSearch("golang", "off", "lang_en", 2)
|
||||
// if err != nil {
|
||||
// log.Fatalf("Error performing search: %v", err)
|
||||
// }
|
||||
|
||||
// for _, result := range results {
|
||||
// fmt.Printf("URL: %s\nHeader: %s\nDescription: %s\n", result.URL, result.Header, result.Description)
|
||||
// }
|
||||
// }
|
||||
|
||||
func PerformGoogleTextSearch(query, safe, lang string, numPages int) ([]TextSearchResult, error) {
|
||||
ctx, cancel := chromedp.NewContext(context.Background())
|
||||
defer cancel()
|
||||
|
||||
var results []TextSearchResult
|
||||
|
||||
client := &http.Client{}
|
||||
searchURL := buildSearchURL(query, safe, lang, page, resultsPerPage)
|
||||
searchURL := buildSearchURL(query, safe, lang, 1, 10)
|
||||
|
||||
err := chromedp.Run(ctx,
|
||||
chromedp.Navigate(searchURL),
|
||||
)
|
||||
|
||||
req, err := http.NewRequest("GET", searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %v", err)
|
||||
return nil, fmt.Errorf("failed to navigate to search URL: %v", err)
|
||||
}
|
||||
|
||||
// User Agent generation
|
||||
TextUserAgent, err := GetUserAgent("Text-Search")
|
||||
if err != nil {
|
||||
fmt.Println("Error:", err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if debugMode {
|
||||
fmt.Println("Generated User Agent (text):", TextUserAgent)
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", TextUserAgent)
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("making request: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("loading HTML document: %v", err)
|
||||
}
|
||||
|
||||
results = parseResults(doc)
|
||||
|
||||
if len(results) == 0 {
|
||||
if debugMode {
|
||||
log.Println("No results found from Google")
|
||||
for page := 1; page <= numPages; page++ {
|
||||
var pageSource string
|
||||
err := chromedp.Run(ctx,
|
||||
chromedp.Sleep(2*time.Second),
|
||||
chromedp.OuterHTML("html", &pageSource),
|
||||
chromedp.Evaluate(`window.scrollTo(0, document.body.scrollHeight);`, nil),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to retrieve page source: %v", err)
|
||||
}
|
||||
|
||||
newResults, err := parseResults(pageSource)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing results: %v", err)
|
||||
}
|
||||
results = append(results, newResults...)
|
||||
}
|
||||
|
||||
return results, nil
|
||||
|
@ -72,20 +77,21 @@ func buildSearchURL(query, safe, lang string, page, resultsPerPage int) string {
|
|||
langParam = "&lr=" + lang
|
||||
}
|
||||
|
||||
startIndex := (page - 1) * resultsPerPage
|
||||
return fmt.Sprintf("https://www.google.com/search?q=%s%s%s&udm=14&start=%d", url.QueryEscape(query), safeParam, langParam, startIndex)
|
||||
return fmt.Sprintf("https://www.google.com/search?q=%s%s%s", url.QueryEscape(query), safeParam, langParam)
|
||||
}
|
||||
|
||||
func parseResults(doc *goquery.Document) []TextSearchResult {
|
||||
func parseResults(pageSource string) ([]TextSearchResult, error) {
|
||||
var results []TextSearchResult
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(pageSource))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("loading HTML document: %v", err)
|
||||
}
|
||||
|
||||
doc.Find(".yuRUbf").Each(func(i int, s *goquery.Selection) {
|
||||
link := s.Find("a")
|
||||
href, exists := link.Attr("href")
|
||||
if !exists {
|
||||
if debugMode {
|
||||
log.Printf("No href attribute found for result %d\n", i)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -104,10 +110,7 @@ func parseResults(doc *goquery.Document) []TextSearchResult {
|
|||
Description: description,
|
||||
}
|
||||
results = append(results, result)
|
||||
if debugMode {
|
||||
log.Printf("Google result: %+v\n", result)
|
||||
}
|
||||
})
|
||||
|
||||
return results
|
||||
return results, nil
|
||||
}
|
||||
|
|
|
@ -3,7 +3,6 @@ package main
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"time"
|
||||
|
@ -27,11 +26,9 @@ type QwantTextAPIResponse struct {
|
|||
}
|
||||
|
||||
// PerformQwantTextSearch contacts the Qwant API and returns a slice of TextSearchResult
|
||||
func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchResult, error) {
|
||||
func PerformQwantTextSearch(query, safe, lang string) ([]TextSearchResult, error) {
|
||||
const resultsPerPage = 10
|
||||
|
||||
// Calculate the offset based on the page number
|
||||
offset := (page - 1) * resultsPerPage
|
||||
const offset = 0
|
||||
|
||||
// Ensure safe search is disabled by default if not specified
|
||||
if safe == "" {
|
||||
|
@ -43,12 +40,11 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes
|
|||
lang = "en_CA"
|
||||
}
|
||||
|
||||
apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop&safesearch=%s",
|
||||
apiURL := fmt.Sprintf("https://api.qwant.com/v3/search/web?q=%s&count=%d&locale=%s&offset=%d&device=desktop",
|
||||
url.QueryEscape(query),
|
||||
resultsPerPage,
|
||||
lang,
|
||||
offset,
|
||||
safe)
|
||||
offset)
|
||||
|
||||
client := &http.Client{Timeout: 10 * time.Second}
|
||||
|
||||
|
@ -97,9 +93,6 @@ func PerformQwantTextSearch(query, safe, lang string, page int) ([]TextSearchRes
|
|||
func cleanQwantURL(rawURL string) string {
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
if debugMode {
|
||||
log.Printf("Error parsing URL: %v", err)
|
||||
}
|
||||
return rawURL
|
||||
}
|
||||
return u.Scheme + "://" + u.Host + u.Path
|
||||
|
|
2
text.go
2
text.go
|
@ -135,7 +135,7 @@ func fetchAndCacheTextResults(query, safe, lang string, page, resultsPerPage int
|
|||
Source string
|
||||
}{
|
||||
{PerformGoogleTextSearch, "Google"},
|
||||
{PerformLibreXTextSearch, "LibreX"},
|
||||
// {PerformLibreXTextSearch, "LibreX"},
|
||||
// {PerformSearXNGTextSearch, "SearXNG"},
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue