From a9a6948a44254008b03c7b1fd869c370e7541f36 Mon Sep 17 00:00:00 2001 From: partisan Date: Tue, 31 Dec 2024 02:44:14 +0100 Subject: [PATCH] updated indexing & user agent generator --- agent.go | 43 ++++++++++++++++++++++++++------------ indexer.go | 60 ++++++++++++++++++++++++++++++++++-------------------- init.go | 5 +++++ 3 files changed, 73 insertions(+), 35 deletions(-) diff --git a/agent.go b/agent.go index 296b4e4..6333102 100755 --- a/agent.go +++ b/agent.go @@ -3,7 +3,7 @@ package main import ( "encoding/json" "fmt" - "io/ioutil" + "io" "math/rand" "net/http" "sort" @@ -40,13 +40,33 @@ var ( func fetchLatestBrowserVersions() (BrowserData, error) { url := "https://raw.githubusercontent.com/Fyrd/caniuse/master/fulldata-json/data-2.0.json" - resp, err := http.Get(url) + // // Optional: skip TLS verification to avoid certificate errors + // transport := &http.Transport{ + // TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + // } + + // Increase the HTTP client timeout + client := &http.Client{ + Timeout: 30 * time.Second, + // Transport: transport, + } + + // Build the request manually to set headers + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return BrowserData{}, err + } + // Custom user agent and English language preference + req.Header.Set("User-Agent", "MyCustomAgent/1.0 (compatible; +https://example.com)") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + + resp, err := client.Do(req) if err != nil { return BrowserData{}, err } defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return BrowserData{}, err } @@ -109,7 +129,7 @@ func randomUserAgent() (string, error) { return "", err } - rand.Seed(time.Now().UnixNano()) + rand := rand.New(rand.NewSource(time.Now().UnixNano())) // Simulated browser usage statistics (in percentages) usageStats := map[string]float64{ @@ -161,6 +181,7 @@ func randomUserAgent() (string, error) { } } + // Fallback to the last version if none matched if version == "" { version = versions[len(versions)-1].Version } @@ -240,11 +261,11 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string { browserType = "Firefox" } - // Get the latest version for the browser type + // Get the latest version for that browser var latestVersion string - if browserType == "Firefox" { + if browserType == "Firefox" && len(newVersions.Firefox) > 0 { latestVersion = newVersions.Firefox[0].Version - } else if browserType == "Chromium" { + } else if browserType == "Chromium" && len(newVersions.Chromium) > 0 { latestVersion = newVersions.Chromium[0].Version } @@ -252,7 +273,7 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string { return generateUserAgent(browserType, latestVersion) } -func periodicUpdate() { +func periodicAgentUpdate() { for { // Sleep for a random interval between 1 and 2 days time.Sleep(time.Duration(24+rand.Intn(24)) * time.Hour) @@ -309,12 +330,8 @@ func GetNewUserAgent(cacheKey string) (string, error) { return userAgent, nil } -func init() { - go periodicUpdate() -} - // func main() { -// go periodicUpdate() // not needed here +// go periodicAgentUpdate() // not needed here // cacheKey := "image-search" // userAgent, err := GetUserAgent(cacheKey) diff --git a/indexer.go b/indexer.go index 7963fc1..306c28d 100644 --- a/indexer.go +++ b/indexer.go @@ -3,6 +3,7 @@ package main import ( "bufio" "fmt" + "net/url" "os" "path/filepath" "strconv" @@ -10,10 +11,10 @@ import ( "time" "github.com/blevesearch/bleve/v2" + "golang.org/x/net/publicsuffix" ) // Document represents a single document to be indexed. -// You can add more fields if needed. type Document struct { ID string `json:"id"` Link string `json:"link"` @@ -48,16 +49,20 @@ func InitIndex() error { // Index doesn't exist, create a new one mapping := bleve.NewIndexMapping() - // Custom mapping for the document docMapping := bleve.NewDocumentMapping() - // Text fields with custom analyzers for better tokenization - textFieldMapping := bleve.NewTextFieldMapping() - textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches + // Text fields + titleFieldMapping := bleve.NewTextFieldMapping() + titleFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("title", titleFieldMapping) - docMapping.AddFieldMappingsAt("title", textFieldMapping) - docMapping.AddFieldMappingsAt("description", textFieldMapping) - docMapping.AddFieldMappingsAt("tags", textFieldMapping) + descFieldMapping := bleve.NewTextFieldMapping() + descFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("description", descFieldMapping) + + tagFieldMapping := bleve.NewTextFieldMapping() + tagFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("tags", tagFieldMapping) // Numeric field for popularity popularityMapping := bleve.NewNumericFieldMapping() @@ -77,8 +82,19 @@ func InitIndex() error { return nil } +func normalizeDomain(rawURL string) string { + parsed, err := url.Parse(rawURL) + if err != nil { + return rawURL + } + domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname()) + if err != nil { + return parsed.Hostname() // fallback + } + return domain +} + // IndexFile reads a file line-by-line and indexes each line as a document. -// Each line represents a simple document. Adjust parsing as needed. func IndexFile(filePath string) error { file, err := os.Open(filePath) if err != nil { @@ -88,27 +104,29 @@ func IndexFile(filePath string) error { scanner := bufio.NewScanner(file) batch := bleveIndex.NewBatch() - indexedDomains := make(map[string]bool) // Track indexed domains + + // Map to track normalized domains we’ve already indexed + indexedDomains := make(map[string]bool) for scanner.Scan() { line := scanner.Text() - // Split the line into 5 fields: link|title|tags|description|popularity + // link|title|tags|description|popularity parts := strings.SplitN(line, "|", 5) if len(parts) < 5 { - continue // Skip malformed lines + continue } - domain := parts[0] + // Normalize domain part so duplicates share the same “key” + normalized := normalizeDomain(parts[0]) popularity, _ := strconv.ParseInt(parts[4], 10, 64) - // Skip if the domain is already indexed - if indexedDomains[domain] { + if indexedDomains[normalized] { continue } doc := Document{ - ID: domain, // Use the domain as the unique ID + ID: normalized, Link: parts[0], Title: parts[1], Tags: parts[2], @@ -127,10 +145,9 @@ func IndexFile(filePath string) error { return fmt.Errorf("failed to index document: %v", err) } - indexedDomains[domain] = true // Mark the domain as indexed + indexedDomains[normalized] = true } - // Commit the batch if err := bleveIndex.Batch(batch); err != nil { return fmt.Errorf("error committing batch: %v", err) } @@ -139,13 +156,12 @@ func IndexFile(filePath string) error { return fmt.Errorf("error reading file: %v", err) } - printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath) + printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath) return nil } // SearchIndex performs a full-text search on the indexed data. func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { - // Create compound query exactMatch := bleve.NewMatchQuery(queryStr) // Exact match fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match fuzzyMatch.Fuzziness = 2 @@ -160,8 +176,8 @@ func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { req.Size = pageSize req.From = (page - 1) * pageSize - // Sort by popularity - req.SortBy([]string{"popularity"}) + // Sort primarily by relevance (score), then by popularity descending + req.SortBy([]string{"-_score", "-popularity"}) res, err := bleveIndex.Search(req) if err != nil { diff --git a/init.go b/init.go index c92e656..7a6dba2 100644 --- a/init.go +++ b/init.go @@ -61,6 +61,11 @@ func main() { } config.PeerID = hostID + // Initiate Browser Agent updater + if config.CrawlerEnabled || config.IndexerEnabled { + go periodicAgentUpdate() + } + InitializeLanguage("en") // Initialize language before generating OpenSearch generateOpenSearchXML(config)