updated indexing & user agent generator
This commit is contained in:
parent 13e1d6119b
commit a9a6948a44

3 changed files with 73 additions and 35 deletions

indexer.go (60 lines changed)
@@ -3,6 +3,7 @@ package main
 import (
 	"bufio"
 	"fmt"
+	"net/url"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -10,10 +11,10 @@ import (
 	"time"
 
 	"github.com/blevesearch/bleve/v2"
+	"golang.org/x/net/publicsuffix"
 )
 
 // Document represents a single document to be indexed.
 // You can add more fields if needed.
 type Document struct {
 	ID   string `json:"id"`
 	Link string `json:"link"`
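Note: the hunk cuts off after Link. Judging from the fields IndexFile populates and the mappings below, the full struct plausibly looks like the sketch here; the Description and Popularity declarations are inferred, not shown in the diff.

// Sketch of the full struct, inferred from IndexFile and the field mappings;
// Description and Popularity are assumptions.
type Document struct {
	ID          string `json:"id"`
	Link        string `json:"link"`
	Title       string `json:"title"`
	Tags        string `json:"tags"`
	Description string `json:"description"`
	Popularity  int64  `json:"popularity"`
}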
@@ -48,16 +49,20 @@ func InitIndex() error {
 		// Index doesn't exist, create a new one
 		mapping := bleve.NewIndexMapping()
 
 		// Custom mapping for the document
 		docMapping := bleve.NewDocumentMapping()
 
-		// Text fields with custom analyzers for better tokenization
-		textFieldMapping := bleve.NewTextFieldMapping()
-		textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches
+		// Text fields
+		titleFieldMapping := bleve.NewTextFieldMapping()
+		titleFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("title", titleFieldMapping)
 
-		docMapping.AddFieldMappingsAt("title", textFieldMapping)
-		docMapping.AddFieldMappingsAt("description", textFieldMapping)
-		docMapping.AddFieldMappingsAt("tags", textFieldMapping)
+		descFieldMapping := bleve.NewTextFieldMapping()
+		descFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("description", descFieldMapping)
+
+		tagFieldMapping := bleve.NewTextFieldMapping()
+		tagFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("tags", tagFieldMapping)
 
 		// Numeric field for popularity
 		popularityMapping := bleve.NewNumericFieldMapping()
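The hunk ends before docMapping is attached to the index mapping. A minimal sketch of the likely wiring, assuming the document mapping is installed as the default; buildIndexMapping is a hypothetical helper, not a function in this repo.

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
)

// Hypothetical helper: the same per-field setup as the hunk, plus the
// attachment step the diff truncates. The DefaultMapping assignment is
// an assumption about how InitIndex finishes building the mapping.
func buildIndexMapping() *mapping.IndexMappingImpl {
	m := bleve.NewIndexMapping()
	doc := bleve.NewDocumentMapping()
	for _, field := range []string{"title", "description", "tags"} {
		fm := bleve.NewTextFieldMapping()
		fm.Analyzer = "standard"
		doc.AddFieldMappingsAt(field, fm)
	}
	doc.AddFieldMappingsAt("popularity", bleve.NewNumericFieldMapping())
	m.DefaultMapping = doc
	return m
}

Functionally, separate FieldMapping values per field (as the commit introduces) and the loop above are equivalent; the point of splitting them is that individual fields can now diverge (say, a different analyzer for tags) without affecting the others.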
@@ -77,8 +82,19 @@ func InitIndex() error {
 	return nil
 }
 
+func normalizeDomain(rawURL string) string {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return rawURL
+	}
+	domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname())
+	if err != nil {
+		return parsed.Hostname() // fallback
+	}
+	return domain
+}
+
 // IndexFile reads a file line-by-line and indexes each line as a document.
 // Each line represents a simple document. Adjust parsing as needed.
 func IndexFile(filePath string) error {
 	file, err := os.Open(filePath)
 	if err != nil {
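For a sense of what EffectiveTLDPlusOne buys over a plain Hostname(): subdomains and multi-label public suffixes collapse to one registrable domain. A quick table-driven check with illustrative inputs (not from the repo):

import "testing"

// Illustrative cases: www/blog subdomains collapse to the registrable
// domain, including multi-label suffixes like co.uk.
func TestNormalizeDomain(t *testing.T) {
	cases := map[string]string{
		"https://www.example.com/page":      "example.com",
		"https://blog.example.co.uk/post/1": "example.co.uk",
	}
	for in, want := range cases {
		if got := normalizeDomain(in); got != want {
			t.Errorf("normalizeDomain(%q) = %q, want %q", in, got, want)
		}
	}
}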
@@ -88,27 +104,29 @@ func IndexFile(filePath string) error {
 
 	scanner := bufio.NewScanner(file)
 	batch := bleveIndex.NewBatch()
-	indexedDomains := make(map[string]bool) // Track indexed domains
+
+	// Map to track normalized domains we’ve already indexed
+	indexedDomains := make(map[string]bool)
 
 	for scanner.Scan() {
 		line := scanner.Text()
 
-		// Split the line into 5 fields: link|title|tags|description|popularity
+		// link|title|tags|description|popularity
 		parts := strings.SplitN(line, "|", 5)
 		if len(parts) < 5 {
-			continue // Skip malformed lines
+			continue
 		}
 
-		domain := parts[0]
+		// Normalize domain part so duplicates share the same “key”
+		normalized := normalizeDomain(parts[0])
 		popularity, _ := strconv.ParseInt(parts[4], 10, 64)
 
-		// Skip if the domain is already indexed
-		if indexedDomains[domain] {
+		if indexedDomains[normalized] {
 			continue
 		}
 
 		doc := Document{
-			ID:    domain, // Use the domain as the unique ID
+			ID:    normalized,
 			Link:  parts[0],
 			Title: parts[1],
 			Tags:  parts[2],
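For reference, a made-up row in the expected link|title|tags|description|popularity format and how SplitN carves it:

import (
	"fmt"
	"strings"
)

// Hypothetical sample row (not from the repo's data files).
func ExampleRowFormat() {
	line := "https://example.com|Example Site|demo,test|A sample description|42"
	parts := strings.SplitN(line, "|", 5)
	fmt.Println(parts[0]) // the link; normalizeDomain(parts[0]) becomes the ID
	fmt.Println(parts[4]) // popularity, parsed via strconv.ParseInt
	// Output:
	// https://example.com
	// 42
}

One caveat of the cap of 5: any extra "|" in a description lands in parts[4], where the discarded ParseInt error silently turns popularity into 0.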
@@ -127,10 +145,9 @@ func IndexFile(filePath string) error {
 			return fmt.Errorf("failed to index document: %v", err)
 		}
 
-		indexedDomains[domain] = true // Mark the domain as indexed
+		indexedDomains[normalized] = true
 	}
 
-	// Commit the batch
 	if err := bleveIndex.Batch(batch); err != nil {
 		return fmt.Errorf("error committing batch: %v", err)
 	}
@@ -139,13 +156,12 @@ func IndexFile(filePath string) error {
 		return fmt.Errorf("error reading file: %v", err)
 	}
 
-	printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
+	printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath)
 	return nil
 }
 
 // SearchIndex performs a full-text search on the indexed data.
 func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
-	// Create compound query
 	exactMatch := bleve.NewMatchQuery(queryStr) // Exact match
 	fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match
 	fuzzyMatch.Fuzziness = 2
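The hunk truncates before exactMatch and fuzzyMatch are combined; a disjunction is the usual way to take either, so presumably something along these lines (a sketch, not the file's confirmed code):

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// Sketch: match either the exact terms or a fuzzy variant (edit distance 2).
func buildCompoundQuery(queryStr string) query.Query {
	exact := bleve.NewMatchQuery(queryStr)
	fuzzy := bleve.NewFuzzyQuery(queryStr)
	fuzzy.Fuzziness = 2
	return bleve.NewDisjunctionQuery(exact, fuzzy)
}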
@@ -160,8 +176,8 @@ func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
 	req.Size = pageSize
 	req.From = (page - 1) * pageSize
 
-	// Sort by popularity
-	req.SortBy([]string{"popularity"})
+	// Sort primarily by relevance (score), then by popularity descending
+	req.SortBy([]string{"-_score", "-popularity"})
 
 	res, err := bleveIndex.Search(req)
 	if err != nil {
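In bleve's sort syntax a leading "-" means descending and "_score" is the relevance score, so the new order ranks hits by score and breaks ties with the stored popularity field, instead of letting raw popularity override relevance. A hypothetical caller:

import (
	"fmt"
	"log"
)

// Hypothetical usage: page 1, ten hits, best-scoring (then most popular) first.
func demoSearch() {
	docs, err := SearchIndex("golang", 1, 10)
	if err != nil {
		log.Printf("search failed: %v", err)
		return
	}
	for _, d := range docs {
		fmt.Println(d.Link, d.Title)
	}
}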