updated indexing & user agent generator
This commit is contained in:
parent 13e1d6119b
commit a9a6948a44

3 changed files with 73 additions and 35 deletions

indexer.go (60 lines changed)
@@ -3,6 +3,7 @@ package main
 import (
 	"bufio"
 	"fmt"
+	"net/url"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -10,10 +11,10 @@ import (
 	"time"
 
 	"github.com/blevesearch/bleve/v2"
+	"golang.org/x/net/publicsuffix"
 )
 
 // Document represents a single document to be indexed.
 // You can add more fields if needed.
 type Document struct {
 	ID   string `json:"id"`
 	Link string `json:"link"`
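Note: the hunk cuts off after Link. Judging from the fields IndexFile populates and the mappings below, the full struct plausibly looks like the sketch here; the Description and Popularity declarations are inferred, not shown in the diff.

// Sketch of the full struct, inferred from IndexFile and the field mappings;
// Description and Popularity are assumptions.
type Document struct {
	ID          string `json:"id"`
	Link        string `json:"link"`
	Title       string `json:"title"`
	Tags        string `json:"tags"`
	Description string `json:"description"`
	Popularity  int64  `json:"popularity"`
}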
@@ -48,16 +49,20 @@ func InitIndex() error {
 		// Index doesn't exist, create a new one
 		mapping := bleve.NewIndexMapping()
 
 		// Custom mapping for the document
 		docMapping := bleve.NewDocumentMapping()
 
-		// Text fields with custom analyzers for better tokenization
-		textFieldMapping := bleve.NewTextFieldMapping()
-		textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches
+		// Text fields
+		titleFieldMapping := bleve.NewTextFieldMapping()
+		titleFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("title", titleFieldMapping)
 
-		docMapping.AddFieldMappingsAt("title", textFieldMapping)
-		docMapping.AddFieldMappingsAt("description", textFieldMapping)
-		docMapping.AddFieldMappingsAt("tags", textFieldMapping)
+		descFieldMapping := bleve.NewTextFieldMapping()
+		descFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("description", descFieldMapping)
+
+		tagFieldMapping := bleve.NewTextFieldMapping()
+		tagFieldMapping.Analyzer = "standard"
+		docMapping.AddFieldMappingsAt("tags", tagFieldMapping)
 
 		// Numeric field for popularity
 		popularityMapping := bleve.NewNumericFieldMapping()
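The hunk ends before docMapping is attached to the index mapping. A minimal sketch of the likely wiring, assuming the document mapping is installed as the default; buildIndexMapping is a hypothetical helper, not a function in this repo.

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/mapping"
)

// Hypothetical helper: the same per-field setup as the hunk, plus the
// attachment step the diff truncates. The DefaultMapping assignment is
// an assumption about how InitIndex finishes building the mapping.
func buildIndexMapping() *mapping.IndexMappingImpl {
	m := bleve.NewIndexMapping()
	doc := bleve.NewDocumentMapping()
	for _, field := range []string{"title", "description", "tags"} {
		fm := bleve.NewTextFieldMapping()
		fm.Analyzer = "standard"
		doc.AddFieldMappingsAt(field, fm)
	}
	doc.AddFieldMappingsAt("popularity", bleve.NewNumericFieldMapping())
	m.DefaultMapping = doc
	return m
}

Functionally, separate FieldMapping values per field (as the commit introduces) and the loop above are equivalent; the point of splitting them is that individual fields can now diverge (say, a different analyzer for tags) without affecting the others.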
@@ -77,8 +82,19 @@ func InitIndex() error {
 	return nil
 }
 
+func normalizeDomain(rawURL string) string {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return rawURL
+	}
+	domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname())
+	if err != nil {
+		return parsed.Hostname() // fallback
+	}
+	return domain
+}
+
 // IndexFile reads a file line-by-line and indexes each line as a document.
 // Each line represents a simple document. Adjust parsing as needed.
 func IndexFile(filePath string) error {
 	file, err := os.Open(filePath)
 	if err != nil {
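For a sense of what EffectiveTLDPlusOne buys over a plain Hostname(): subdomains and multi-label public suffixes collapse to one registrable domain. A quick table-driven check with illustrative inputs (not from the repo):

import "testing"

// Illustrative cases: www/blog subdomains collapse to the registrable
// domain, including multi-label suffixes like co.uk.
func TestNormalizeDomain(t *testing.T) {
	cases := map[string]string{
		"https://www.example.com/page":      "example.com",
		"https://blog.example.co.uk/post/1": "example.co.uk",
	}
	for in, want := range cases {
		if got := normalizeDomain(in); got != want {
			t.Errorf("normalizeDomain(%q) = %q, want %q", in, got, want)
		}
	}
}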
@@ -88,27 +104,29 @@ func IndexFile(filePath string) error {
 
 	scanner := bufio.NewScanner(file)
 	batch := bleveIndex.NewBatch()
-	indexedDomains := make(map[string]bool) // Track indexed domains
+
+	// Map to track normalized domains we’ve already indexed
+	indexedDomains := make(map[string]bool)
 
 	for scanner.Scan() {
 		line := scanner.Text()
 
-		// Split the line into 5 fields: link|title|tags|description|popularity
+		// link|title|tags|description|popularity
 		parts := strings.SplitN(line, "|", 5)
 		if len(parts) < 5 {
-			continue // Skip malformed lines
+			continue
 		}
 
-		domain := parts[0]
+		// Normalize domain part so duplicates share the same “key”
+		normalized := normalizeDomain(parts[0])
 		popularity, _ := strconv.ParseInt(parts[4], 10, 64)
 
-		// Skip if the domain is already indexed
-		if indexedDomains[domain] {
+		if indexedDomains[normalized] {
 			continue
 		}
 
 		doc := Document{
-			ID:    domain, // Use the domain as the unique ID
+			ID:    normalized,
 			Link:  parts[0],
 			Title: parts[1],
 			Tags:  parts[2],
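For reference, a made-up row in the expected link|title|tags|description|popularity format and how SplitN carves it:

import (
	"fmt"
	"strings"
)

// Hypothetical sample row (not from the repo's data files).
func ExampleRowFormat() {
	line := "https://example.com|Example Site|demo,test|A sample description|42"
	parts := strings.SplitN(line, "|", 5)
	fmt.Println(parts[0]) // the link; normalizeDomain(parts[0]) becomes the ID
	fmt.Println(parts[4]) // popularity, parsed via strconv.ParseInt
	// Output:
	// https://example.com
	// 42
}

One caveat of the cap of 5: any extra "|" in a description lands in parts[4], where the discarded ParseInt error silently turns popularity into 0.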
@@ -127,10 +145,9 @@ func IndexFile(filePath string) error {
 			return fmt.Errorf("failed to index document: %v", err)
 		}
 
-		indexedDomains[domain] = true // Mark the domain as indexed
+		indexedDomains[normalized] = true
 	}
 
-	// Commit the batch
 	if err := bleveIndex.Batch(batch); err != nil {
 		return fmt.Errorf("error committing batch: %v", err)
 	}
@@ -139,13 +156,12 @@ func IndexFile(filePath string) error {
 		return fmt.Errorf("error reading file: %v", err)
 	}
 
-	printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
+	printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath)
 	return nil
 }
 
 // SearchIndex performs a full-text search on the indexed data.
 func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
-	// Create compound query
 	exactMatch := bleve.NewMatchQuery(queryStr) // Exact match
 	fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match
 	fuzzyMatch.Fuzziness = 2
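The hunk truncates before exactMatch and fuzzyMatch are combined; a disjunction is the usual way to take either, so presumably something along these lines (a sketch, not the file's confirmed code):

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// Sketch: match either the exact terms or a fuzzy variant (edit distance 2).
func buildCompoundQuery(queryStr string) query.Query {
	exact := bleve.NewMatchQuery(queryStr)
	fuzzy := bleve.NewFuzzyQuery(queryStr)
	fuzzy.Fuzziness = 2
	return bleve.NewDisjunctionQuery(exact, fuzzy)
}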
@@ -160,8 +176,8 @@ func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
 	req.Size = pageSize
 	req.From = (page - 1) * pageSize
 
-	// Sort by popularity
-	req.SortBy([]string{"popularity"})
+	// Sort primarily by relevance (score), then by popularity descending
+	req.SortBy([]string{"-_score", "-popularity"})
 
 	res, err := bleveIndex.Search(req)
 	if err != nil {
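In bleve's sort syntax a leading "-" means descending and "_score" is the relevance score, so the new order ranks hits by score and breaks ties with the stored popularity field, instead of letting raw popularity override relevance. A hypothetical caller:

import (
	"fmt"
	"log"
)

// Hypothetical usage: page 1, ten hits, best-scoring (then most popular) first.
func demoSearch() {
	docs, err := SearchIndex("golang", 1, 10)
	if err != nil {
		log.Printf("search failed: %v", err)
		return
	}
	for _, d := range docs {
		fmt.Println(d.Link, d.Title)
	}
}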