199 lines
5.1 KiB
Go
199 lines
5.1 KiB
Go
|
package main
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"fmt"
|
||
|
"os"
|
||
|
"path/filepath"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"time"
|
||
|
|
||
|
"github.com/blevesearch/bleve/v2"
|
||
|
)
|
||
|
|
||
|
// Document is a single entry stored in and returned from the search index.
// The JSON tags fix the field names used when a Document is serialized.
// Extend the struct if more data has to travel with a search hit.
type Document struct {
	ID          string `json:"id"`          // identifier of the document
	Link        string `json:"link"`        // link of the indexed entry
	Title       string `json:"title"`       // title text
	Tags        string `json:"tags"`        // tags, kept as one string
	Description string `json:"description"` // descriptive text
	Popularity  int64  `json:"popularity"`  // numeric popularity value
}
|
||
|
|
||
|
var (
|
||
|
// Global Bleve index handle
|
||
|
bleveIndex bleve.Index
|
||
|
)
|
||
|
|
||
|
func startPeriodicIndexing(filePath string, interval time.Duration) {
|
||
|
go func() {
|
||
|
for {
|
||
|
printDebug("Refreshing index from %s", filePath)
|
||
|
err := IndexFile(filePath)
|
||
|
if err != nil {
|
||
|
printErr("Failed to refresh index: %v", err)
|
||
|
}
|
||
|
time.Sleep(interval)
|
||
|
}
|
||
|
}()
|
||
|
}
|
||
|
|
||
|
// InitIndex ensures that the Bleve index is created or opened.
|
||
|
func InitIndex() error {
|
||
|
idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve"))
|
||
|
if err == bleve.ErrorIndexPathDoesNotExist {
|
||
|
// Index doesn't exist, create a new one
|
||
|
mapping := bleve.NewIndexMapping()
|
||
|
|
||
|
// Custom mapping for the document
|
||
|
docMapping := bleve.NewDocumentMapping()
|
||
|
|
||
|
// Text fields with custom analyzers for better tokenization
|
||
|
textFieldMapping := bleve.NewTextFieldMapping()
|
||
|
textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches
|
||
|
|
||
|
docMapping.AddFieldMappingsAt("title", textFieldMapping)
|
||
|
docMapping.AddFieldMappingsAt("description", textFieldMapping)
|
||
|
docMapping.AddFieldMappingsAt("tags", textFieldMapping)
|
||
|
|
||
|
// Numeric field for popularity
|
||
|
popularityMapping := bleve.NewNumericFieldMapping()
|
||
|
docMapping.AddFieldMappingsAt("popularity", popularityMapping)
|
||
|
|
||
|
mapping.AddDocumentMapping("Document", docMapping)
|
||
|
|
||
|
idx, err = bleve.New(filepath.Join(config.DriveCache.Path, "index.bleve"), mapping)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("failed to create index: %v", err)
|
||
|
}
|
||
|
} else if err != nil {
|
||
|
return fmt.Errorf("failed to open index: %v", err)
|
||
|
}
|
||
|
|
||
|
bleveIndex = idx
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// IndexFile reads a file line-by-line and indexes each line as a document.
|
||
|
// Each line represents a simple document. Adjust parsing as needed.
|
||
|
func IndexFile(filePath string) error {
|
||
|
file, err := os.Open(filePath)
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("unable to open file for indexing: %v", err)
|
||
|
}
|
||
|
defer file.Close()
|
||
|
|
||
|
scanner := bufio.NewScanner(file)
|
||
|
batch := bleveIndex.NewBatch()
|
||
|
indexedDomains := make(map[string]bool) // Track indexed domains
|
||
|
|
||
|
for scanner.Scan() {
|
||
|
line := scanner.Text()
|
||
|
|
||
|
// Split the line into 5 fields: link|title|tags|description|popularity
|
||
|
parts := strings.SplitN(line, "|", 5)
|
||
|
if len(parts) < 5 {
|
||
|
continue // Skip malformed lines
|
||
|
}
|
||
|
|
||
|
domain := parts[0]
|
||
|
popularity, _ := strconv.ParseInt(parts[4], 10, 64)
|
||
|
|
||
|
// Skip if the domain is already indexed
|
||
|
if indexedDomains[domain] {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
doc := Document{
|
||
|
ID: domain, // Use the domain as the unique ID
|
||
|
Link: parts[0],
|
||
|
Title: parts[1],
|
||
|
Tags: parts[2],
|
||
|
Description: parts[3],
|
||
|
Popularity: popularity,
|
||
|
}
|
||
|
|
||
|
err := batch.Index(doc.ID, map[string]interface{}{
|
||
|
"title": doc.Title,
|
||
|
"description": doc.Description,
|
||
|
"link": doc.Link,
|
||
|
"tags": doc.Tags,
|
||
|
"popularity": doc.Popularity,
|
||
|
})
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("failed to index document: %v", err)
|
||
|
}
|
||
|
|
||
|
indexedDomains[domain] = true // Mark the domain as indexed
|
||
|
}
|
||
|
|
||
|
// Commit the batch
|
||
|
if err := bleveIndex.Batch(batch); err != nil {
|
||
|
return fmt.Errorf("error committing batch: %v", err)
|
||
|
}
|
||
|
|
||
|
if err := scanner.Err(); err != nil {
|
||
|
return fmt.Errorf("error reading file: %v", err)
|
||
|
}
|
||
|
|
||
|
printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath)
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// SearchIndex performs a full-text search on the indexed data.
|
||
|
func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
|
||
|
// Create compound query
|
||
|
exactMatch := bleve.NewMatchQuery(queryStr) // Exact match
|
||
|
fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match
|
||
|
fuzzyMatch.Fuzziness = 2
|
||
|
prefixMatch := bleve.NewPrefixQuery(queryStr) // Prefix match
|
||
|
|
||
|
query := bleve.NewDisjunctionQuery(exactMatch, fuzzyMatch, prefixMatch)
|
||
|
|
||
|
req := bleve.NewSearchRequest(query)
|
||
|
req.Fields = []string{"title", "description", "link", "tags", "popularity"}
|
||
|
|
||
|
// Pagination
|
||
|
req.Size = pageSize
|
||
|
req.From = (page - 1) * pageSize
|
||
|
|
||
|
// Sort by popularity
|
||
|
req.SortBy([]string{"popularity"})
|
||
|
|
||
|
res, err := bleveIndex.Search(req)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("search error: %v", err)
|
||
|
}
|
||
|
|
||
|
var docs []Document
|
||
|
for _, hit := range res.Hits {
|
||
|
title := fmt.Sprintf("%v", hit.Fields["title"])
|
||
|
description := fmt.Sprintf("%v", hit.Fields["description"])
|
||
|
link := fmt.Sprintf("%v", hit.Fields["link"])
|
||
|
tags := fmt.Sprintf("%v", hit.Fields["tags"])
|
||
|
popularity := int64(0)
|
||
|
|
||
|
if pop, ok := hit.Fields["popularity"].(float64); ok {
|
||
|
popularity = int64(pop)
|
||
|
}
|
||
|
|
||
|
if link == "<nil>" || title == "<nil>" {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
docs = append(docs, Document{
|
||
|
ID: hit.ID,
|
||
|
Title: title,
|
||
|
Description: description,
|
||
|
Link: link,
|
||
|
Tags: tags,
|
||
|
Popularity: popularity,
|
||
|
})
|
||
|
}
|
||
|
|
||
|
return docs, nil
|
||
|
}
|