diff --git a/.gitignore b/.gitignore
index 118b838..5f5aeab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ image_cache/
 cache/
 *.min.js
 *.min.css
-qgato
\ No newline at end of file
+qgato
+test.py
\ No newline at end of file
diff --git a/README.md b/README.md
index 23e8bf5..5ad3337 100644
--- a/README.md
+++ b/README.md
@@ -7,30 +7,30 @@
-A self-hosted private metasearch engine that aims to be more resource-efficient than its competition.
+A self-hosted private search engine designed to be scalable and more resource-efficient than its competitors.
 # Bare in mind that this project is still WIP
 
-## Comparison to other search engines
+## Comparison to other open-source search engines
 
-| Feature                    | Whoogle [1]        | Araa-Search               | LibreY                   | 4get                     | SearchXNG                 | *QGato*                                               |
-| :------------------------- | ------------------ | ------------------------- | ------------------------ | ------------------------ | ------------------------- | ---------------------------------------------------- |
-| Works without JavaScript   | ✅                 | ✅                        | ✅                       | ✅                       | ✅                        | ✅                                                    |
-| Music search               | ❓                 | ❌                        | ❌                       | ✅                       | ✅                        | ✅                                                    |
-| Torrent search             | ❌                 | ✅                        | ✅                       | ❌                       | ✅                        | ✅                                                    |
-| API                        | ❌                 | ❓ [2]                    | ✅                       | ✅                       | ✅                        | ✅                                                    |
-| Scalable                   | ❌                 | ❌                        | ❌                       | ❌                       | ❌                        | ✅                                                    |
-| Not Resource Hungry        | ❓ Moderate        | ❌ Very resource hungry   | ❌ Moderate 200-400mb~   | ❌ Moderate 200-400mb~   | ❌ Moderate 200-300MiB~   | ✅ about 15-20MiB at idle, 17-22MiB when searching    |
-| Result caching             | ❌                 | ❌                        | ❓                       | ❓                       | ❓                        | ✅                                                    |
-| Dynamic Page Loading       | ❓ Not specified   | ❌                        | ❌                       | ❌                       | ✅                        | ✅                                                    |
-| User themable              | ❌                 | ✅                        | ❌                       | ❌                       | ✅[3]                     | ✅                                                    |
-| Unusual logo choice        | ❌                 | ❌                        | ❌                       | ✅                       | ❌                        | ❌                                                    |
+| Feature                    | Whoogle [1]   | Araa-Search               | LibreY                   | 4get                     | SearchXNG                 | *QGato*                                  |
+| :------------------------- | ------------- | ------------------------- | ------------------------ | ------------------------ | ------------------------- | --------------------------------------- |
+| Works without JavaScript   | ✅            | ✅                        | ✅                       | ✅                       | ✅                        | ✅                                       |
+| Music search               | ❌            | ❌                        | ❌                       | ✅                       | ✅                        | ✅                                       |
+| Torrent search             | ❌            | ✅                        | ✅                       | ❌                       | ✅                        | ✅                                       |
+| API                        | ❌            | ❌ [2]                    | ✅                       | ✅                       | ✅                        | ✅                                       |
+| Scalable                   | ❌            | ❌                        | ❌                       | ❌                       | ❌                        | ✅                                       |
+| Not Resource Hungry        | ❓ Moderate   | ❌ Very resource hungry   | ❌ Moderate 200-400mb~   | ❌ Moderate 200-400mb~   | ❌ Moderate 200-300MiB~   | ✅ about 15-30MiB even when searching    |
+| Result caching             | ❓            | ❓                        | ❓                       | ❓                       | ❓                        | ✅                                       |
+| Dynamic Page Loading       | ❌            | ❌                        | ❌                       | ❌                       | ✅                        | ✅                                       |
+| User themable              | ❌            | ✅                        | ❌                       | ❌                       | ❓[3]                     | ✅                                       |
+| Unusual logo choice        | ❌            | ❌                        | ❌                       | ✅                       | ❌                        | ❌                                       |
 
 [1]: I was not able to check this since their site does not work, same for the community instances.
 
-[2]: In the project repo they specify that it has API, but It looks like they are no loger supporting it. Or just removed "API" button and documentation, since I was not able to find it anymore.
+[2]: In the project repo they specify that it has an API, but it looks like they are no longer supporting it, or they just removed the "API" button and documentation, since I was not able to find it anymore.
 
 [3]: It is called 'User Themable' because you want to give the user freedom of choice for their theme, not by hard-setting one theme in the backend and calling it themable.
 
@@ -48,7 +48,7 @@ A self-hosted private
 ### For Self-Hosting
 
 - **Self-hosted option** - Run on your own server for even more privacy.
-- **Lightweight** - Low memory footprint (15-22MiB) even during searches.
+- **Lightweight** - Low memory footprint (15-30MiB) even during searches.
 - **Decentralized** - No single point of failure.
 - **Results caching in RAM** - Faster response times through caching.
 - **Configurable** - Tweak features via `config.ini`.
@@ -67,7 +67,7 @@ A self-hosted private
 
 ### Prerequisites
 
-- Go (version 1.18 or higher recommended)
+- Go (version 1.23 or higher recommended)
 - Git (unexpected)
 - Access to the internet for fetching results (even more unexpected)
 
diff --git a/agent.go b/agent.go
index 296b4e4..6333102 100755
--- a/agent.go
+++ b/agent.go
@@ -3,7 +3,7 @@ package main
 import (
 	"encoding/json"
 	"fmt"
-	"io/ioutil"
+	"io"
 	"math/rand"
 	"net/http"
 	"sort"
@@ -40,13 +40,33 @@ var (
 
 func fetchLatestBrowserVersions() (BrowserData, error) {
 	url := "https://raw.githubusercontent.com/Fyrd/caniuse/master/fulldata-json/data-2.0.json"
 
-	resp, err := http.Get(url)
+	// // Optional: skip TLS verification to avoid certificate errors
+	// transport := &http.Transport{
+	// 	TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+	// }
+
+	// Increase the HTTP client timeout
+	client := &http.Client{
+		Timeout: 30 * time.Second,
+		// Transport: transport,
+	}
+
+	// Build the request manually to set headers
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return BrowserData{}, err
+	}
+	// Custom user agent and English language preference
+	req.Header.Set("User-Agent", "MyCustomAgent/1.0 (compatible; +https://example.com)")
+	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
+
+	resp, err := client.Do(req)
 	if err != nil {
 		return BrowserData{}, err
 	}
 	defer resp.Body.Close()
 
-	body, err := ioutil.ReadAll(resp.Body)
+	body, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return BrowserData{}, err
 	}
@@ -109,7 +129,7 @@ func randomUserAgent() (string, error) {
 		return "", err
 	}
 
-	rand.Seed(time.Now().UnixNano())
+	rand := rand.New(rand.NewSource(time.Now().UnixNano()))
 
 	// Simulated browser usage statistics (in percentages)
 	usageStats := map[string]float64{
@@ -161,6 +181,7 @@ func randomUserAgent() (string, error) {
 		}
 	}
 
+	// Fallback to the last version if none matched
 	if version == "" {
 		version = versions[len(versions)-1].Version
 	}
@@ -240,11 +261,11 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string {
 		browserType = "Firefox"
 	}
 
-	// Get the latest version for the browser type
+	// Get the latest version for that browser
 	var latestVersion string
-	if browserType == "Firefox" {
+	if browserType == "Firefox" && len(newVersions.Firefox) > 0 {
 		latestVersion = newVersions.Firefox[0].Version
-	} else if browserType == "Chromium" {
+	} else if browserType == "Chromium" && len(newVersions.Chromium) > 0 {
 		latestVersion = newVersions.Chromium[0].Version
 	}
 
@@ -252,7 +273,7 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string {
 	return generateUserAgent(browserType, latestVersion)
 }
 
-func periodicUpdate() {
+func periodicAgentUpdate() {
 	for {
 		// Sleep for a random interval between 1 and 2 days
 		time.Sleep(time.Duration(24+rand.Intn(24)) * time.Hour)
@@ -309,12 +330,8 @@ func GetNewUserAgent(cacheKey string) (string, error) {
 	return userAgent, nil
 }
 
-func init() {
-	go periodicUpdate()
-}
-
 // func main() {
-// 	go periodicUpdate() // not needed here
+// 	go periodicAgentUpdate() // not needed here
 
 // 	cacheKey := "image-search"
 // 	userAgent, err := GetUserAgent(cacheKey)
diff --git a/cache-images.go b/cache-images.go
index 16d686e..4e551cd 100644
--- a/cache-images.go
+++ b/cache-images.go
@@ -24,15 +24,15 @@ import (
 )
 
 var (
-	cachingImages   = make(map[string]*sync.Mutex)
-	cachingImagesMu sync.Mutex
-	// cachingSemaphore = make(chan struct{}, 100) // Limit to concurrent downloads
+	cachingImages    = make(map[string]*sync.Mutex)
+	cachingImagesMu  sync.Mutex
+	cachingSemaphore = make(chan struct{}, 100)
 
 	invalidImageIDs   = make(map[string]struct{})
 	invalidImageIDsMu sync.Mutex
-	imageURLMap       = make(map[string]string) // mapping from imageID_type to imageURL
-	imageURLMapMu     sync.RWMutex              // mutex for thread-safe access
+	imageURLMap   = make(map[string]string)
+	imageURLMapMu sync.RWMutex
 )
 
 func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error) {
@@ -49,7 +49,13 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error
 		filename = fmt.Sprintf("%s_full.webp", imageID)
 	}
 
-	cachedImagePath := filepath.Join(config.DriveCache.Path, filename)
+	// Make sure we store inside: config.DriveCache.Path / images
+	imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
+	if err := os.MkdirAll(imageCacheDir, 0755); err != nil {
+		return "", false, fmt.Errorf("couldn't create images folder: %v", err)
+	}
+
+	cachedImagePath := filepath.Join(imageCacheDir, filename)
 	tempImagePath := cachedImagePath + ".tmp"
 
 	// Check if the image is already cached
@@ -73,9 +79,8 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error
 		return cachedImagePath, true, nil
 	}
 
-	// // Limit max concurrent downloads
-	// cachingSemaphore <- struct{}{}        // Acquire a token
-	// defer func() { <-cachingSemaphore }() // Release the token
+	cachingSemaphore <- struct{}{}
+	defer func() { <-cachingSemaphore }()
 
 	// Create a custom http.Client that skips SSL certificate verification
 	client := &http.Client{
@@ -217,7 +222,8 @@ func handleImageServe(w http.ResponseWriter, r *http.Request) {
 		imageType = parts[1]
 
 		filename := fmt.Sprintf("%s_%s.webp", imageID, imageType)
-		cachedImagePath := filepath.Join(config.DriveCache.Path, filename)
+		// Adjust to read from config.DriveCache.Path / images
+		cachedImagePath := filepath.Join(config.DriveCache.Path, "images", filename)
 
 		if hasExtension && imageType == "thumb" {
 			// Requesting cached image (thumbnail or full)
@@ -329,7 +335,7 @@ func handleImageStatus(w http.ResponseWriter, r *http.Request) {
 		// Check thumbnail first
 		for _, ext := range extensions {
 			thumbFilename := fmt.Sprintf("%s_thumb.%s", id, ext)
-			thumbPath := filepath.Join(config.DriveCache.Path, thumbFilename)
+			thumbPath := filepath.Join(config.DriveCache.Path, "images", thumbFilename)
 
 			if _, err := os.Stat(thumbPath); err == nil {
 				statusMap[id] = fmt.Sprintf("/image/%s_thumb.%s", id, ext)
@@ -342,7 +348,7 @@ func handleImageStatus(w http.ResponseWriter, r *http.Request) {
 		if !imageReady {
 			for _, ext := range extensions {
 				fullFilename := fmt.Sprintf("%s_full.%s", id, ext)
-				fullPath := filepath.Join(config.DriveCache.Path, fullFilename)
+				fullPath := filepath.Join(config.DriveCache.Path, "images", fullFilename)
 
 				if _, err := os.Stat(fullPath); err == nil {
 					statusMap[id] = fmt.Sprintf("/image/%s_full.%s", id, ext)
@@ -447,7 +453,9 @@ func cleanExpiredCachedImages() {
 }
 
 func cleanupCache() {
-	files, err := os.ReadDir(config.DriveCache.Path)
+	// Read from: config.DriveCache.Path / images
+	imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
+	files, err := os.ReadDir(imageCacheDir)
 	if err != nil {
 		printErr("Failed to read DriveCache directory: %v", err)
 		return
@@ -462,19 +470,17 @@ func cleanupCache() {
 			continue
 		}
 
-		filePath := filepath.Join(config.DriveCache.Path, file.Name())
+		filePath := filepath.Join(imageCacheDir, file.Name())
 
-		// Check for expired files based on modification time
 		if config.DriveCache.Duration > 0 && time.Since(info.ModTime()) > config.DriveCache.Duration {
 			if err := os.Remove(filePath); err == nil {
 				printDebug("Removed expired cache file: %s", filePath)
 			} else {
 				printErr("Failed to remove expired cache file: %s", filePath)
 			}
-			continue // Skip adding this file to the list
+			continue
 		}
 
-		// Accumulate total size and store file info for potential deletion
 		totalSize += uint64(info.Size())
 		fileInfos = append(fileInfos, info)
 	}
@@ -491,7 +497,7 @@ func cleanupCache() {
 			break
 		}
 
-		filePath := filepath.Join(config.DriveCache.Path, info.Name())
+		filePath := filepath.Join(imageCacheDir, info.Name())
 		fileSize := uint64(info.Size())
 
 		if err := os.Remove(filePath); err == nil {
diff --git a/cache.go b/cache.go
index b5ad880..ac2902d 100644
--- a/cache.go
+++ b/cache.go
@@ -162,7 +162,7 @@ func (rc *ResultsCache) keyToString(key CacheKey) string {
 
 // checkAndCleanCache removes items if memory usage exceeds the limit.
 func (rc *ResultsCache) checkAndCleanCache() {
-	for rc.currentMemoryUsage() > config.RamCache.MaxUsageBytes {
+	if rc.currentMemoryUsage() > config.RamCache.MaxUsageBytes {
 		rc.cleanOldestItems()
 	}
 }
diff --git a/config.go b/config.go
index c3aec6b..18d83cf 100644
--- a/config.go
+++ b/config.go
@@ -23,33 +23,45 @@ type CacheConfig struct {
 }
 
 type Config struct {
-	Port              int      // Added
-	AuthCode          string   // Added
-	PeerID            string   // Added
-	Peers             []string
-	Domain            string   // Added
-	NodesEnabled      bool     // Added
-	CrawlerEnabled    bool     // Added
-	WebsiteEnabled    bool     // Added
-	RamCacheEnabled   bool
-	DriveCacheEnabled bool     // Added
-	LogLevel          int      // Added
+	Port                       int      // Added
+	AuthCode                   string   // Added
+	PeerID                     string   // Added
+	Peers                      []string
+	Domain                     string   // Added
+	NodesEnabled               bool     // Added
+	CrawlerEnabled             bool     // Added
+	IndexerEnabled             bool     // Added
+	WebsiteEnabled             bool     // Added
+	RamCacheEnabled            bool
+	DriveCacheEnabled          bool     // Added
+	LogLevel                   int      // Added
+	ConcurrentStandardCrawlers int
+	ConcurrentChromeCrawlers   int
+	CrawlingInterval           time.Duration // Refresh crawled results in...
+	MaxPagesPerDomain          int           // Max pages to crawl per domain
+	IndexBatchSize             int
 
 	DriveCache CacheConfig
 	RamCache   CacheConfig
 }
 
 var defaultConfig = Config{
-	Port:              5000,
-	Domain:            "localhost",
-	Peers:             []string{},
-	AuthCode:          generateStrongRandomString(64),
-	NodesEnabled:      false,
-	CrawlerEnabled:    true,
-	WebsiteEnabled:    true,
-	RamCacheEnabled:   true,
-	DriveCacheEnabled: false,
-	LogLevel:          1,
+	Port:                       5000,
+	Domain:                     "localhost",
+	Peers:                      []string{},
+	AuthCode:                   generateStrongRandomString(64),
+	NodesEnabled:               false,
+	CrawlerEnabled:             true,
+	IndexerEnabled:             false,
+	WebsiteEnabled:             true,
+	RamCacheEnabled:            true,
+	DriveCacheEnabled:          false,
+	ConcurrentStandardCrawlers: 12,
+	ConcurrentChromeCrawlers:   4,
+	CrawlingInterval:           24 * time.Hour,
+	MaxPagesPerDomain:          10,
+	IndexBatchSize:             50,
+	LogLevel:                   1,
 	DriveCache: CacheConfig{
 		Duration: 48 * time.Hour, // Added
 		Path:     "./cache",      // Added
@@ -105,6 +117,15 @@ func createConfig() error {
 		config.Domain = defaultConfig.Domain
 	}
 
+	// printMessage("Use Indexer? (YES/no): ")
+	// indexerChoice, _ := reader.ReadString('\n')
+	// indexerChoice = strings.TrimSpace(strings.ToLower(indexerChoice))
+	// if indexerChoice == "no" {
+	// 	config.IndexerEnabled = false
+	// } else {
+	// 	config.IndexerEnabled = true
+	// }
+
 	// Cache settings
 	printMessage("Would you like to configure Cache settings (yes/NO): ")
 	configureCache, _ := reader.ReadString('\n')
@@ -181,7 +202,7 @@ func createConfig() error {
 		} else {
 			config.DriveCache.MaxUsageBytes = parseMaxUsageDrive(driveMaxUsage, drivePath)
 			if config.DriveCache.MaxUsageBytes == 0 {
-				printWarn("Invalid DriveCache max usage, using default (1 TiB).")
+				printWarn("Invalid DriveCache max usage, using default.")
 				config.DriveCache.MaxUsageBytes = defaultConfig.DriveCache.MaxUsageBytes
 			}
 		}
@@ -201,13 +222,6 @@ func createConfig() error {
 		printMessage("Generated connection code: %s\n", config.AuthCode)
 	}
 
-	// Set other default values
-	config.NodesEnabled = defaultConfig.NodesEnabled
-	config.CrawlerEnabled = defaultConfig.CrawlerEnabled
-	config.WebsiteEnabled = defaultConfig.WebsiteEnabled
-	config.LogLevel = defaultConfig.LogLevel
-
-	// Save configuration to file
 	saveConfig(config)
 	printInfo("Configuration saved successfully.")
 	return nil
@@ -232,9 +246,16 @@ func saveConfig(config Config) {
 	featuresSec := cfg.Section("Features")
 	featuresSec.Key("Nodes").SetValue(strconv.FormatBool(config.NodesEnabled))
 	featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
+	featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
 	featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
-	featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled))
-	featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled))
+
+	// Indexer section
+	indexerSec := cfg.Section("Indexer")
+	indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
+	indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers))
+	indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
+	indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
+	indexerSec.Key("IndexBatchSize").SetValue(strconv.Itoa(config.IndexBatchSize))
 
 	// DriveCache section
 	driveSec := cfg.Section("DriveCache")
@@ -261,51 +282,63 @@ func loadConfig() Config {
 	}
 
 	// Server
-	port, _ := cfg.Section("Server").Key("Port").Int()
-	domain := cfg.Section("Server").Key("Domain").String()
-	logLevel, _ := cfg.Section("Server").Key("LogLevel").Int()
+	port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi)
+	domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain)
+	logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi)
 
 	// Peers
-	authCode := cfg.Section("Peers").Key("AuthCode").String()
-	peersStr := cfg.Section("Peers").Key("Peers").String()
-	peers := strings.Split(peersStr, ",")
+	authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode)
+	peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",")
 
 	// Features
-	nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool()
-	crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool()
-	websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool()
-	ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool()
-	driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool()
+	nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled)
+	crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled)
+	indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled)
+	websiteEnabled := getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled)
+	ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled)
+	driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
+
+	// Indexing
+	concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
+	concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
+	crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
+	maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
+	indexBatchSize := getConfigValue(cfg.Section("Indexer").Key("IndexBatchSize"), defaultConfig.IndexBatchSize, strconv.Atoi)
 
 	// DriveCache
-	driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String())
-	drivePath := cfg.Section("DriveCache").Key("Path").String()
-	driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath)
+	driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
+	drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path)
+	driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath)
 	// maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
 	// if maxConcurrentDownloads == 0 {
 	// 	maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
 	// }
 
 	// RamCache
-	ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String())
-	ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String())
+	ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration)
+	ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
 
 	return Config{
-		Port:              port,
-		Domain:            domain,
-		LogLevel:          logLevel,
-		AuthCode:          authCode, // Assign AuthCode here
-		Peers:             peers,
-		NodesEnabled:      nodesEnabled,
-		CrawlerEnabled:    crawlerEnabled,
-		WebsiteEnabled:    websiteEnabled,
-		RamCacheEnabled:   ramCacheEnabled,
-		DriveCacheEnabled: driveCacheEnabled,
+		Port:                       port,
+		Domain:                     domain,
+		LogLevel:                   logLevel,
+		AuthCode:                   authCode,
+		Peers:                      peers,
+		NodesEnabled:               nodesEnabled,
+		CrawlerEnabled:             crawlerEnabled,
+		IndexerEnabled:             indexerEnabled,
+		WebsiteEnabled:             websiteEnabled,
+		RamCacheEnabled:            ramCacheEnabled,
+		DriveCacheEnabled:          driveCacheEnabled,
+		ConcurrentStandardCrawlers: concurrentStandardCrawlers,
+		ConcurrentChromeCrawlers:   concurrentChromeCrawlers,
+		CrawlingInterval:           crawlingInterval,
+		MaxPagesPerDomain:          maxPagesPerDomain,
+		IndexBatchSize:             indexBatchSize,
 		DriveCache: CacheConfig{
 			Duration:      driveDuration,
 			MaxUsageBytes: driveMaxUsage,
 			Path:          drivePath,
-			// MaxConcurrentThumbnailDownloads: maxConcurrentDownloads,
 		},
 		RamCache: CacheConfig{
 			Duration:      ramDuration,
@@ -314,6 +347,34 @@ func loadConfig() Config {
 	}
 }
 
+// getConfigValue retrieves a configuration value or returns a default value from defaultConfig.
+func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
+	if key == nil || key.String() == "" {
+		return defaultValue
+	}
+	value, err := parseFunc(key.String())
+	if err != nil {
+		return defaultValue
+	}
+	return value
+}
+
+// getConfigValueString retrieves a string value or falls back to the default.
+func getConfigValueString(key *ini.Key, defaultValue string) string {
+	if key == nil || key.String() == "" {
+		return defaultValue
+	}
+	return key.String()
+}
+
+// getConfigValueBool retrieves a boolean value or falls back to the default.
+func getConfigValueBool(key *ini.Key, defaultValue bool) bool {
+	if key == nil || key.String() == "" {
+		return defaultValue
+	}
+	return key.MustBool(defaultValue)
+}
+
 // Helper to parse MaxUsage string into bytes
 func parseMaxUsageRam(value string) uint64 {
 	const GiB = 1024 * 1024 * 1024
diff --git a/crawler-extraction.go b/crawler-extraction.go
new file mode 100644
index 0000000..4ce8b9d
--- /dev/null
+++ b/crawler-extraction.go
@@ -0,0 +1,248 @@
+package main
+
+import (
+	"context"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"github.com/chromedp/cdproto/emulation"
+	"github.com/chromedp/chromedp"
+	"github.com/go-shiori/go-readability"
+	"golang.org/x/net/html"
+)
+
+// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
+func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
+	// 1. Standard HTML parse
+	title, desc, keywords := extractStandard(pageURL, userAgent)
+
+	// 2. Fallback: go-readability
+	if title == "" || desc == "" {
+		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
+	}
+
+	// If still empty, return ("", "", "")
+	if title == "" || desc == "" {
+		return "", "", ""
+	}
+	return sanitize(title), sanitize(desc), sanitize(keywords)
+}
+
+// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
+func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
+	// Create context
+	ctx, cancel := chromedp.NewContext(context.Background())
+	defer cancel()
+
+	var renderedHTML string
+	err := chromedp.Run(ctx,
+		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
+		chromedp.Navigate(pageURL),
+		chromedp.Sleep(2*time.Second), // Let JS run a bit
+		chromedp.OuterHTML("html", &renderedHTML),
+	)
+	if err != nil {
+		printDebug("chromedp error for %s: %v", pageURL, err)
+		return "", "", ""
+	}
+
+	doc, err := html.Parse(strings.NewReader(renderedHTML))
+	if err != nil {
+		printDebug("chromedp parse error for %s: %v", pageURL, err)
+		return "", "", ""
+	}
+
+	return extractParsedDOM(doc)
+}
+
+// extractStandard does the normal HTML parse with OG, Twitter, etc.
+func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
+	client := &http.Client{Timeout: 15 * time.Second}
+	req, err := http.NewRequest("GET", pageURL, nil)
+	if err != nil {
+		printDebug("Failed to create request for %s: %v", pageURL, err)
+		return
+	}
+	req.Header.Set("User-Agent", userAgent)
+	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
+
+	resp, err := client.Do(req)
+	if err != nil {
+		printDebug("Failed to GET %s: %v", pageURL, err)
+		return
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
+		return
+	}
+
+	doc, err := html.Parse(resp.Body)
+	if err != nil {
+		printDebug("HTML parse error for %s: %v", pageURL, err)
+		return
+	}
+
+	return extractParsedDOM(doc)
+}
+
+// extractParsedDOM uses the same logic to parse 2 {
+		return true
+	}
+	return false
+}
+
+// sanitize removes pipes/newlines so they don't break our output format.
+func sanitize(input string) string {
+	input = strings.ReplaceAll(input, "|", " ")
+	input = strings.ReplaceAll(input, "\n", " ")
+	return strings.TrimSpace(input)
+}
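+
+// Illustrative call order (a sketch, not code from this file; the real
+// scheduling lives in crawler.go): try the cheap standard fetch first and
+// only fall back to a headless Chrome pass when static HTML yields nothing.
+//
+//	ua, _ := GetUserAgent("crawler-std")
+//	title, desc, keywords := fetchPageMetadataStandard(pageURL, ua)
+//	if title == "" || desc == "" {
+//		title, desc, keywords = fetchPageMetadataChrome(pageURL, ua)
+//	}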
diff --git a/crawler-visited.go b/crawler-visited.go
new file mode 100644
index 0000000..bfa1af9
--- /dev/null
+++ b/crawler-visited.go
@@ -0,0 +1,106 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"sync"
+)
+
+// VisitedStore handles deduplicating visited URLs with a map and a periodic flush to disk.
+type VisitedStore struct {
+	mu      sync.Mutex
+	visited map[string]bool
+	toFlush []string
+
+	filePath  string
+	batchSize int // how many new URLs we batch before flushing
+}
+
+// NewVisitedStore creates or loads the visited URLs from filePath.
+func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
+	store := &VisitedStore{
+		visited:   make(map[string]bool),
+		filePath:  filePath,
+		batchSize: batchSize,
+	}
+
+	// Attempt to load existing visited URLs (if file exists).
+	if _, err := os.Stat(filePath); err == nil {
+		if err := store.loadFromFile(); err != nil {
+			return nil, fmt.Errorf("loadFromFile error: %w", err)
+		}
+	}
+	return store, nil
+}
+
+// loadFromFile loads visited URLs from the store’s file. One URL per line.
+func (s *VisitedStore) loadFromFile() error {
+	f, err := os.Open(s.filePath)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		url := scanner.Text()
+		s.visited[url] = true
+	}
+	return scanner.Err()
+}
+
+// AlreadyVisited returns true if the URL is in the store.
+func (s *VisitedStore) AlreadyVisited(url string) bool {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.visited[url]
+}
+
+// MarkVisited adds the URL to the store if not already present, and triggers a flush if batchSize is reached.
+func (s *VisitedStore) MarkVisited(url string) (added bool, err error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.visited[url] {
+		return false, nil
+	}
+	// Mark in memory
+	s.visited[url] = true
+	s.toFlush = append(s.toFlush, url)
+
+	// Flush if we have enough new URLs
+	if len(s.toFlush) >= s.batchSize {
+		if err := s.flushToFileUnlocked(); err != nil {
+			return false, err
+		}
+	}
+	return true, nil
+}
+
+// Flush everything in s.toFlush to file, then clear the buffer.
+func (s *VisitedStore) Flush() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.flushToFileUnlocked()
+}
+
+// flushToFileUnlocked writes s.toFlush lines to the store file, then clears s.toFlush.
+func (s *VisitedStore) flushToFileUnlocked() error {
+	if len(s.toFlush) == 0 {
+		return nil
+	}
+	f, err := os.OpenFile(s.filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	for _, url := range s.toFlush {
+		if _, err := fmt.Fprintln(f, url); err != nil {
+			return err
+		}
+	}
+	s.toFlush = nil
+	return nil
+}
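+
+// Example usage (an illustrative sketch, not called anywhere in this file):
+// create a store that flushes every 50 URLs, record one URL, and persist
+// whatever is still buffered before shutdown.
+//
+//	store, err := NewVisitedStore("visited-urls.txt", 50)
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	if added, err := store.MarkVisited("https://example.com"); err == nil && added {
+//		// first time this URL has been seen
+//	}
+//	defer store.Flush()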
diff --git a/crawler.go b/crawler.go
new file mode 100644
index 0000000..8caa073
--- /dev/null
+++ b/crawler.go
@@ -0,0 +1,210 @@
+package main
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Create a global or config-level visited store
+var visitedStore *VisitedStore
+
+// webCrawlerInit is called during init on program start
+func webCrawlerInit() {
+	store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), config.IndexBatchSize)
+	if err != nil {
+		printErr("Failed to initialize visited store: %v", err)
+	}
+	visitedStore = store
+
+	// Start the periodic crawler
+	go func() {
+		// First run immediately
+		runCrawlerAndIndexer()
+
+		// Then run periodically
+		ticker := time.NewTicker(config.CrawlingInterval)
+		for range ticker.C {
+			runCrawlerAndIndexer()
+		}
+	}()
+}
+
+// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
+func runCrawlerAndIndexer() {
+	// 1. Read domains.csv
+	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
+	if err != nil {
+		printErr("Error reading domains.csv: %v", err)
+		return
+	}
+
+	// 2. Crawl each domain and write results to data_to_index.txt
+	if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil {
+		printErr("Error crawling domains: %v", err)
+		return
+	}
+
+	// After finishing crawling, flush any pending visited-urls
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("Failed to flush visitedStore: %v", err)
+		}
+	}
+
+	// 3. Re-index data_to_index.txt based on IndexRefreshInterval
+	//startPeriodicIndexing(outFile, config.IndexRefreshInterval)
+
+	printDebug("Crawl + index refresh completed.")
+}
+
+// readDomainsCSV returns a slice of (rank,domain) from a local CSV file
+func readDomainsCSV(csvPath string) ([][2]string, error) {
+	f, err := os.Open(csvPath)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var result [][2]string
+	scanner := bufio.NewScanner(f)
+	// Skip header line
+	scanner.Scan()
+
+	for scanner.Scan() {
+		line := scanner.Text()
+		// Split by commas, not tabs
+		fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)
+		if len(fields) < 2 {
+			printDebug("Skipping malformed line: %s", line)
+			continue
+		}
+		// Remove quotes around fields, if present
+		rank := strings.Trim(fields[0], `"`)
+		domain := strings.Trim(fields[1], `"`)
+		result = append(result, [2]string{rank, domain})
+	}
+	return result, scanner.Err()
+}
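+
+// Illustrative input for readDomainsCSV (shape assumed from the parser and
+// the domcop download in get-domains-csv.go; column names are not verified):
+//
+//	"Rank","Domain","Open Page Rank"
+//	"1","example.com","10.00"
+//	"2","example.org","9.98"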
+
+// crawlDomainsToFile does an async pipeline:
+// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
+// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
+//
+// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...).
+func crawlDomainsToFile(domains [][2]string, maxPages int) error {
+	var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.)
+
+	// Prepare channels
+	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
+	chromeCh := make(chan [2]string, 1000)
+
+	// 1) Spawn standard workers
+	var wgStandard sync.WaitGroup
+	for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
+		wgStandard.Add(1)
+		go func() {
+			defer wgStandard.Done()
+
+			for dom := range standardCh {
+				rank := dom[0]
+				domainName := dom[1]
+				if domainName == "" {
+					continue
+				}
+				fullURL := "https://" + domainName
+
+				// Mark visited so we don't re-crawl duplicates
+				mu.Lock()
+				added, err := visitedStore.MarkVisited(fullURL)
+				mu.Unlock()
+
+				if err != nil {
+					printErr("MarkVisited error for %s: %v", fullURL, err)
+					continue
+				}
+				if !added {
+					// Already visited, skip
+					continue
+				}
+
+				// 2. Standard extraction
+				userAgent, _ := GetUserAgent("crawler-std")
+				title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
+
+				// If missing, push to Chrome queue
+				if title == "" || desc == "" {
+					chromeCh <- dom
+					continue
+				}
+
+				// 3. Directly index
+				err = indexDocImmediately(fullURL, title, keywords, desc, rank)
+				if err != nil {
+					printErr("Index error for %s: %v", fullURL, err)
+				}
+			}
+		}()
+	}
+
+	// 2) Spawn chrome workers
+	var wgChrome sync.WaitGroup
+	for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
+		wgChrome.Add(1)
+		go func() {
+			defer wgChrome.Done()
+
+			for dom := range chromeCh {
+				rank := dom[0]
+				domainName := dom[1]
+				if domainName == "" {
+					continue
+				}
+				fullURL := "https://" + domainName
+
+				// 3. Chromedp fallback extraction
+				userAgent, _ := GetUserAgent("crawler-chrome")
+				title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
+				if title == "" || desc == "" {
+					printDebug("Skipping %s: unable to get title/desc data", fullURL) // Logged for every domain that fails both extraction paths
+					continue
+				}
+
+				// 4. Directly index the doc
+				err := indexDocImmediately(fullURL, title, keywords, desc, rank)
+				if err != nil {
+					printErr("Index error for %s: %v", fullURL, err)
+				}
+			}
+		}()
+	}
+
+	// Feed domains into standardCh
+	go func() {
+		for _, dom := range domains {
+			standardCh <- dom
+		}
+		close(standardCh)
+	}()
+
+	// Wait for standard workers to finish, then close chromeCh
+	go func() {
+		wgStandard.Wait()
+		close(chromeCh)
+	}()
+
+	// Wait for chrome workers to finish
+	wgChrome.Wait()
+
+	// Flush visitedStore
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("visitedStore flush error: %v", err)
+		}
+	}
+
+	return nil
+}
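+
+// Pipeline shape, for reference: feeder -> standardCh -> standard workers,
+// which escalate JS-heavy pages to chromeCh -> chrome workers. standardCh is
+// closed once every domain is queued; chromeCh is closed only after all
+// standard workers return, so nothing ever sends on a closed channel.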
diff --git a/files.go b/files.go
index 1755143..d0c1ff1 100755
--- a/files.go
+++ b/files.go
@@ -56,7 +56,7 @@ func handleFileSearch(w http.ResponseWriter, settings UserSettings, query string
"Category": "all",
"Sort": "seed",
"Page": page,
- "HasPrevPage": page > 1,
+ "HasPrevPage": page >= 1,
"HasNextPage": len(combinedResults) > 0,
"LanguageOptions": languageOptions,
"CurrentLang": settings.SearchLanguage,
diff --git a/forums.go b/forums.go
index 973c070..bd57e55 100755
--- a/forums.go
+++ b/forums.go
@@ -118,7 +118,8 @@ func handleForumsSearch(w http.ResponseWriter, settings UserSettings, query stri
"Page": page,
"Fetched": fmt.Sprintf("%.2f %s", elapsedTime.Seconds(), Translate("seconds")), // Time for fetching results
"HasPrevPage": page > 1,
- "HasNextPage": len(results) == 25, // Assuming 25 results per page
+ "HasNextPage": len(results) >= 25,
+ "NoResults": len(results) == 0,
"LanguageOptions": languageOptions,
"CurrentLang": settings.SearchLanguage,
"Theme": settings.Theme,
diff --git a/get-domains-csv.go b/get-domains-csv.go
new file mode 100644
index 0000000..8d931f9
--- /dev/null
+++ b/get-domains-csv.go
@@ -0,0 +1,118 @@
+package main
+
+import (
+	"archive/zip"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+)
+
+func downloadAndSetupDomainsCSV() error {
+	targetFilePath := filepath.Join(config.DriveCache.Path, "domains.csv")
+
+	// Check if domains.csv already exists
+	if _, err := os.Stat(targetFilePath); err == nil {
+		printDebug("domains.csv already exists at %s", targetFilePath)
+		return nil
+	}
+
+	downloadURL := "https://www.domcop.com/files/top/top10milliondomains.csv.zip"
+	zipFilePath := filepath.Join(config.DriveCache.Path, "top10milliondomains.csv.zip")
+
+	// Download the file
+	printDebug("Downloading file from %s", downloadURL)
+	resp, err := http.Get(downloadURL)
+	if err != nil {
+		return fmt.Errorf("failed to download file: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("failed to download file: received status code %d", resp.StatusCode)
+	}
+
+	// Create the zip file locally
+	zipFile, err := os.Create(zipFilePath)
+	if err != nil {
+		return fmt.Errorf("failed to create local zip file: %v", err)
+	}
+	defer zipFile.Close()
+
+	_, err = io.Copy(zipFile, resp.Body)
+	if err != nil {
+		return fmt.Errorf("failed to write downloaded zip file: %v", err)
+	}
+
+	// Unzip the file
+	printDebug("Unzipping file %s", zipFilePath)
+	if err := unzipFile(zipFilePath, config.DriveCache.Path); err != nil {
+		return fmt.Errorf("failed to unzip file: %v", err)
+	}
+
+	// Find the .csv file and rename/move it to domains.csv
+	csvFound := false
+	dirEntries, err := os.ReadDir(config.DriveCache.Path)
+	if err != nil {
+		return fmt.Errorf("failed to read directory: %v", err)
+	}
+
+	for _, entry := range dirEntries {
+		if !entry.IsDir() && filepath.Ext(entry.Name()) == ".csv" {
+			csvPath := filepath.Join(config.DriveCache.Path, entry.Name())
+			if err := os.Rename(csvPath, targetFilePath); err != nil {
+				return fmt.Errorf("failed to move %s to %s: %v", csvPath, targetFilePath, err)
+			}
+			csvFound = true
+			break
+		}
+	}
+
+	if !csvFound {
+		return fmt.Errorf("no .csv file found in the downloaded archive")
+	}
+
+	// Clean up zip file
+	if err := os.Remove(zipFilePath); err != nil {
+		printWarn("failed to remove zip file %s: %v", zipFilePath, err)
+	}
+
+	printDebug("domains.csv successfully downloaded and placed at %s", targetFilePath)
+	return nil
+}
+
+func unzipFile(zipFile, destDir string) error {
+	reader, err := zip.OpenReader(zipFile)
+	if err != nil {
+		return err
+	}
+	defer reader.Close()
+
+	for _, file := range reader.File {
+		filePath := filepath.Join(destDir, file.Name)
+
+		if file.FileInfo().IsDir() {
+			os.MkdirAll(filePath, os.ModePerm)
+			continue
+		}
+
+		srcFile, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer srcFile.Close()
+
+		destFile, err := os.Create(filePath)
+		if err != nil {
+			return err
+		}
+		defer destFile.Close()
+
+		if _, err := io.Copy(destFile, srcFile); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
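+
+// Note: unzipFile joins archive entry names directly onto destDir, so the
+// zip from the fixed domcop URL above is treated as trusted input.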
diff --git a/go.mod b/go.mod
index 63599f8..f7d89ad 100644
--- a/go.mod
+++ b/go.mod
@@ -1,9 +1,11 @@
-module searchengine
+module qgato
 
-go 1.18
+go 1.23
+
+toolchain go1.23.4
 
 require (
-	github.com/PuerkitoBio/goquery v1.9.1 // direct
+	github.com/PuerkitoBio/goquery v1.10.0 // direct
 	github.com/chai2010/webp v1.1.1
 	github.com/leonelquinteros/gotext v1.7.0
 	github.com/shirou/gopsutil v3.21.11+incompatible
@@ -12,10 +14,55 @@ require (
 )
 
 require (
-	github.com/andybalholm/cascadia v1.3.2 // indirect
+	github.com/blevesearch/bleve/v2 v2.4.4
+	github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb
+	github.com/chromedp/chromedp v0.11.2
+	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
+	golang.org/x/net v0.33.0
+)
+
+require (
+	github.com/RoaringBitmap/roaring v1.9.4 // indirect
+	github.com/andybalholm/cascadia v1.3.3 // indirect
+	github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
+	github.com/bits-and-blooms/bitset v1.20.0 // indirect
+	github.com/blevesearch/bleve_index_api v1.2.0 // indirect
+	github.com/blevesearch/geo v0.1.20 // indirect
+	github.com/blevesearch/go-faiss v1.0.24 // indirect
+	github.com/blevesearch/go-porterstemmer v1.0.3 // indirect
+	github.com/blevesearch/gtreap v0.1.1 // indirect
+	github.com/blevesearch/mmap-go v1.0.4 // indirect
+	github.com/blevesearch/scorch_segment_api/v2 v2.3.0 // indirect
+	github.com/blevesearch/segment v0.9.1 // indirect
+	github.com/blevesearch/snowballstem v0.9.0 // indirect
+	github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect
+	github.com/blevesearch/vellum v1.1.0 // indirect
+	github.com/blevesearch/zapx/v11 v11.3.10 // indirect
+	github.com/blevesearch/zapx/v12 v12.3.10 // indirect
+	github.com/blevesearch/zapx/v13 v13.3.10 // indirect
+	github.com/blevesearch/zapx/v14 v14.3.10 // indirect
+	github.com/blevesearch/zapx/v15 v15.3.17 // indirect
+	github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
+	github.com/chromedp/sysutil v1.1.0 // indirect
 	github.com/go-ole/go-ole v1.3.0 // indirect
+	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
+	github.com/gobwas/httphead v0.1.0 // indirect
+	github.com/gobwas/pool v0.2.1 // indirect
+	github.com/gobwas/ws v1.4.0 // indirect
+	github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
+	github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect
+	github.com/golang/protobuf v1.5.4 // indirect
+	github.com/golang/snappy v0.0.4 // indirect
+	github.com/josharian/intern v1.0.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/mschoch/smat v0.2.0 // indirect
 	github.com/stretchr/testify v1.9.0 // indirect
 	github.com/yusufpapurcu/wmi v1.2.4 // indirect
-	golang.org/x/net v0.30.0 // indirect
-	golang.org/x/sys v0.26.0 // indirect
+	go.etcd.io/bbolt v1.3.11 // indirect
+	golang.org/x/sys v0.28.0 // indirect
+	golang.org/x/text v0.21.0 // indirect
+	google.golang.org/protobuf v1.36.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 962a1b8..66cede6 100644
--- a/go.sum
+++ b/go.sum
@@ -1,39 +1,154 @@
-github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI=
-github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY=
-github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
-github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
+github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
+github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
+github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ=
+github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
+github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
+github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
+github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
+github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
+github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
+github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU=
+github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
+github.com/blevesearch/bleve/v2 v2.4.4 h1:RwwLGjUm54SwyyykbrZs4vc1qjzYic4ZnAnY9TwNl60=
+github.com/blevesearch/bleve/v2 v2.4.4/go.mod h1:fa2Eo6DP7JR+dMFpQe+WiZXINKSunh7WBtlDGbolKXk=
+github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo=
+github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
+github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
+github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
+github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
+github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
+github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
+github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M=
+github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y=
+github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk=
+github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
+github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw=
+github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc=
+github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
+github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
+github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s=
+github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs=
+github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A=
+github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ=
+github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w=
+github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y=
+github.com/blevesearch/zapx/v11 v11.3.10 h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk=
+github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ=
+github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s=
+github.com/blevesearch/zapx/v12 v12.3.10/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs=
+github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIqMGahDE8=
+github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk=
+github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU=
+github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns=
+github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys=
+github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8=
+github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5YgrzCeK3M1QwvZIpxYhChkXp7/L0RhDYsxXoE=
+github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI=
github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk=
github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU=
+github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb h1:noKVm2SsG4v0Yd0lHNtFYc9EUxIVvrr4kJ6hM8wvIYU=
+github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb/go.mod h1:4XqMl3iIW08jtieURWL6Tt5924w21pxirC6th662XUM=
+github.com/chromedp/chromedp v0.11.2 h1:ZRHTh7DjbNTlfIv3NFTbB7eVeu5XCNkgrpcGSpn2oX0=
+github.com/chromedp/chromedp v0.11.2/go.mod h1:lr8dFRLKsdTTWb75C/Ttol2vnBKOSnt0BW8R9Xaupi8=
+github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM=
+github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
+github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
+github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
+github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI=
+github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w=
+github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
+github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
+github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
+github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
+github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
+github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
+github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
+github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
+github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I=
+github.com/golang/geo v0.0.0-20230421003525-6adc56603217/go.mod h1:8wI0hitZ3a1IxZfeH3/5I97CI8i5cLGsYe7xNhQGs9U=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
+github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
+github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
+github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
+github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI=
github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
+go.etcd.io/bbolt v1.3.11 h1:yGEzV1wPz2yVCLsD8ZAiGHhHVlczyC9d1rP43/VCRJ0=
+go.etcd.io/bbolt v1.3.11/go.mod h1:dksAq7YMXoljX0xu6VF5DMZGbhYYoLUalEiSySYAS4I=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc=
+golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
+golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
+golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/image v0.21.0 h1:c5qV36ajHpdj4Qi0GnE0jUc/yuo33OLFaa0d+crTD5s=
golang.org/x/image v0.21.0/go.mod h1:vUbsLavqK/W303ZroQQVKQ+Af3Yl6Uz1Ppu5J/cLz78=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
-golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
-golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4=
-golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU=
+golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
+golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
+golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
+golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
+golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
+golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
+golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
+golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -42,23 +157,44 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
-golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
+golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
-golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
+golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
+golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU=
+golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
+golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY=
+golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
+golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
+golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58=
+golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/protobuf v1.36.0 h1:mjIs9gYtt56AzC4ZaffQuh88TZurBGhIJMBZGSxNerQ=
+google.golang.org/protobuf v1.36.0/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/indexer.go b/indexer.go
new file mode 100644
index 0000000..c8cf6fe
--- /dev/null
+++ b/indexer.go
@@ -0,0 +1,320 @@
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "sync"
+
+ "github.com/blevesearch/bleve/v2"
+ "golang.org/x/net/publicsuffix"
+)
+
+// Document represents a single document to be indexed.
+type Document struct {
+ ID string `json:"id"`
+ Link string `json:"link"`
+ Title string `json:"title"`
+ Tags string `json:"tags"`
+ Description string `json:"description"`
+ Popularity int64 `json:"popularity"`
+}
+
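+// Serialized with the JSON tags above, a document might look like this
+// (values are illustrative):
+//
+//	{"id":"example.com","link":"https://example.com","title":"Example",
+//	"tags":"demo","description":"An example site","popularity":42}
+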
+var (
+ // Global Bleve index handle
+ bleveIndex bleve.Index
+ docBuffer []Document
+ docBufferMu sync.Mutex
+)
+
+// // startPeriodicIndexing refreshes the index from a file periodically
+// func startPeriodicIndexing(filePath string, interval time.Duration) {
+// go func() {
+// for {
+// printDebug("Refreshing index from %s", filePath)
+// if err := IndexFile(filePath); err != nil {
+// printErr("Failed to refresh index: %v", err)
+// }
+// time.Sleep(interval)
+// }
+// }()
+// }
+
+// indexDocImmediately indexes a single document into the Bleve index.
+func indexDocImmediately(link, title, tags, desc, rank string) error {
+ pop, _ := strconv.ParseInt(rank, 10, 64) // a malformed rank falls back to 0
+ normalized := normalizeDomain(link)
+
+ doc := Document{
+ ID: normalized,
+ Link: link,
+ Title: title,
+ Tags: tags,
+ Description: desc,
+ Popularity: pop,
+ }
+
+ // Insert directly into the Bleve index
+ err := bleveIndex.Index(doc.ID, map[string]interface{}{
+ "title": doc.Title,
+ "description": doc.Description,
+ "link": doc.Link,
+ "tags": doc.Tags,
+ "popularity": doc.Popularity,
+ })
+ if err != nil {
+ return fmt.Errorf("failed to index doc %s: %v", link, err)
+ }
+ return nil
+}
+
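+// A minimal usage sketch for indexDocImmediately (hypothetical values), e.g.
+// from a crawler that wants each result searchable right away:
+//
+//	if err := indexDocImmediately("https://example.com", "Example", "demo", "An example site", "42"); err != nil {
+//		printErr("failed to index result: %v", err)
+//	}
+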
+// // StartBatchIndexing spawns a goroutine that flushes the buffer every interval.
+// func StartBatchIndexing() {
+// go func() {
+// ticker := time.NewTicker(config.IndexRefreshInterval)
+// defer ticker.Stop()
+
+// for range ticker.C {
+// flushDocBuffer()
+// }
+// }()
+// }
+
+func flushDocBuffer() {
+ docBufferMu.Lock()
+ defer docBufferMu.Unlock()
+
+ if len(docBuffer) == 0 {
+ return
+ }
+
+ batch := bleveIndex.NewBatch()
+ for _, doc := range docBuffer {
+ err := batch.Index(doc.ID, map[string]interface{}{
+ "title": doc.Title,
+ "description": doc.Description,
+ "link": doc.Link,
+ "tags": doc.Tags,
+ "popularity": doc.Popularity,
+ })
+ if err != nil {
+ printErr("batch index error for %s: %v", doc.Link, err)
+ }
+ }
+ // Attempt to commit the batch
+ if err := bleveIndex.Batch(batch); err != nil {
+ printErr("error committing batch: %v", err)
+ }
+
+ // Clear the buffer
+ docBuffer = docBuffer[:0]
+}
+
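+// Committing through a bleve batch is the design choice here: a single
+// Batch() call persists many documents in one write, which is cheaper than
+// calling Index() once per document.
+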
+// indexDocBatch queues a single document into memory, which gets flushed by the ticker.
+func indexDocBatch(link, title, tags, desc, rank string) error {
+ pop, _ := strconv.ParseInt(rank, 10, 64) // a malformed rank falls back to 0
+ normalized := normalizeDomain(link)
+
+ doc := Document{
+ ID: normalized,
+ Link: link,
+ Title: title,
+ Tags: tags,
+ Description: desc,
+ Popularity: pop,
+ }
+
+ docBufferMu.Lock()
+ docBuffer = append(docBuffer, doc)
+
+ // Optional: if we exceed config.IndexBatchSize, flush immediately
+ if len(docBuffer) >= config.IndexBatchSize {
+ go func() {
+ // flush in a separate goroutine to avoid blocking
+ flushDocBuffer()
+ }()
+ }
+ docBufferMu.Unlock()
+
+ return nil
+}
+
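+// A usage sketch for indexDocBatch (hypothetical loop and values): queue
+// documents during a crawl and let the size check above trigger flushes:
+//
+//	for _, r := range results { // "results" is illustrative
+//		_ = indexDocBatch(r.Link, r.Title, r.Tags, r.Description, r.Rank)
+//	}
+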
+// InitIndex ensures that the Bleve index is created or opened.
+func InitIndex() error {
+ idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve"))
+ if err == bleve.ErrorIndexPathDoesNotExist {
+ // Index doesn't exist, create a new one
+ mapping := bleve.NewIndexMapping()
+
+ docMapping := bleve.NewDocumentMapping()
+
+ // Text fields
+ titleFieldMapping := bleve.NewTextFieldMapping()
+ titleFieldMapping.Analyzer = "standard"
+ docMapping.AddFieldMappingsAt("title", titleFieldMapping)
+
+ descFieldMapping := bleve.NewTextFieldMapping()
+ descFieldMapping.Analyzer = "standard"
+ docMapping.AddFieldMappingsAt("description", descFieldMapping)
+
+ tagFieldMapping := bleve.NewTextFieldMapping()
+ tagFieldMapping.Analyzer = "standard"
+ docMapping.AddFieldMappingsAt("tags", tagFieldMapping)
+
+ // Numeric field for popularity
+ popularityMapping := bleve.NewNumericFieldMapping()
+ docMapping.AddFieldMappingsAt("popularity", popularityMapping)
+
+ // Documents are indexed as plain maps without a "_type" field, so they
+ // resolve to the default mapping; attach the field mappings there so
+ // they actually apply.
+ mapping.DefaultMapping = docMapping
+
+ idx, err = bleve.New(filepath.Join(config.DriveCache.Path, "index.bleve"), mapping)
+ if err != nil {
+ return fmt.Errorf("failed to create index: %v", err)
+ }
+ } else if err != nil {
+ return fmt.Errorf("failed to open index: %v", err)
+ }
+
+ bleveIndex = idx
+ return nil
+}
+
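+// A startup sketch (error handling illustrative): InitIndex must run before
+// anything touches bleveIndex:
+//
+//	if err := InitIndex(); err != nil {
+//		printErr("failed to initialize index: %v", err)
+//		os.Exit(1)
+//	}
+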
+func normalizeDomain(rawURL string) string {
+ parsed, err := url.Parse(rawURL)
+ if err != nil {
+ return rawURL
+ }
+ domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname())
+ if err != nil {
+ return parsed.Hostname() // fallback
+ }
+ return domain
+}
+
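+// Examples of the normalization (as resolved against the public suffix list):
+//
+//	normalizeDomain("https://blog.example.co.uk/post/1") // "example.co.uk"
+//	normalizeDomain("https://www.example.com/page")      // "example.com"
+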
+// IndexFile reads a file line-by-line and indexes each line as a document.
+func IndexFile(filePath string) error {
+ file, err := os.Open(filePath)
+ if err != nil {
+ return fmt.Errorf("unable to open file for indexing: %v", err)
+ }
+ defer file.Close()
+
+ scanner := bufio.NewScanner(file)
+ batch := bleveIndex.NewBatch()
+
+ // Map to track normalized domains we’ve already indexed
+ indexedDomains := make(map[string]bool)
+
+ for scanner.Scan() {
+ line := scanner.Text()
+
+ // link|title|tags|description|popularity
+ parts := strings.SplitN(line, "|", 5)
+ if len(parts) < 5 {
+ continue
+ }
+
+ // Normalize domain part so duplicates share the same “key”
+ normalized := normalizeDomain(parts[0])
+ popularity, _ := strconv.ParseInt(parts[4], 10, 64)
+
+ if indexedDomains[normalized] {
+ continue
+ }
+
+ doc := Document{
+ ID: normalized,
+ Link: parts[0],
+ Title: parts[1],
+ Tags: parts[2],
+ Description: parts[3],
+ Popularity: popularity,
+ }
+
+ err := batch.Index(doc.ID, map[string]interface{}{
+ "title": doc.Title,
+ "description": doc.Description,
+ "link": doc.Link,
+ "tags": doc.Tags,
+ "popularity": doc.Popularity,
+ })
+ if err != nil {
+ return fmt.Errorf("failed to index document: %v", err)
+ }
+
+ indexedDomains[normalized] = true
+ }
+
+ if err := bleveIndex.Batch(batch); err != nil {
+ return fmt.Errorf("error committing batch: %v", err)
+ }
+
+ if err := scanner.Err(); err != nil {
+ return fmt.Errorf("error reading file: %v", err)
+ }
+
+ printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath)
+ return nil
+}
+
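+// A sample line in the input format IndexFile expects (values illustrative):
+//
+//	https://example.com|Example Title|tag1,tag2|A short description|1234
+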
+// SearchIndex performs a full-text search on the indexed data.
+func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
+ // Check if the indexer is enabled
+ if !config.IndexerEnabled {
+ return nil, fmt.Errorf("indexer is disabled")
+ }
+
+ exactMatch := bleve.NewMatchQuery(queryStr) // analyzed full-text match
+ fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // tolerates typos up to the set fuzziness
+ fuzzyMatch.Fuzziness = 2
+ prefixMatch := bleve.NewPrefixQuery(queryStr) // matches terms starting with the query
+
+ query := bleve.NewDisjunctionQuery(exactMatch, fuzzyMatch, prefixMatch)
+
+ req := bleve.NewSearchRequest(query)
+ req.Fields = []string{"title", "description", "link", "tags", "popularity"}
+
+ // Pagination
+ req.Size = pageSize
+ req.From = (page - 1) * pageSize
+
+ // Sort primarily by relevance (score), then by popularity descending
+ req.SortBy([]string{"-_score", "-popularity"})
+
+ res, err := bleveIndex.Search(req)
+ if err != nil {
+ return nil, fmt.Errorf("search error: %v", err)
+ }
+
+ var docs []Document
+ for _, hit := range res.Hits {
+ title := fmt.Sprintf("%v", hit.Fields["title"])
+ description := fmt.Sprintf("%v", hit.Fields["description"])
+ link := fmt.Sprintf("%v", hit.Fields["link"])
+ tags := fmt.Sprintf("%v", hit.Fields["tags"])
+ popularity := int64(0)
+
+ if pop, ok := hit.Fields["popularity"].(float64); ok {
+ popularity = int64(pop)
+ }
+
+ if link == "