From 047cccd19f7d102772508537a81485e42c4e74c3 Mon Sep 17 00:00:00 2001 From: partisan Date: Sun, 29 Dec 2024 22:54:55 +0100 Subject: [PATCH 1/9] added website crawling and indexing crawled results --- cache-images.go | 42 +++++---- cache.go | 2 +- config.go | 25 +++-- crawler.go | 224 +++++++++++++++++++++++++++++++++++++++++++++ get-domains-csv.go | 118 ++++++++++++++++++++++++ go.mod | 46 ++++++++-- go.sum | 123 ++++++++++++++++++++++--- indexer.go | 198 +++++++++++++++++++++++++++++++++++++++ init.go | 25 +++++ text.go | 113 ++++++++++++----------- 10 files changed, 819 insertions(+), 97 deletions(-) create mode 100644 crawler.go create mode 100644 get-domains-csv.go create mode 100644 indexer.go diff --git a/cache-images.go b/cache-images.go index 16d686e..4e551cd 100644 --- a/cache-images.go +++ b/cache-images.go @@ -24,15 +24,15 @@ import ( ) var ( - cachingImages = make(map[string]*sync.Mutex) - cachingImagesMu sync.Mutex - // cachingSemaphore = make(chan struct{}, 100) // Limit to concurrent downloads + cachingImages = make(map[string]*sync.Mutex) + cachingImagesMu sync.Mutex + cachingSemaphore = make(chan struct{}, 100) invalidImageIDs = make(map[string]struct{}) invalidImageIDsMu sync.Mutex - imageURLMap = make(map[string]string) // mapping from imageID_type to imageURL - imageURLMapMu sync.RWMutex // mutex for thread-safe access + imageURLMap = make(map[string]string) + imageURLMapMu sync.RWMutex ) func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error) { @@ -49,7 +49,13 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error filename = fmt.Sprintf("%s_full.webp", imageID) } - cachedImagePath := filepath.Join(config.DriveCache.Path, filename) + // Make sure we store inside: config.DriveCache.Path / images + imageCacheDir := filepath.Join(config.DriveCache.Path, "images") + if err := os.MkdirAll(imageCacheDir, 0755); err != nil { + return "", false, fmt.Errorf("couldn't create images folder: %v", err) + } + + cachedImagePath := filepath.Join(imageCacheDir, filename) tempImagePath := cachedImagePath + ".tmp" // Check if the image is already cached @@ -73,9 +79,8 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error return cachedImagePath, true, nil } - // // Limit max concurrent downloads - // cachingSemaphore <- struct{}{} // Acquire a token - // defer func() { <-cachingSemaphore }() // Release the token + cachingSemaphore <- struct{}{} + defer func() { <-cachingSemaphore }() // Create a custom http.Client that skips SSL certificate verification client := &http.Client{ @@ -217,7 +222,8 @@ func handleImageServe(w http.ResponseWriter, r *http.Request) { imageType = parts[1] filename := fmt.Sprintf("%s_%s.webp", imageID, imageType) - cachedImagePath := filepath.Join(config.DriveCache.Path, filename) + // Adjust to read from config.DriveCache.Path / images + cachedImagePath := filepath.Join(config.DriveCache.Path, "images", filename) if hasExtension && imageType == "thumb" { // Requesting cached image (thumbnail or full) @@ -329,7 +335,7 @@ func handleImageStatus(w http.ResponseWriter, r *http.Request) { // Check thumbnail first for _, ext := range extensions { thumbFilename := fmt.Sprintf("%s_thumb.%s", id, ext) - thumbPath := filepath.Join(config.DriveCache.Path, thumbFilename) + thumbPath := filepath.Join(config.DriveCache.Path, "images", thumbFilename) if _, err := os.Stat(thumbPath); err == nil { statusMap[id] = fmt.Sprintf("/image/%s_thumb.%s", id, ext) @@ -342,7 +348,7 @@ func 
handleImageStatus(w http.ResponseWriter, r *http.Request) { if !imageReady { for _, ext := range extensions { fullFilename := fmt.Sprintf("%s_full.%s", id, ext) - fullPath := filepath.Join(config.DriveCache.Path, fullFilename) + fullPath := filepath.Join(config.DriveCache.Path, "images", fullFilename) if _, err := os.Stat(fullPath); err == nil { statusMap[id] = fmt.Sprintf("/image/%s_full.%s", id, ext) @@ -447,7 +453,9 @@ func cleanExpiredCachedImages() { } func cleanupCache() { - files, err := os.ReadDir(config.DriveCache.Path) + // Read from: config.DriveCache.Path / images + imageCacheDir := filepath.Join(config.DriveCache.Path, "images") + files, err := os.ReadDir(imageCacheDir) if err != nil { printErr("Failed to read DriveCache directory: %v", err) return @@ -462,19 +470,17 @@ func cleanupCache() { continue } - filePath := filepath.Join(config.DriveCache.Path, file.Name()) + filePath := filepath.Join(imageCacheDir, file.Name()) - // Check for expired files based on modification time if config.DriveCache.Duration > 0 && time.Since(info.ModTime()) > config.DriveCache.Duration { if err := os.Remove(filePath); err == nil { printDebug("Removed expired cache file: %s", filePath) } else { printErr("Failed to remove expired cache file: %s", filePath) } - continue // Skip adding this file to the list + continue } - // Accumulate total size and store file info for potential deletion totalSize += uint64(info.Size()) fileInfos = append(fileInfos, info) } @@ -491,7 +497,7 @@ func cleanupCache() { break } - filePath := filepath.Join(config.DriveCache.Path, info.Name()) + filePath := filepath.Join(imageCacheDir, info.Name()) fileSize := uint64(info.Size()) if err := os.Remove(filePath); err == nil { diff --git a/cache.go b/cache.go index b5ad880..ac2902d 100644 --- a/cache.go +++ b/cache.go @@ -162,7 +162,7 @@ func (rc *ResultsCache) keyToString(key CacheKey) string { // checkAndCleanCache removes items if memory usage exceeds the limit. func (rc *ResultsCache) checkAndCleanCache() { - for rc.currentMemoryUsage() > config.RamCache.MaxUsageBytes { + if rc.currentMemoryUsage() > config.RamCache.MaxUsageBytes { rc.cleanOldestItems() } } diff --git a/config.go b/config.go index c3aec6b..2e5d805 100644 --- a/config.go +++ b/config.go @@ -30,6 +30,7 @@ type Config struct { Domain string // Added NodesEnabled bool // Added CrawlerEnabled bool // Added + IndexerEnabled bool // Added WebsiteEnabled bool // Added RamCacheEnabled bool DriveCacheEnabled bool // Added @@ -46,6 +47,7 @@ var defaultConfig = Config{ AuthCode: generateStrongRandomString(64), NodesEnabled: false, CrawlerEnabled: true, + IndexerEnabled: false, WebsiteEnabled: true, RamCacheEnabled: true, DriveCacheEnabled: false, @@ -105,6 +107,15 @@ func createConfig() error { config.Domain = defaultConfig.Domain } + // printMessage("Use Indexer? 
(YES/no): ") + // indexerChoice, _ := reader.ReadString('\n') + // indexerChoice = strings.TrimSpace(strings.ToLower(indexerChoice)) + // if indexerChoice == "no" { + // config.IndexerEnabled = false + // } else { + // config.IndexerEnabled = true + // } + // Cache settings printMessage("Would you like to configure Cache settings (yes/NO): ") configureCache, _ := reader.ReadString('\n') @@ -181,7 +192,7 @@ func createConfig() error { } else { config.DriveCache.MaxUsageBytes = parseMaxUsageDrive(driveMaxUsage, drivePath) if config.DriveCache.MaxUsageBytes == 0 { - printWarn("Invalid DriveCache max usage, using default (1 TiB).") + printWarn("Invalid DriveCache max usage, using default.") config.DriveCache.MaxUsageBytes = defaultConfig.DriveCache.MaxUsageBytes } } @@ -201,13 +212,6 @@ func createConfig() error { printMessage("Generated connection code: %s\n", config.AuthCode) } - // Set other default values - config.NodesEnabled = defaultConfig.NodesEnabled - config.CrawlerEnabled = defaultConfig.CrawlerEnabled - config.WebsiteEnabled = defaultConfig.WebsiteEnabled - config.LogLevel = defaultConfig.LogLevel - - // Save configuration to file saveConfig(config) printInfo("Configuration saved successfully.") return nil @@ -232,6 +236,7 @@ func saveConfig(config Config) { featuresSec := cfg.Section("Features") featuresSec.Key("Nodes").SetValue(strconv.FormatBool(config.NodesEnabled)) featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled)) + featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled)) featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled)) featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled)) featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled)) @@ -273,6 +278,7 @@ func loadConfig() Config { // Features nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool() crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool() + indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool() websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool() ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool() driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool() @@ -294,10 +300,11 @@ func loadConfig() Config { Port: port, Domain: domain, LogLevel: logLevel, - AuthCode: authCode, // Assign AuthCode here + AuthCode: authCode, Peers: peers, NodesEnabled: nodesEnabled, CrawlerEnabled: crawlerEnabled, + IndexerEnabled: indexerEnabled, WebsiteEnabled: websiteEnabled, RamCacheEnabled: ramCacheEnabled, DriveCacheEnabled: driveCacheEnabled, diff --git a/crawler.go b/crawler.go new file mode 100644 index 0000000..bbe3540 --- /dev/null +++ b/crawler.go @@ -0,0 +1,224 @@ +package main + +import ( + "bufio" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + "time" + + "golang.org/x/net/html" +) + +// webCrawlerInit is called during init on program start +func webCrawlerInit() { + go func() { + // First run immediately + runCrawlerAndIndexer() + + // Then every 24h (adjust as needed) + ticker := time.NewTicker(24 * time.Hour) + for range ticker.C { + runCrawlerAndIndexer() + } + }() +} + +// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes +func runCrawlerAndIndexer() { + // 1. Read domains.csv + domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv")) + if err != nil { + printErr("Error reading domains.csv: %v", err) + return + } + + // 2. 
Crawl each domain and write results to data_to_index.txt + outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") + if err := crawlDomainsToFile(domains, outFile); err != nil { + printErr("Error crawling domains: %v", err) + return + } + + // 3. Re-index data_to_index.txt + if err := IndexFile(outFile); err != nil { + printErr("Error indexing data_to_index.txt: %v", err) + return + } + + printDebug("Crawl + index refresh completed.") +} + +// readDomainsCSV returns a slice of (rank,domain) from a local CSV file +func readDomainsCSV(csvPath string) ([][2]string, error) { + f, err := os.Open(csvPath) + if err != nil { + return nil, err + } + defer f.Close() + + var result [][2]string + scanner := bufio.NewScanner(f) + // Skip header line + scanner.Scan() + + for scanner.Scan() { + line := scanner.Text() + // Split by commas, not tabs + fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity) + if len(fields) < 2 { + printDebug("Skipping malformed line: %s", line) + continue + } + // Remove quotes around fields, if present + rank := strings.Trim(fields[0], `"`) + domain := strings.Trim(fields[1], `"`) + result = append(result, [2]string{rank, domain}) + } + return result, scanner.Err() +} + +// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile +func crawlDomainsToFile(domains [][2]string, outFile string) error { + // Read existing data_to_index.txt into a map to prevent duplicates + existingEntries := make(map[string]bool) + if _, err := os.Stat(outFile); err == nil { // File exists + file, err := os.Open(outFile) + if err != nil { + return fmt.Errorf("unable to open %s: %v", outFile, err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + parts := strings.SplitN(line, "|", 5) + if len(parts) >= 1 { + existingEntries[parts[0]] = true // Mark existing domain + } + } + } + + // Open file for writing (truncate if existing) + file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return fmt.Errorf("unable to open %s for writing: %v", outFile, err) + } + defer file.Close() + + for _, d := range domains { + rank := d[0] + domain := d[1] + if domain == "" || existingEntries["https://"+domain] { + continue + } + + fullURL := "https://" + domain + title, desc, keywords := fetchPageMetadata(fullURL) + if title == "" { + title = "Unknown Title" + } + if desc == "" { + desc = "No Description" + } + + // Write unique domain to file + line := fmt.Sprintf("%s|%s|%s|%s|%s\n", + fullURL, + sanitize(title), + sanitize(keywords), + sanitize(desc), + rank, + ) + if _, err := file.WriteString(line); err != nil { + return err + } + + existingEntries[fullURL] = true + } + + return nil +} + +// fetchPageMetadata does a simple GET and parses , meta[name=description], meta[name=keywords] +func fetchPageMetadata(pageURL string) (string, string, string) { + // Generate a User-Agent using your GetUserAgent function + userAgent, err := GetUserAgent("crawler") + if err != nil { + printWarn("Failed to generate User-Agent: %v", err) + return "", "", "" + } + + client := &http.Client{Timeout: 15 * time.Second} + req, err := http.NewRequest("GET", pageURL, nil) + if err != nil { + printWarn("Failed to create request for %s: %v", pageURL, err) + return "", "", "" + } + + // Set the dynamically generated User-Agent + req.Header.Set("User-Agent", userAgent) + + resp, err := client.Do(req) + if err != nil { + printWarn("Failed to GET %s: 
%v", pageURL, err) + return "", "", "" + } + defer resp.Body.Close() + + // Handle non-200 responses + if resp.StatusCode == 403 || resp.StatusCode == 401 { + printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode) + return "", "", "" + } else if resp.StatusCode < 200 || resp.StatusCode >= 300 { + printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode) + return "", "", "" + } + + // Parse HTML + doc, err := html.Parse(resp.Body) + if err != nil { + printWarn("HTML parse error for %s: %v", pageURL, err) + return "", "", "" + } + + var title, desc, keywords string + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil { + title = n.FirstChild.Data + } + if n.Type == html.ElementNode && n.Data == "meta" { + var nameVal, contentVal string + for _, attr := range n.Attr { + switch strings.ToLower(attr.Key) { + case "name": + nameVal = strings.ToLower(attr.Val) + case "content": + contentVal = attr.Val + } + } + if nameVal == "description" { + desc = contentVal + } else if nameVal == "keywords" { + keywords = contentVal + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + f(doc) + + return title, desc, keywords +} + +// sanitize is a quick helper to remove newlines/pipes from fields +func sanitize(input string) string { + input = strings.ReplaceAll(input, "|", " ") + input = strings.ReplaceAll(input, "\n", " ") + input = strings.TrimSpace(input) + return input +} diff --git a/get-domains-csv.go b/get-domains-csv.go new file mode 100644 index 0000000..8d931f9 --- /dev/null +++ b/get-domains-csv.go @@ -0,0 +1,118 @@ +package main + +import ( + "archive/zip" + "fmt" + "io" + "net/http" + "os" + "path/filepath" +) + +func downloadAndSetupDomainsCSV() error { + targetFilePath := filepath.Join(config.DriveCache.Path, "domains.csv") + + // Check if domains.csv already exists + if _, err := os.Stat(targetFilePath); err == nil { + printDebug("domains.csv already exists at %s", targetFilePath) + return nil + } + + downloadURL := "https://www.domcop.com/files/top/top10milliondomains.csv.zip" + zipFilePath := filepath.Join(config.DriveCache.Path, "top10milliondomains.csv.zip") + + // Download the file + printDebug("Downloading file from %s", downloadURL) + resp, err := http.Get(downloadURL) + if err != nil { + return fmt.Errorf("failed to download file: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to download file: received status code %d", resp.StatusCode) + } + + // Create the zip file locally + zipFile, err := os.Create(zipFilePath) + if err != nil { + return fmt.Errorf("failed to create local zip file: %v", err) + } + defer zipFile.Close() + + _, err = io.Copy(zipFile, resp.Body) + if err != nil { + return fmt.Errorf("failed to write downloaded zip file: %v", err) + } + + // Unzip the file + printDebug("Unzipping file %s", zipFilePath) + if err := unzipFile(zipFilePath, config.DriveCache.Path); err != nil { + return fmt.Errorf("failed to unzip file: %v", err) + } + + // Find the .csv file and rename/move it to domains.csv + csvFound := false + dirEntries, err := os.ReadDir(config.DriveCache.Path) + if err != nil { + return fmt.Errorf("failed to read directory: %v", err) + } + + for _, entry := range dirEntries { + if !entry.IsDir() && filepath.Ext(entry.Name()) == ".csv" { + csvPath := filepath.Join(config.DriveCache.Path, entry.Name()) + if err := os.Rename(csvPath, targetFilePath); err != nil { + return fmt.Errorf("failed to move %s 
to %s: %v", csvPath, targetFilePath, err) + } + csvFound = true + break + } + } + + if !csvFound { + return fmt.Errorf("no .csv file found in the downloaded archive") + } + + // Clean up zip file + if err := os.Remove(zipFilePath); err != nil { + printWarn("failed to remove zip file %s: %v", zipFilePath, err) + } + + printDebug("domains.csv successfully downloaded and placed at %s", targetFilePath) + return nil +} + +func unzipFile(zipFile, destDir string) error { + reader, err := zip.OpenReader(zipFile) + if err != nil { + return err + } + defer reader.Close() + + for _, file := range reader.File { + filePath := filepath.Join(destDir, file.Name) + + if file.FileInfo().IsDir() { + os.MkdirAll(filePath, os.ModePerm) + continue + } + + srcFile, err := file.Open() + if err != nil { + return err + } + defer srcFile.Close() + + destFile, err := os.Create(filePath) + if err != nil { + return err + } + defer destFile.Close() + + if _, err := io.Copy(destFile, srcFile); err != nil { + return err + } + } + + return nil +} diff --git a/go.mod b/go.mod index 63599f8..6895586 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,11 @@ -module searchengine +module qgato -go 1.18 +go 1.23 + +toolchain go1.23.4 require ( - github.com/PuerkitoBio/goquery v1.9.1 // direct + github.com/PuerkitoBio/goquery v1.10.0 // direct github.com/chai2010/webp v1.1.1 github.com/leonelquinteros/gotext v1.7.0 github.com/shirou/gopsutil v3.21.11+incompatible @@ -12,10 +14,42 @@ require ( ) require ( - github.com/andybalholm/cascadia v1.3.2 // indirect + github.com/blevesearch/bleve/v2 v2.4.4 + golang.org/x/net v0.33.0 +) + +require ( + github.com/RoaringBitmap/roaring v1.9.4 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/bits-and-blooms/bitset v1.20.0 // indirect + github.com/blevesearch/bleve_index_api v1.2.0 // indirect + github.com/blevesearch/geo v0.1.20 // indirect + github.com/blevesearch/go-faiss v1.0.24 // indirect + github.com/blevesearch/go-porterstemmer v1.0.3 // indirect + github.com/blevesearch/gtreap v0.1.1 // indirect + github.com/blevesearch/mmap-go v1.0.4 // indirect + github.com/blevesearch/scorch_segment_api/v2 v2.3.0 // indirect + github.com/blevesearch/segment v0.9.1 // indirect + github.com/blevesearch/snowballstem v0.9.0 // indirect + github.com/blevesearch/upsidedown_store_api v1.0.2 // indirect + github.com/blevesearch/vellum v1.1.0 // indirect + github.com/blevesearch/zapx/v11 v11.3.10 // indirect + github.com/blevesearch/zapx/v12 v12.3.10 // indirect + github.com/blevesearch/zapx/v13 v13.3.10 // indirect + github.com/blevesearch/zapx/v14 v14.3.10 // indirect + github.com/blevesearch/zapx/v15 v15.3.17 // indirect + github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect github.com/go-ole/go-ole v1.3.0 // indirect + github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/golang/snappy v0.0.4 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/mschoch/smat v0.2.0 // indirect github.com/stretchr/testify v1.9.0 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect - golang.org/x/net v0.30.0 // indirect - golang.org/x/sys v0.26.0 // indirect + go.etcd.io/bbolt v1.3.11 // indirect + golang.org/x/sys v0.28.0 // indirect + google.golang.org/protobuf v1.36.0 // indirect ) diff --git a/go.sum b/go.sum index 962a1b8..f3f643b 100644 
--- a/go.sum +++ b/go.sum @@ -1,39 +1,121 @@ -github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= -github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= -github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= -github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= +github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4= +github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4= +github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv2QzDdQ= +github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU= +github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/blevesearch/bleve/v2 v2.4.4 h1:RwwLGjUm54SwyyykbrZs4vc1qjzYic4ZnAnY9TwNl60= +github.com/blevesearch/bleve/v2 v2.4.4/go.mod h1:fa2Eo6DP7JR+dMFpQe+WiZXINKSunh7WBtlDGbolKXk= +github.com/blevesearch/bleve_index_api v1.2.0 h1:/DXMMWBwx/UmGKM1xDhTwDoJI5yQrG6rqRWPFcOgUVo= +github.com/blevesearch/bleve_index_api v1.2.0/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8= +github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM= +github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w= +github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI= +github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk= +github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo= +github.com/blevesearch/go-porterstemmer v1.0.3/go.mod h1:angGc5Ht+k2xhJdZi511LtmxuEf0OVpvUUNrwmM1P7M= +github.com/blevesearch/gtreap v0.1.1 h1:2JWigFrzDMR+42WGIN/V2p0cUvn4UP3C4Q5nmaZGW8Y= +github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgYICSZ3w0tYk= +github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc= +github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs= +github.com/blevesearch/scorch_segment_api/v2 v2.3.0 h1:vxCjbXAkkEBSb4AB3Iqgr/EJcPyYRsiGxpcvsS8E1Dw= +github.com/blevesearch/scorch_segment_api/v2 v2.3.0/go.mod h1:5y+TgXYSx+xJGaCwSlvy9G/UJBIY5wzvIkhvhBm2ATc= +github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU= +github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw= +github.com/blevesearch/snowballstem v0.9.0 h1:lMQ189YspGP6sXvZQ4WZ+MLawfV8wOmPoD/iWeNXm8s= +github.com/blevesearch/snowballstem v0.9.0/go.mod h1:PivSj3JMc8WuaFkTSRDW2SlrulNWPl4ABg1tC/hlgLs= +github.com/blevesearch/upsidedown_store_api v1.0.2 h1:U53Q6YoWEARVLd1OYNc9kvhBMGZzVrdmaozG2MfoB+A= +github.com/blevesearch/upsidedown_store_api v1.0.2/go.mod h1:M01mh3Gpfy56Ps/UXHjEO/knbqyQ1Oamg8If49gRwrQ= +github.com/blevesearch/vellum v1.1.0 h1:CinkGyIsgVlYf8Y2LUQHvdelgXr6PYuvoDIajq6yR9w= +github.com/blevesearch/vellum v1.1.0/go.mod h1:QgwWryE8ThtNPxtgWJof5ndPfx0/YMBh+W2weHKPw8Y= +github.com/blevesearch/zapx/v11 v11.3.10 
h1:hvjgj9tZ9DeIqBCxKhi70TtSZYMdcFn7gDb71Xo/fvk= +github.com/blevesearch/zapx/v11 v11.3.10/go.mod h1:0+gW+FaE48fNxoVtMY5ugtNHHof/PxCqh7CnhYdnMzQ= +github.com/blevesearch/zapx/v12 v12.3.10 h1:yHfj3vXLSYmmsBleJFROXuO08mS3L1qDCdDK81jDl8s= +github.com/blevesearch/zapx/v12 v12.3.10/go.mod h1:0yeZg6JhaGxITlsS5co73aqPtM04+ycnI6D1v0mhbCs= +github.com/blevesearch/zapx/v13 v13.3.10 h1:0KY9tuxg06rXxOZHg3DwPJBjniSlqEgVpxIqMGahDE8= +github.com/blevesearch/zapx/v13 v13.3.10/go.mod h1:w2wjSDQ/WBVeEIvP0fvMJZAzDwqwIEzVPnCPrz93yAk= +github.com/blevesearch/zapx/v14 v14.3.10 h1:SG6xlsL+W6YjhX5N3aEiL/2tcWh3DO75Bnz77pSwwKU= +github.com/blevesearch/zapx/v14 v14.3.10/go.mod h1:qqyuR0u230jN1yMmE4FIAuCxmahRQEOehF78m6oTgns= +github.com/blevesearch/zapx/v15 v15.3.17 h1:NkkMI98pYLq/uHnB6YWcITrrLpCVyvZ9iP+AyfpW1Ys= +github.com/blevesearch/zapx/v15 v15.3.17/go.mod h1:vXRQzJJvlGVCdmOD5hg7t7JdjUT5DmDPhsAfjvtzIq8= +github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5YgrzCeK3M1QwvZIpxYhChkXp7/L0RhDYsxXoE= +github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI= github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk= github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I= +github.com/golang/geo v0.0.0-20230421003525-6adc56603217/go.mod h1:8wI0hitZ3a1IxZfeH3/5I97CI8i5cLGsYe7xNhQGs9U= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= 
+github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= +github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= +go.etcd.io/bbolt v1.3.11 h1:yGEzV1wPz2yVCLsD8ZAiGHhHVlczyC9d1rP43/VCRJ0= +go.etcd.io/bbolt v1.3.11/go.mod h1:dksAq7YMXoljX0xu6VF5DMZGbhYYoLUalEiSySYAS4I= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/image v0.21.0 h1:c5qV36ajHpdj4Qi0GnE0jUc/yuo33OLFaa0d+crTD5s= golang.org/x/image v0.21.0/go.mod h1:vUbsLavqK/W303ZroQQVKQ+Af3Yl6Uz1Ppu5J/cLz78= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= -golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= -golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0 
h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -42,23 +124,42 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= -golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text 
v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.36.0 h1:mjIs9gYtt56AzC4ZaffQuh88TZurBGhIJMBZGSxNerQ= +google.golang.org/protobuf v1.36.0/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/indexer.go b/indexer.go new file mode 100644 index 0000000..66bc100 --- /dev/null +++ b/indexer.go @@ -0,0 +1,198 @@ +package main + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/blevesearch/bleve/v2" +) + +// Document represents a single document to be indexed. +// You can add more fields if needed. +type Document struct { + ID string `json:"id"` + Link string `json:"link"` + Title string `json:"title"` + Tags string `json:"tags"` + Description string `json:"description"` + Popularity int64 `json:"popularity"` +} + +var ( + // Global Bleve index handle + bleveIndex bleve.Index +) + +func startPeriodicIndexing(filePath string, interval time.Duration) { + go func() { + for { + printDebug("Refreshing index from %s", filePath) + err := IndexFile(filePath) + if err != nil { + printErr("Failed to refresh index: %v", err) + } + time.Sleep(interval) + } + }() +} + +// InitIndex ensures that the Bleve index is created or opened. 
+func InitIndex() error { + idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve")) + if err == bleve.ErrorIndexPathDoesNotExist { + // Index doesn't exist, create a new one + mapping := bleve.NewIndexMapping() + + // Custom mapping for the document + docMapping := bleve.NewDocumentMapping() + + // Text fields with custom analyzers for better tokenization + textFieldMapping := bleve.NewTextFieldMapping() + textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches + + docMapping.AddFieldMappingsAt("title", textFieldMapping) + docMapping.AddFieldMappingsAt("description", textFieldMapping) + docMapping.AddFieldMappingsAt("tags", textFieldMapping) + + // Numeric field for popularity + popularityMapping := bleve.NewNumericFieldMapping() + docMapping.AddFieldMappingsAt("popularity", popularityMapping) + + mapping.AddDocumentMapping("Document", docMapping) + + idx, err = bleve.New(filepath.Join(config.DriveCache.Path, "index.bleve"), mapping) + if err != nil { + return fmt.Errorf("failed to create index: %v", err) + } + } else if err != nil { + return fmt.Errorf("failed to open index: %v", err) + } + + bleveIndex = idx + return nil +} + +// IndexFile reads a file line-by-line and indexes each line as a document. +// Each line represents a simple document. Adjust parsing as needed. +func IndexFile(filePath string) error { + file, err := os.Open(filePath) + if err != nil { + return fmt.Errorf("unable to open file for indexing: %v", err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + batch := bleveIndex.NewBatch() + indexedDomains := make(map[string]bool) // Track indexed domains + + for scanner.Scan() { + line := scanner.Text() + + // Split the line into 5 fields: link|title|tags|description|popularity + parts := strings.SplitN(line, "|", 5) + if len(parts) < 5 { + continue // Skip malformed lines + } + + domain := parts[0] + popularity, _ := strconv.ParseInt(parts[4], 10, 64) + + // Skip if the domain is already indexed + if indexedDomains[domain] { + continue + } + + doc := Document{ + ID: domain, // Use the domain as the unique ID + Link: parts[0], + Title: parts[1], + Tags: parts[2], + Description: parts[3], + Popularity: popularity, + } + + err := batch.Index(doc.ID, map[string]interface{}{ + "title": doc.Title, + "description": doc.Description, + "link": doc.Link, + "tags": doc.Tags, + "popularity": doc.Popularity, + }) + if err != nil { + return fmt.Errorf("failed to index document: %v", err) + } + + indexedDomains[domain] = true // Mark the domain as indexed + } + + // Commit the batch + if err := bleveIndex.Batch(batch); err != nil { + return fmt.Errorf("error committing batch: %v", err) + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("error reading file: %v", err) + } + + printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath) + return nil +} + +// SearchIndex performs a full-text search on the indexed data. 
+func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { + // Create compound query + exactMatch := bleve.NewMatchQuery(queryStr) // Exact match + fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match + fuzzyMatch.Fuzziness = 2 + prefixMatch := bleve.NewPrefixQuery(queryStr) // Prefix match + + query := bleve.NewDisjunctionQuery(exactMatch, fuzzyMatch, prefixMatch) + + req := bleve.NewSearchRequest(query) + req.Fields = []string{"title", "description", "link", "tags", "popularity"} + + // Pagination + req.Size = pageSize + req.From = (page - 1) * pageSize + + // Sort by popularity + req.SortBy([]string{"popularity"}) + + res, err := bleveIndex.Search(req) + if err != nil { + return nil, fmt.Errorf("search error: %v", err) + } + + var docs []Document + for _, hit := range res.Hits { + title := fmt.Sprintf("%v", hit.Fields["title"]) + description := fmt.Sprintf("%v", hit.Fields["description"]) + link := fmt.Sprintf("%v", hit.Fields["link"]) + tags := fmt.Sprintf("%v", hit.Fields["tags"]) + popularity := int64(0) + + if pop, ok := hit.Fields["popularity"].(float64); ok { + popularity = int64(pop) + } + + if link == "<nil>" || title == "<nil>" { + continue + } + + docs = append(docs, Document{ + ID: hit.ID, + Title: title, + Description: description, + Link: link, + Tags: tags, + Popularity: popularity, + }) + } + + return docs, nil +} diff --git a/init.go b/init.go index e7d4ed1..c92e656 100644 --- a/init.go +++ b/init.go @@ -3,6 +3,8 @@ package main import ( "flag" "os" + "path/filepath" + "time" ) var config Config @@ -95,5 +97,28 @@ func main() { printInfo("RAM cache is disabled.") } + // Init indexer + if config.IndexerEnabled { + if err := downloadAndSetupDomainsCSV(); err != nil { + printErr("Failed to set up domains.csv: %v", err) + return + } + + webCrawlerInit() + + err := InitIndex() + if err != nil { + printErr("Failed to initialize index:", err) + } + + // Start periodic indexing (every 2 minutes) + dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt") + startPeriodicIndexing(dataFilePath, 2*time.Minute) + + printInfo("Indexer is enabled.") + } else { + printInfo("Indexer is disabled.") + } + runServer() } diff --git a/text.go b/text.go index 4744a97..d6e3212 100755 --- a/text.go +++ b/text.go @@ -73,14 +73,10 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, select { case results := <-cacheChan: if results == nil { - // Fetch only if the cache miss occurs and Crawler is enabled - if config.CrawlerEnabled { - combinedResults = fetchTextResults(query, safe, lang, page) - if len(combinedResults) > 0 { - resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) - } - } else { - printInfo("Crawler disabled; skipping fetching.") + // Always attempt to fetch results on a cache miss + combinedResults = fetchTextResults(query, safe, lang, page) + if len(combinedResults) > 0 { + resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) } } else { textResults, _, _, _ := convertToSpecificResults(results) @@ -88,13 +84,10 @@ func getTextResultsFromCacheOrFetch(cacheKey CacheKey, query, safe, lang string, } case <-time.After(2 * time.Second): printInfo("Cache check timeout") - if config.CrawlerEnabled { - combinedResults = fetchTextResults(query, safe, lang, page) - if len(combinedResults) > 0 { - resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) - } - } else { - printInfo("Crawler disabled; skipping fetching.") + // Even on timeout, attempt to fetch results + combinedResults 
= fetchTextResults(query, safe, lang, page) + if len(combinedResults) > 0 { + resultsCache.Set(cacheKey, convertToSearchResults(combinedResults)) } } @@ -121,54 +114,70 @@ func prefetchPage(query, safe, lang string, page int) { func fetchTextResults(query, safe, lang string, page int) []TextSearchResult { var results []TextSearchResult - // If Crawler is disabled, do not fetch from search engines if !config.CrawlerEnabled { - printDebug("Crawler is disabled; skipping search engine fetching.") - return results // Return an empty list - } + printDebug("Crawler is disabled; fetching from local index.") - engineCount := len(textSearchEngines) + // Calculate the starting position based on the page number + indexedResults, err := SearchIndex(query, page, 10) + if err != nil { + printErr("Error searching the index: %v", err) + return results // Return empty results on error + } - // Determine which engine to use for the current page - engineIndex := (page - 1) % engineCount - engine := textSearchEngines[engineIndex] + // Convert indexed results to TextSearchResult format + for _, doc := range indexedResults { + results = append(results, TextSearchResult{ + URL: doc.Link, + Header: doc.Title, + Description: doc.Description, + Source: doc.Tags, + }) + } - // Calculate the page number for this engine - enginePage := (page-1)/engineCount + 1 - - // Debug print to verify engine and page number being fetched - printDebug("Fetching results for overall page %d using engine: %s (engine page %d)", page, engine.Name, enginePage) - - // Fetch results from the selected engine - searchResults, _, err := engine.Func(query, safe, lang, enginePage) - if err != nil { - printWarn("Error performing search with %s: %v", engine.Name, err) + return results } else { - results = append(results, validateResults(searchResults)...) - } + // Crawler is enabled, so use the search engines + engineCount := len(textSearchEngines) - // If no results are found with the selected engine, try the next in line - if len(results) == 0 { - for i := 1; i < engineCount; i++ { - nextEngine := textSearchEngines[(engineIndex+i)%engineCount] - enginePage = (page-1)/engineCount + 1 // Recalculate for the new engine - printInfo("No results found, trying next engine: %s (engine page %d)", nextEngine.Name, enginePage) + // Determine which engine to use for the current page + engineIndex := (page - 1) % engineCount + engine := textSearchEngines[engineIndex] - searchResults, _, err := nextEngine.Func(query, safe, lang, enginePage) - if err != nil { - printWarn("Error performing search with %s: %v", nextEngine.Name, err) - continue - } + // Calculate the page number for this engine + enginePage := (page-1)/engineCount + 1 + + printDebug("Fetching results for overall page %d using engine: %s (engine page %d)", page, engine.Name, enginePage) + + // Fetch results from the selected engine + searchResults, _, err := engine.Func(query, safe, lang, enginePage) + if err != nil { + printWarn("Error performing search with %s: %v", engine.Name, err) + } else { results = append(results, validateResults(searchResults)...) 
-			if len(results) > 0 {
-				break
+		}
+
+		// If no results are found with the selected engine, try the next in line
+		if len(results) == 0 {
+			for i := 1; i < engineCount; i++ {
+				nextEngine := textSearchEngines[(engineIndex+i)%engineCount]
+				enginePage = (page-1)/engineCount + 1
+				printInfo("No results found, trying next engine: %s (engine page %d)", nextEngine.Name, enginePage)
+
+				searchResults, _, err := nextEngine.Func(query, safe, lang, enginePage)
+				if err != nil {
+					printWarn("Error performing search with %s: %v", nextEngine.Name, err)
+					continue
+				}
+				results = append(results, validateResults(searchResults)...)
+				if len(results) > 0 {
+					break
+				}
 			}
 		}
+
+		printInfo("Fetched %d results for overall page %d", len(results), page)
+		return results
 	}
-
-	printInfo("Fetched %d results for overall page %d", len(results), page)
-
-	return results
 }
 
 func validateResults(searchResults []SearchResult) []TextSearchResult {

From 13e1d6119b2260a762237c1db7a6f528d484bd73 Mon Sep 17 00:00:00 2001
From: partisan <none@noone.no>
Date: Mon, 30 Dec 2024 17:19:20 +0100
Subject: [PATCH 2/9] added more config values for indexing + fixed value handling when it's missing in config file

---
 config.go  | 157 +++++++++++++++++++++++++++++++++++------------
 crawler.go |  84 +++++++++++++++-------------
 indexer.go |   6 +-
 3 files changed, 153 insertions(+), 94 deletions(-)

diff --git a/config.go b/config.go
index 2e5d805..4ea4eb2 100644
--- a/config.go
+++ b/config.go
@@ -23,35 +23,43 @@ type CacheConfig struct {
 }
 
 type Config struct {
-	Port              int    // Added
-	AuthCode          string // Added
-	PeerID            string // Added
-	Peers             []string
-	Domain            string // Added
-	NodesEnabled      bool   // Added
-	CrawlerEnabled    bool   // Added
-	IndexerEnabled    bool   // Added
-	WebsiteEnabled    bool   // Added
-	RamCacheEnabled   bool
-	DriveCacheEnabled bool // Added
-	LogLevel          int  // Added
+	Port                 int           // Added
+	AuthCode             string        // Added
+	PeerID               string        // Added
+	Peers                []string
+	Domain               string        // Added
+	NodesEnabled         bool          // Added
+	CrawlerEnabled       bool          // Added
+	IndexerEnabled       bool          // Added
+	WebsiteEnabled       bool          // Added
+	RamCacheEnabled      bool
+	DriveCacheEnabled    bool          // Added
+	LogLevel             int           // Added
+	ConcurrentCrawlers   int           // Number of concurrent crawlers
+	CrawlingInterval     time.Duration // Refresh crawled results in...
+ MaxPagesPerDomain int // Max pages to crawl per domain + IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") DriveCache CacheConfig RamCache CacheConfig } var defaultConfig = Config{ - Port: 5000, - Domain: "localhost", - Peers: []string{}, - AuthCode: generateStrongRandomString(64), - NodesEnabled: false, - CrawlerEnabled: true, - IndexerEnabled: false, - WebsiteEnabled: true, - RamCacheEnabled: true, - DriveCacheEnabled: false, - LogLevel: 1, + Port: 5000, + Domain: "localhost", + Peers: []string{}, + AuthCode: generateStrongRandomString(64), + NodesEnabled: false, + CrawlerEnabled: true, + IndexerEnabled: false, + WebsiteEnabled: true, + RamCacheEnabled: true, + DriveCacheEnabled: false, + ConcurrentCrawlers: 5, + CrawlingInterval: 24 * time.Hour, + MaxPagesPerDomain: 10, + IndexRefreshInterval: 2 * time.Minute, + LogLevel: 1, DriveCache: CacheConfig{ Duration: 48 * time.Hour, // Added Path: "./cache", // Added @@ -238,8 +246,13 @@ func saveConfig(config Config) { featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled)) featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled)) featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled)) - featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled)) - featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled)) + + // Indexer section + indexerSec := cfg.Section("Indexer") + indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers)) + indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String()) + indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain)) + indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String()) // DriveCache section driveSec := cfg.Section("DriveCache") @@ -266,53 +279,61 @@ func loadConfig() Config { } // Server - port, _ := cfg.Section("Server").Key("Port").Int() - domain := cfg.Section("Server").Key("Domain").String() - logLevel, _ := cfg.Section("Server").Key("LogLevel").Int() + port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi) + domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain) + logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi) // Peers - authCode := cfg.Section("Peers").Key("AuthCode").String() - peersStr := cfg.Section("Peers").Key("Peers").String() - peers := strings.Split(peersStr, ",") + authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode) + peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",") // Features - nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool() - crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool() - indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool() - websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool() - ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool() - driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool() + nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled) + crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled) + indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled) + websiteEnabled := 
getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled) + ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled) + driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled) + + // Indexing + concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi) + crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration) + maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi) + indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration) // DriveCache - driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String()) - drivePath := cfg.Section("DriveCache").Key("Path").String() - driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath) + driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration) + drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path) + driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath) // maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int() // if maxConcurrentDownloads == 0 { // maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads // } // RamCache - ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String()) - ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String()) + ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration) + ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes))) return Config{ - Port: port, - Domain: domain, - LogLevel: logLevel, - AuthCode: authCode, - Peers: peers, - NodesEnabled: nodesEnabled, - CrawlerEnabled: crawlerEnabled, - IndexerEnabled: indexerEnabled, - WebsiteEnabled: websiteEnabled, - RamCacheEnabled: ramCacheEnabled, - DriveCacheEnabled: driveCacheEnabled, + Port: port, + Domain: domain, + LogLevel: logLevel, + AuthCode: authCode, + Peers: peers, + NodesEnabled: nodesEnabled, + CrawlerEnabled: crawlerEnabled, + IndexerEnabled: indexerEnabled, + WebsiteEnabled: websiteEnabled, + RamCacheEnabled: ramCacheEnabled, + DriveCacheEnabled: driveCacheEnabled, + ConcurrentCrawlers: concurrentCrawlers, + CrawlingInterval: crawlingInterval, + MaxPagesPerDomain: maxPagesPerDomain, + IndexRefreshInterval: indexRefreshInterval, DriveCache: CacheConfig{ Duration: driveDuration, MaxUsageBytes: driveMaxUsage, Path: drivePath, - // MaxConcurrentThumbnailDownloads: maxConcurrentDownloads, }, RamCache: CacheConfig{ Duration: ramDuration, @@ -321,6 +342,34 @@ func loadConfig() Config { } } +// getConfigValue retrieves a configuration value or returns a default value from defaultConfig. 
+func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T { + if key == nil || key.String() == "" { + return defaultValue + } + value, err := parseFunc(key.String()) + if err != nil { + return defaultValue + } + return value +} + +// getConfigValueString retrieves a string value or falls back to the default. +func getConfigValueString(key *ini.Key, defaultValue string) string { + if key == nil || key.String() == "" { + return defaultValue + } + return key.String() +} + +// getConfigValueBool retrieves a boolean value or falls back to the default. +func getConfigValueBool(key *ini.Key, defaultValue bool) bool { + if key == nil || key.String() == "" { + return defaultValue + } + return key.MustBool(defaultValue) +} + // Helper to parse MaxUsage string into bytes func parseMaxUsageRam(value string) uint64 { const GiB = 1024 * 1024 * 1024 diff --git a/crawler.go b/crawler.go index bbe3540..fbb5b5e 100644 --- a/crawler.go +++ b/crawler.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "strings" + "sync" "time" "golang.org/x/net/html" @@ -18,8 +19,8 @@ func webCrawlerInit() { // First run immediately runCrawlerAndIndexer() - // Then every 24h (adjust as needed) - ticker := time.NewTicker(24 * time.Hour) + // Then run periodically based on CrawlingInterval + ticker := time.NewTicker(config.CrawlingInterval) for range ticker.C { runCrawlerAndIndexer() } @@ -37,16 +38,13 @@ func runCrawlerAndIndexer() { // 2. Crawl each domain and write results to data_to_index.txt outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") - if err := crawlDomainsToFile(domains, outFile); err != nil { + if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil { printErr("Error crawling domains: %v", err) return } - // 3. Re-index data_to_index.txt - if err := IndexFile(outFile); err != nil { - printErr("Error indexing data_to_index.txt: %v", err) - return - } + // 3. 
Re-index data_to_index.txt periodically based on IndexRefreshInterval + startPeriodicIndexing(outFile, config.IndexRefreshInterval) printDebug("Crawl + index refresh completed.") } @@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) { } // crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile -func crawlDomainsToFile(domains [][2]string, outFile string) error { - // Read existing data_to_index.txt into a map to prevent duplicates +func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error { existingEntries := make(map[string]bool) - if _, err := os.Stat(outFile); err == nil { // File exists + var mu sync.Mutex // Mutex to protect access to the map + + if _, err := os.Stat(outFile); err == nil { file, err := os.Open(outFile) if err != nil { return fmt.Errorf("unable to open %s: %v", outFile, err) @@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error { line := scanner.Text() parts := strings.SplitN(line, "|", 5) if len(parts) >= 1 { - existingEntries[parts[0]] = true // Mark existing domain + existingEntries[parts[0]] = true } } } @@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error { } defer file.Close() + semaphore := make(chan struct{}, concurrentCrawlers) + var wg sync.WaitGroup + for _, d := range domains { - rank := d[0] - domain := d[1] - if domain == "" || existingEntries["https://"+domain] { - continue - } + wg.Add(1) + semaphore <- struct{}{} + go func(domain [2]string) { + defer wg.Done() + defer func() { <-semaphore }() - fullURL := "https://" + domain - title, desc, keywords := fetchPageMetadata(fullURL) - if title == "" { - title = "Unknown Title" - } - if desc == "" { - desc = "No Description" - } + rank := domain[0] + domainName := domain[1] + fullURL := "https://" + domainName - // Write unique domain to file - line := fmt.Sprintf("%s|%s|%s|%s|%s\n", - fullURL, - sanitize(title), - sanitize(keywords), - sanitize(desc), - rank, - ) - if _, err := file.WriteString(line); err != nil { - return err - } + mu.Lock() + if domainName == "" || existingEntries[fullURL] { + mu.Unlock() + return + } + existingEntries[fullURL] = true + mu.Unlock() - existingEntries[fullURL] = true + title, desc, keywords := fetchPageMetadata(fullURL) + if title == "" { + title = "Unknown Title" + } + if desc == "" { + desc = "No Description" + } + + line := fmt.Sprintf("%s|%s|%s|%s|%s\n", + fullURL, + sanitize(title), + sanitize(keywords), + sanitize(desc), + rank, + ) + file.WriteString(line) + }(d) } + wg.Wait() return nil } diff --git a/indexer.go b/indexer.go index 66bc100..7963fc1 100644 --- a/indexer.go +++ b/indexer.go @@ -28,12 +28,12 @@ var ( bleveIndex bleve.Index ) +// startPeriodicIndexing refreshes the index from a file periodically func startPeriodicIndexing(filePath string, interval time.Duration) { go func() { for { printDebug("Refreshing index from %s", filePath) - err := IndexFile(filePath) - if err != nil { + if err := IndexFile(filePath); err != nil { printErr("Failed to refresh index: %v", err) } time.Sleep(interval) @@ -139,7 +139,7 @@ func IndexFile(filePath string) error { return fmt.Errorf("error reading file: %v", err) } - printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath) + printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath) return nil } From a9a6948a44254008b03c7b1fd869c370e7541f36 Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Tue, 31 
Dec 2024 02:44:14 +0100 Subject: [PATCH 3/9] updated indexing & user agent generator --- agent.go | 43 ++++++++++++++++++++++++++------------ indexer.go | 60 ++++++++++++++++++++++++++++++++++-------------------- init.go | 5 +++++ 3 files changed, 73 insertions(+), 35 deletions(-) diff --git a/agent.go b/agent.go index 296b4e4..6333102 100755 --- a/agent.go +++ b/agent.go @@ -3,7 +3,7 @@ package main import ( "encoding/json" "fmt" - "io/ioutil" + "io" "math/rand" "net/http" "sort" @@ -40,13 +40,33 @@ var ( func fetchLatestBrowserVersions() (BrowserData, error) { url := "https://raw.githubusercontent.com/Fyrd/caniuse/master/fulldata-json/data-2.0.json" - resp, err := http.Get(url) + // // Optional: skip TLS verification to avoid certificate errors + // transport := &http.Transport{ + // TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + // } + + // Increase the HTTP client timeout + client := &http.Client{ + Timeout: 30 * time.Second, + // Transport: transport, + } + + // Build the request manually to set headers + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return BrowserData{}, err + } + // Custom user agent and English language preference + req.Header.Set("User-Agent", "MyCustomAgent/1.0 (compatible; +https://example.com)") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + + resp, err := client.Do(req) if err != nil { return BrowserData{}, err } defer resp.Body.Close() - body, err := ioutil.ReadAll(resp.Body) + body, err := io.ReadAll(resp.Body) if err != nil { return BrowserData{}, err } @@ -109,7 +129,7 @@ func randomUserAgent() (string, error) { return "", err } - rand.Seed(time.Now().UnixNano()) + rand := rand.New(rand.NewSource(time.Now().UnixNano())) // Simulated browser usage statistics (in percentages) usageStats := map[string]float64{ @@ -161,6 +181,7 @@ func randomUserAgent() (string, error) { } } + // Fallback to the last version if none matched if version == "" { version = versions[len(versions)-1].Version } @@ -240,11 +261,11 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string { browserType = "Firefox" } - // Get the latest version for the browser type + // Get the latest version for that browser var latestVersion string - if browserType == "Firefox" { + if browserType == "Firefox" && len(newVersions.Firefox) > 0 { latestVersion = newVersions.Firefox[0].Version - } else if browserType == "Chromium" { + } else if browserType == "Chromium" && len(newVersions.Chromium) > 0 { latestVersion = newVersions.Chromium[0].Version } @@ -252,7 +273,7 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string { return generateUserAgent(browserType, latestVersion) } -func periodicUpdate() { +func periodicAgentUpdate() { for { // Sleep for a random interval between 1 and 2 days time.Sleep(time.Duration(24+rand.Intn(24)) * time.Hour) @@ -309,12 +330,8 @@ func GetNewUserAgent(cacheKey string) (string, error) { return userAgent, nil } -func init() { - go periodicUpdate() -} - // func main() { -// go periodicUpdate() // not needed here +// go periodicAgentUpdate() // not needed here // cacheKey := "image-search" // userAgent, err := GetUserAgent(cacheKey) diff --git a/indexer.go b/indexer.go index 7963fc1..306c28d 100644 --- a/indexer.go +++ b/indexer.go @@ -3,6 +3,7 @@ package main import ( "bufio" "fmt" + "net/url" "os" "path/filepath" "strconv" @@ -10,10 +11,10 @@ import ( "time" "github.com/blevesearch/bleve/v2" + "golang.org/x/net/publicsuffix" ) // Document represents a single document to be indexed. 
-// You can add more fields if needed. type Document struct { ID string `json:"id"` Link string `json:"link"` @@ -48,16 +49,20 @@ func InitIndex() error { // Index doesn't exist, create a new one mapping := bleve.NewIndexMapping() - // Custom mapping for the document docMapping := bleve.NewDocumentMapping() - // Text fields with custom analyzers for better tokenization - textFieldMapping := bleve.NewTextFieldMapping() - textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches + // Text fields + titleFieldMapping := bleve.NewTextFieldMapping() + titleFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("title", titleFieldMapping) - docMapping.AddFieldMappingsAt("title", textFieldMapping) - docMapping.AddFieldMappingsAt("description", textFieldMapping) - docMapping.AddFieldMappingsAt("tags", textFieldMapping) + descFieldMapping := bleve.NewTextFieldMapping() + descFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("description", descFieldMapping) + + tagFieldMapping := bleve.NewTextFieldMapping() + tagFieldMapping.Analyzer = "standard" + docMapping.AddFieldMappingsAt("tags", tagFieldMapping) // Numeric field for popularity popularityMapping := bleve.NewNumericFieldMapping() @@ -77,8 +82,19 @@ func InitIndex() error { return nil } +func normalizeDomain(rawURL string) string { + parsed, err := url.Parse(rawURL) + if err != nil { + return rawURL + } + domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname()) + if err != nil { + return parsed.Hostname() // fallback + } + return domain +} + // IndexFile reads a file line-by-line and indexes each line as a document. -// Each line represents a simple document. Adjust parsing as needed. func IndexFile(filePath string) error { file, err := os.Open(filePath) if err != nil { @@ -88,27 +104,29 @@ func IndexFile(filePath string) error { scanner := bufio.NewScanner(file) batch := bleveIndex.NewBatch() - indexedDomains := make(map[string]bool) // Track indexed domains + + // Map to track normalized domains we’ve already indexed + indexedDomains := make(map[string]bool) for scanner.Scan() { line := scanner.Text() - // Split the line into 5 fields: link|title|tags|description|popularity + // link|title|tags|description|popularity parts := strings.SplitN(line, "|", 5) if len(parts) < 5 { - continue // Skip malformed lines + continue } - domain := parts[0] + // Normalize domain part so duplicates share the same “key” + normalized := normalizeDomain(parts[0]) popularity, _ := strconv.ParseInt(parts[4], 10, 64) - // Skip if the domain is already indexed - if indexedDomains[domain] { + if indexedDomains[normalized] { continue } doc := Document{ - ID: domain, // Use the domain as the unique ID + ID: normalized, Link: parts[0], Title: parts[1], Tags: parts[2], @@ -127,10 +145,9 @@ func IndexFile(filePath string) error { return fmt.Errorf("failed to index document: %v", err) } - indexedDomains[domain] = true // Mark the domain as indexed + indexedDomains[normalized] = true } - // Commit the batch if err := bleveIndex.Batch(batch); err != nil { return fmt.Errorf("error committing batch: %v", err) } @@ -139,13 +156,12 @@ func IndexFile(filePath string) error { return fmt.Errorf("error reading file: %v", err) } - printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath) + printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath) return nil } // SearchIndex performs a full-text search on the indexed data. 
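// A hypothetical caller-side sketch (assuming InitIndex has already been called;
// the query string is illustrative only):
//
//	docs, err := SearchIndex("self hosted search", 1, 10)
//	if err == nil {
//		for _, d := range docs {
//			fmt.Println(d.Link, d.Title)
//		}
//	}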
func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { - // Create compound query exactMatch := bleve.NewMatchQuery(queryStr) // Exact match fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match fuzzyMatch.Fuzziness = 2 @@ -160,8 +176,8 @@ func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { req.Size = pageSize req.From = (page - 1) * pageSize - // Sort by popularity - req.SortBy([]string{"popularity"}) + // Sort primarily by relevance (score), then by popularity descending + req.SortBy([]string{"-_score", "-popularity"}) res, err := bleveIndex.Search(req) if err != nil { diff --git a/init.go b/init.go index c92e656..7a6dba2 100644 --- a/init.go +++ b/init.go @@ -61,6 +61,11 @@ func main() { } config.PeerID = hostID + // Initiate Browser Agent updater + if config.CrawlerEnabled || config.IndexerEnabled { + go periodicAgentUpdate() + } + InitializeLanguage("en") // Initialize language before generating OpenSearch generateOpenSearchXML(config) From 3494457336f4d8b0f5138b0057717d7b8af81c9d Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Wed, 1 Jan 2025 13:49:16 +0100 Subject: [PATCH 4/9] improved crawler data extraction --- crawler-extraction.go | 204 ++++++++++++++++++++++++++++++++++++++++++ crawler.go | 100 ++------------------- go.mod | 5 ++ go.sum | 14 +++ 4 files changed, 231 insertions(+), 92 deletions(-) create mode 100644 crawler-extraction.go diff --git a/crawler-extraction.go b/crawler-extraction.go new file mode 100644 index 0000000..1594bef --- /dev/null +++ b/crawler-extraction.go @@ -0,0 +1,204 @@ +package main + +import ( + "net/http" + "net/url" + "strings" + "time" + + "github.com/go-shiori/go-readability" + "golang.org/x/net/html" +) + +// fetchPageMetadata tries extracting title/description/keywords from standard HTML, +// OG, Twitter, then falls back to go-readability if needed. If after all that we +// still have no title or no description, we return ("", "", "") so the caller +// can skip saving it. +// +// 1. <title>, <meta name="description"/>, <meta name="keywords"/> +// 2. <meta property="og:title">, <meta property="og:description"> +// 3. <meta name="twitter:title">, <meta name="twitter:description"> +// 4. go-readability fallback (if title or description is still missing) +// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”). 
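+//
+// Hypothetical caller-side sketch (mirrors how crawler.go uses this function;
+// the URL is illustrative only):
+//
+//	title, desc, keywords := fetchPageMetadata("https://example.com")
+//	if title == "" || desc == "" {
+//		// skip this page rather than indexing placeholder values
+//	}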
+func fetchPageMetadata(pageURL string) (string, string, string) { + userAgent, err := GetUserAgent("crawler") + if err != nil { + printDebug("Failed to generate User-Agent: %v", err) + return "", "", "" + } + + client := &http.Client{Timeout: 15 * time.Second} + req, err := http.NewRequest("GET", pageURL, nil) + if err != nil { + printDebug("Failed to create request for %s: %v", pageURL, err) + return "", "", "" + } + + // Force English content when possible + req.Header.Set("User-Agent", userAgent) + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + + resp, err := client.Do(req) + if err != nil { + printDebug("Failed to GET %s: %v", pageURL, err) + return "", "", "" + } + defer resp.Body.Close() + + // Skip non-2xx + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode) + return "", "", "" + } + + // First pass: standard HTML parse + doc, err := html.Parse(resp.Body) + if err != nil { + printDebug("HTML parse error for %s: %v", pageURL, err) + return "", "", "" + } + + var ( + title, desc, keywords string + ogTitle, ogDesc string + twTitle, twDesc string + foundTitle, foundDesc bool + ) + + var walk func(*html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode { + switch strings.ToLower(n.Data) { + case "title": + if n.FirstChild != nil { + title = n.FirstChild.Data + foundTitle = true + } + case "meta": + var metaName, metaProperty, contentVal string + for _, attr := range n.Attr { + switch strings.ToLower(attr.Key) { + case "name": + metaName = strings.ToLower(attr.Val) + case "property": + metaProperty = strings.ToLower(attr.Val) + case "content": + contentVal = attr.Val + } + } + + // Standard meta tags + switch metaName { + case "description": + desc = contentVal + foundDesc = true + case "keywords": + keywords = contentVal + case "twitter:title": + twTitle = contentVal + case "twitter:description": + twDesc = contentVal + } + + // Open Graph tags + switch metaProperty { + case "og:title": + ogTitle = contentVal + case "og:description": + ogDesc = contentVal + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + walk(doc) + + // Fallback to OG or Twitter if <title>/description are missing + if !foundTitle { + if ogTitle != "" { + title = ogTitle + } else if twTitle != "" { + title = twTitle + } + } + if !foundDesc { + if ogDesc != "" { + desc = ogDesc + } else if twDesc != "" { + desc = twDesc + } + } + + // If still missing title or desc, fallback to go-readability + if title == "" || desc == "" { + parsedURL, parseErr := url.Parse(pageURL) + if parseErr != nil { + printDebug("Failed to parse URL %s: %v", pageURL, parseErr) + // We must skip if we can't parse the URL for readability + return "", "", "" + } + + readResp, readErr := client.Get(pageURL) + if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 { + defer readResp.Body.Close() + + article, rdErr := readability.FromReader(readResp.Body, parsedURL) + if rdErr == nil { + // If we still have no title, try from readability + if title == "" && article.Title != "" { + title = article.Title + } + // If we still have no description, try article.Excerpt + if desc == "" && article.Excerpt != "" { + desc = article.Excerpt + } else if desc == "" && len(article.Content) > 0 { + // If excerpt is empty, use a snippet from article.Content + snippet := article.Content + if len(snippet) > 200 { + snippet = snippet[:200] + "..." 
+ } + desc = snippet + } + } else { + printDebug("go-readability failed for %s: %v", pageURL, rdErr) + } + } + } + + // Heuristic: discard obviously incorrect HTML-y strings or placeholders + if looksLikeRawHTML(title) { + title = "" + } + if looksLikeRawHTML(desc) { + desc = "" + } + + // If after all that we have no title or description, skip + if title == "" || desc == "" { + return "", "", "" + } + + return sanitize(title), sanitize(desc), sanitize(keywords) +} + +// looksLikeRawHTML is a simple heuristic to check for leftover HTML or +// go-readability noise (e.g., "readability-page-1"). +func looksLikeRawHTML(text string) bool { + textLower := strings.ToLower(text) + if strings.Contains(textLower, "readability-page") { + return true + } + if strings.Count(textLower, "<div") > 0 || strings.Count(textLower, "<p") > 2 { + return true + } + return false +} + +// sanitize removes pipes and newlines so they don't break our output format. +func sanitize(input string) string { + input = strings.ReplaceAll(input, "|", " ") + input = strings.ReplaceAll(input, "\n", " ") + return strings.TrimSpace(input) +} diff --git a/crawler.go b/crawler.go index fbb5b5e..2a934f6 100644 --- a/crawler.go +++ b/crawler.go @@ -3,14 +3,11 @@ package main import ( "bufio" "fmt" - "net/http" "os" "path/filepath" "strings" "sync" "time" - - "golang.org/x/net/html" ) // webCrawlerInit is called during init on program start @@ -130,18 +127,18 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu mu.Unlock() title, desc, keywords := fetchPageMetadata(fullURL) - if title == "" { - title = "Unknown Title" - } - if desc == "" { - desc = "No Description" + + // Skip saving if title or description is missing + if title == "" || desc == "" { + printDebug("Skipping %s: missing title or description", fullURL) + return } line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, - sanitize(title), - sanitize(keywords), - sanitize(desc), + title, + keywords, + desc, rank, ) file.WriteString(line) @@ -151,84 +148,3 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu wg.Wait() return nil } - -// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords] -func fetchPageMetadata(pageURL string) (string, string, string) { - // Generate a User-Agent using your GetUserAgent function - userAgent, err := GetUserAgent("crawler") - if err != nil { - printWarn("Failed to generate User-Agent: %v", err) - return "", "", "" - } - - client := &http.Client{Timeout: 15 * time.Second} - req, err := http.NewRequest("GET", pageURL, nil) - if err != nil { - printWarn("Failed to create request for %s: %v", pageURL, err) - return "", "", "" - } - - // Set the dynamically generated User-Agent - req.Header.Set("User-Agent", userAgent) - - resp, err := client.Do(req) - if err != nil { - printWarn("Failed to GET %s: %v", pageURL, err) - return "", "", "" - } - defer resp.Body.Close() - - // Handle non-200 responses - if resp.StatusCode == 403 || resp.StatusCode == 401 { - printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode) - return "", "", "" - } else if resp.StatusCode < 200 || resp.StatusCode >= 300 { - printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode) - return "", "", "" - } - - // Parse HTML - doc, err := html.Parse(resp.Body) - if err != nil { - printWarn("HTML parse error for %s: %v", pageURL, err) - return "", "", "" - } - - var title, desc, keywords string - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode 
&& n.Data == "title" && n.FirstChild != nil { - title = n.FirstChild.Data - } - if n.Type == html.ElementNode && n.Data == "meta" { - var nameVal, contentVal string - for _, attr := range n.Attr { - switch strings.ToLower(attr.Key) { - case "name": - nameVal = strings.ToLower(attr.Val) - case "content": - contentVal = attr.Val - } - } - if nameVal == "description" { - desc = contentVal - } else if nameVal == "keywords" { - keywords = contentVal - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } - } - f(doc) - - return title, desc, keywords -} - -// sanitize is a quick helper to remove newlines/pipes from fields -func sanitize(input string) string { - input = strings.ReplaceAll(input, "|", " ") - input = strings.ReplaceAll(input, "\n", " ") - input = strings.TrimSpace(input) - return input -} diff --git a/go.mod b/go.mod index 6895586..a293a75 100644 --- a/go.mod +++ b/go.mod @@ -15,12 +15,14 @@ require ( require ( github.com/blevesearch/bleve/v2 v2.4.4 + github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f golang.org/x/net v0.33.0 ) require ( github.com/RoaringBitmap/roaring v1.9.4 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/bits-and-blooms/bitset v1.20.0 // indirect github.com/blevesearch/bleve_index_api v1.2.0 // indirect github.com/blevesearch/geo v0.1.20 // indirect @@ -40,6 +42,8 @@ require ( github.com/blevesearch/zapx/v15 v15.3.17 // indirect github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect github.com/go-ole/go-ole v1.3.0 // indirect + github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect + github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect @@ -51,5 +55,6 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.etcd.io/bbolt v1.3.11 // indirect golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect google.golang.org/protobuf v1.36.0 // indirect ) diff --git a/go.sum b/go.sum index f3f643b..59414b4 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw= github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU= github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= @@ -51,6 +53,12 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/go-shiori/dom 
v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= +github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI= +github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I= github.com/golang/geo v0.0.0-20230421003525-6adc56603217/go.mod h1:8wI0hitZ3a1IxZfeH3/5I97CI8i5cLGsYe7xNhQGs9U= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -64,6 +72,7 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= +github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -73,6 +82,10 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -147,6 +160,7 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= From 
c71808aa1e116c8c9238e620f255ee5d8ba3f4bb Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Wed, 1 Jan 2025 14:50:12 +0100 Subject: [PATCH 5/9] improved crawler data extraction (added chromedp) --- .gitignore | 3 +- config.go | 101 +++++++++++----------- crawler-extraction.go | 196 ++++++++++++++++++++++++++---------------- crawler.go | 146 ++++++++++++++++++++++--------- go.mod | 8 ++ go.sum | 17 ++++ 6 files changed, 305 insertions(+), 166 deletions(-) diff --git a/.gitignore b/.gitignore index 118b838..5f5aeab 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ image_cache/ cache/ *.min.js *.min.css -qgato \ No newline at end of file +qgato +test.py \ No newline at end of file diff --git a/config.go b/config.go index 4ea4eb2..bdd9ccc 100644 --- a/config.go +++ b/config.go @@ -23,43 +23,45 @@ type CacheConfig struct { } type Config struct { - Port int // Added - AuthCode string // Added - PeerID string // Added - Peers []string - Domain string // Added - NodesEnabled bool // Added - CrawlerEnabled bool // Added - IndexerEnabled bool // Added - WebsiteEnabled bool // Added - RamCacheEnabled bool - DriveCacheEnabled bool // Added - LogLevel int // Added - ConcurrentCrawlers int // Number of concurrent crawlers - CrawlingInterval time.Duration // Refres crawled results in... - MaxPagesPerDomain int // Max pages to crawl per domain - IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") + Port int // Added + AuthCode string // Added + PeerID string // Added + Peers []string + Domain string // Added + NodesEnabled bool // Added + CrawlerEnabled bool // Added + IndexerEnabled bool // Added + WebsiteEnabled bool // Added + RamCacheEnabled bool + DriveCacheEnabled bool // Added + LogLevel int // Added + ConcurrentStandardCrawlers int + ConcurrentChromeCrawlers int + CrawlingInterval time.Duration // Refres crawled results in... 
+ MaxPagesPerDomain int // Max pages to crawl per domain + IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") DriveCache CacheConfig RamCache CacheConfig } var defaultConfig = Config{ - Port: 5000, - Domain: "localhost", - Peers: []string{}, - AuthCode: generateStrongRandomString(64), - NodesEnabled: false, - CrawlerEnabled: true, - IndexerEnabled: false, - WebsiteEnabled: true, - RamCacheEnabled: true, - DriveCacheEnabled: false, - ConcurrentCrawlers: 5, - CrawlingInterval: 24 * time.Hour, - MaxPagesPerDomain: 10, - IndexRefreshInterval: 2 * time.Minute, - LogLevel: 1, + Port: 5000, + Domain: "localhost", + Peers: []string{}, + AuthCode: generateStrongRandomString(64), + NodesEnabled: false, + CrawlerEnabled: true, + IndexerEnabled: false, + WebsiteEnabled: true, + RamCacheEnabled: true, + DriveCacheEnabled: false, + ConcurrentStandardCrawlers: 12, + ConcurrentChromeCrawlers: 4, + CrawlingInterval: 24 * time.Hour, + MaxPagesPerDomain: 10, + IndexRefreshInterval: 2 * time.Minute, + LogLevel: 1, DriveCache: CacheConfig{ Duration: 48 * time.Hour, // Added Path: "./cache", // Added @@ -249,7 +251,8 @@ func saveConfig(config Config) { // Indexer section indexerSec := cfg.Section("Indexer") - indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers)) + indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers)) + indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers)) indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String()) indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain)) indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String()) @@ -296,7 +299,8 @@ func loadConfig() Config { driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled) // Indexing - concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi) + concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi) + concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi) crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration) maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi) indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration) @@ -315,21 +319,22 @@ func loadConfig() Config { ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes))) return Config{ - Port: port, - Domain: domain, - LogLevel: logLevel, - AuthCode: authCode, - Peers: peers, - NodesEnabled: nodesEnabled, - CrawlerEnabled: crawlerEnabled, - IndexerEnabled: indexerEnabled, - WebsiteEnabled: websiteEnabled, - RamCacheEnabled: ramCacheEnabled, - DriveCacheEnabled: driveCacheEnabled, - ConcurrentCrawlers: concurrentCrawlers, - CrawlingInterval: crawlingInterval, - MaxPagesPerDomain: maxPagesPerDomain, - IndexRefreshInterval: indexRefreshInterval, + Port: port, + Domain: domain, + LogLevel:
logLevel, + AuthCode: authCode, + Peers: peers, + NodesEnabled: nodesEnabled, + CrawlerEnabled: crawlerEnabled, + IndexerEnabled: indexerEnabled, + WebsiteEnabled: websiteEnabled, + RamCacheEnabled: ramCacheEnabled, + DriveCacheEnabled: driveCacheEnabled, + ConcurrentStandardCrawlers: concurrentStandardCrawlers, + ConcurrentChromeCrawlers: concurrentChromeCrawlers, + CrawlingInterval: crawlingInterval, + MaxPagesPerDomain: maxPagesPerDomain, + IndexRefreshInterval: indexRefreshInterval, DriveCache: CacheConfig{ Duration: driveDuration, MaxUsageBytes: driveMaxUsage, diff --git a/crawler-extraction.go b/crawler-extraction.go index 1594bef..4ce8b9d 100644 --- a/crawler-extraction.go +++ b/crawler-extraction.go @@ -1,69 +1,99 @@ package main import ( + "context" "net/http" "net/url" "strings" "time" + "github.com/chromedp/cdproto/emulation" + "github.com/chromedp/chromedp" "github.com/go-shiori/go-readability" "golang.org/x/net/html" ) -// fetchPageMetadata tries extracting title/description/keywords from standard HTML, -// OG, Twitter, then falls back to go-readability if needed. If after all that we -// still have no title or no description, we return ("", "", "") so the caller -// can skip saving it. -// -// 1. <title>, <meta name="description"/>, <meta name="keywords"/> -// 2. <meta property="og:title">, <meta property="og:description"> -// 3. <meta name="twitter:title">, <meta name="twitter:description"> -// 4. go-readability fallback (if title or description is still missing) -// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”). -func fetchPageMetadata(pageURL string) (string, string, string) { - userAgent, err := GetUserAgent("crawler") +// fetchPageMetadataStandard tries standard HTML parse + go-readability only. +func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) { + // 1. Standard HTML parse + title, desc, keywords := extractStandard(pageURL, userAgent) + + // 2. Fallback: go-readability + if title == "" || desc == "" { + title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords) + } + + // If still empty, return ("", "", "") + if title == "" || desc == "" { + return "", "", "" + } + return sanitize(title), sanitize(desc), sanitize(keywords) +} + +// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages. +func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) { + // Create context + ctx, cancel := chromedp.NewContext(context.Background()) + defer cancel() + + var renderedHTML string + err := chromedp.Run(ctx, + emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"), + chromedp.Navigate(pageURL), + chromedp.Sleep(2*time.Second), // Let JS run a bit + chromedp.OuterHTML("html", &renderedHTML), + ) if err != nil { - printDebug("Failed to generate User-Agent: %v", err) + printDebug("chromedp error for %s: %v", pageURL, err) return "", "", "" } + doc, err := html.Parse(strings.NewReader(renderedHTML)) + if err != nil { + printDebug("chromedp parse error for %s: %v", pageURL, err) + return "", "", "" + } + + return extractParsedDOM(doc) +} + +// extractStandard does the normal HTML parse with OG, Twitter, etc. 
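+// (Note: this path does a plain HTTP GET, so pages that only render their
+// content via JavaScript will usually come back with an empty title/description
+// here and are expected to fall through to fetchPageMetadataChrome; see the
+// standard/Chrome hand-off in crawler.go.)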
+func extractStandard(pageURL, userAgent string) (title, desc, keywords string) { client := &http.Client{Timeout: 15 * time.Second} req, err := http.NewRequest("GET", pageURL, nil) if err != nil { printDebug("Failed to create request for %s: %v", pageURL, err) - return "", "", "" + return } - - // Force English content when possible req.Header.Set("User-Agent", userAgent) req.Header.Set("Accept-Language", "en-US,en;q=0.9") resp, err := client.Do(req) if err != nil { printDebug("Failed to GET %s: %v", pageURL, err) - return "", "", "" + return } defer resp.Body.Close() - // Skip non-2xx if resp.StatusCode < 200 || resp.StatusCode >= 300 { printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode) - return "", "", "" + return } - // First pass: standard HTML parse doc, err := html.Parse(resp.Body) if err != nil { printDebug("HTML parse error for %s: %v", pageURL, err) - return "", "", "" + return } - var ( - title, desc, keywords string - ogTitle, ogDesc string - twTitle, twDesc string - foundTitle, foundDesc bool - ) + return extractParsedDOM(doc) +} + +// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter. +func extractParsedDOM(doc *html.Node) (title, desc, keywords string) { + var ogTitle, ogDesc string + var twTitle, twDesc string + var foundTitle, foundDesc bool var walk func(*html.Node) walk = func(n *html.Node) { @@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) { } } - // Standard meta tags switch metaName { case "description": desc = contentVal @@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) { twDesc = contentVal } - // Open Graph tags switch metaProperty { case "og:title": ogTitle = contentVal @@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) { } walk(doc) - // Fallback to OG or Twitter if <title>/description are missing + // fallback to OG/Twitter if missing if !foundTitle { if ogTitle != "" { title = ogTitle @@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) { } } - // If still missing title or desc, fallback to go-readability - if title == "" || desc == "" { - parsedURL, parseErr := url.Parse(pageURL) - if parseErr != nil { - printDebug("Failed to parse URL %s: %v", pageURL, parseErr) - // We must skip if we can't parse the URL for readability - return "", "", "" - } - - readResp, readErr := client.Get(pageURL) - if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 { - defer readResp.Body.Close() - - article, rdErr := readability.FromReader(readResp.Body, parsedURL) - if rdErr == nil { - // If we still have no title, try from readability - if title == "" && article.Title != "" { - title = article.Title - } - // If we still have no description, try article.Excerpt - if desc == "" && article.Excerpt != "" { - desc = article.Excerpt - } else if desc == "" && len(article.Content) > 0 { - // If excerpt is empty, use a snippet from article.Content - snippet := article.Content - if len(snippet) > 200 { - snippet = snippet[:200] + "..." 
- } - desc = snippet - } - } else { - printDebug("go-readability failed for %s: %v", pageURL, rdErr) - } - } - } - - // Heuristic: discard obviously incorrect HTML-y strings or placeholders + // Heuristic check if looksLikeRawHTML(title) { title = "" } @@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) { desc = "" } - // If after all that we have no title or description, skip - if title == "" || desc == "" { - return "", "", "" - } - - return sanitize(title), sanitize(desc), sanitize(keywords) + return title, desc, keywords } -// looksLikeRawHTML is a simple heuristic to check for leftover HTML or -// go-readability noise (e.g., "readability-page-1"). +// fallbackReadability tries go-readability if title/desc is missing. +func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) { + if title != "" && desc != "" { + return title, desc, keywords + } + + client := &http.Client{Timeout: 15 * time.Second} + readReq, err := http.NewRequest("GET", pageURL, nil) + if err != nil { + printDebug("Failed to create fallbackReadability request: %v", err) + return title, desc, keywords + } + readReq.Header.Set("User-Agent", userAgent) + readReq.Header.Set("Accept-Language", "en-US,en;q=0.9") + + readResp, err := client.Do(readReq) + if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 { + if err != nil { + printDebug("go-readability GET error for %s: %v", pageURL, err) + } + if readResp != nil { + readResp.Body.Close() + } + return title, desc, keywords + } + defer readResp.Body.Close() + + parsedURL, parseErr := url.Parse(pageURL) + if parseErr != nil { + printDebug("Failed to parse URL: %v", parseErr) + return title, desc, keywords + } + + article, rdErr := readability.FromReader(readResp.Body, parsedURL) + if rdErr != nil { + printDebug("go-readability error for %s: %v", pageURL, rdErr) + return title, desc, keywords + } + + if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) { + title = article.Title + } + if desc == "" { + if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) { + desc = article.Excerpt + } else if len(article.Content) > 0 { + snippet := article.Content + if len(snippet) > 200 { + snippet = snippet[:200] + "..." + } + if !looksLikeRawHTML(snippet) { + desc = snippet + } + } + } + return title, desc, keywords +} + +// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text func looksLikeRawHTML(text string) bool { textLower := strings.ToLower(text) if strings.Contains(textLower, "readability-page") { @@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool { return false } -// sanitize removes pipes and newlines so they don't break our output format. +// sanitize removes pipes/newlines so they don't break our output format. func sanitize(input string) string { input = strings.ReplaceAll(input, "|", " ") input = strings.ReplaceAll(input, "\n", " ") diff --git a/crawler.go b/crawler.go index 2a934f6..45dc76f 100644 --- a/crawler.go +++ b/crawler.go @@ -35,7 +35,7 @@ func runCrawlerAndIndexer() { // 2. 
Crawl each domain and write results to data_to_index.txt outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") - if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil { + if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil { printErr("Error crawling domains: %v", err) return } @@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) { return result, scanner.Err() } -// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile -func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error { +// crawlDomainsToFile does an async pipeline: +// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh +// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip +func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error { existingEntries := make(map[string]bool) - var mu sync.Mutex // Mutex to protect access to the map + var mu sync.Mutex // For existingEntries + file writes + // read existing entries from outFile if it exists if _, err := os.Stat(outFile); err == nil { file, err := os.Open(outFile) if err != nil { return fmt.Errorf("unable to open %s: %v", outFile, err) } defer file.Close() - scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() @@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu } defer file.Close() - semaphore := make(chan struct{}, concurrentCrawlers) - var wg sync.WaitGroup + // Prepare channels + standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking + chromeCh := make(chan [2]string, 1000) - for _, d := range domains { - wg.Add(1) - semaphore <- struct{}{} - go func(domain [2]string) { - defer wg.Done() - defer func() { <-semaphore }() + // 1) Spawn standard workers + var wgStandard sync.WaitGroup + for i := 0; i < config.ConcurrentStandardCrawlers; i++ { + wgStandard.Add(1) + go func() { + defer wgStandard.Done() + for dom := range standardCh { + rank := dom[0] + domainName := dom[1] + fullURL := "https://" + domainName - rank := domain[0] - domainName := domain[1] - fullURL := "https://" + domainName - - mu.Lock() - if domainName == "" || existingEntries[fullURL] { + // Mark domain existing so we don't re-crawl duplicates + mu.Lock() + if domainName == "" || existingEntries[fullURL] { + mu.Unlock() + continue + } + existingEntries[fullURL] = true + mu.Unlock() + + // get a standard user agent + userAgent, _ := GetUserAgent("crawler-std") + title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent) + + if title == "" || desc == "" { + // push to chromeCh + chromeCh <- dom + continue + } + + // write to file + line := fmt.Sprintf("%s|%s|%s|%s|%s\n", + fullURL, title, keywords, desc, rank) + + mu.Lock() + file.WriteString(line) mu.Unlock() - return } - existingEntries[fullURL] = true - mu.Unlock() - - title, desc, keywords := fetchPageMetadata(fullURL) - - // Skip saving if title or description is missing - if title == "" || desc == "" { - printDebug("Skipping %s: missing title or description", fullURL) - return - } - - line := fmt.Sprintf("%s|%s|%s|%s|%s\n", - fullURL, - title, - keywords, - desc, - rank, - ) - file.WriteString(line) - }(d) + }() } - wg.Wait() + // 2) Spawn chrome workers + var wgChrome sync.WaitGroup + for i := 0; i < 
config.ConcurrentChromeCrawlers; i++ { + wgChrome.Add(1) + go func() { + defer wgChrome.Done() + for dom := range chromeCh { + rank := dom[0] + domainName := dom[1] + fullURL := "https://" + domainName + + // Mark domain existing if not already + mu.Lock() + if domainName == "" || existingEntries[fullURL] { + mu.Unlock() + continue + } + existingEntries[fullURL] = true + mu.Unlock() + + // get a chrome user agent + userAgent, _ := GetUserAgent("crawler-chrome") + title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent) + + if title == "" || desc == "" { + printWarn("Skipping (Chrome) %s: missing title/desc", fullURL) + continue + } + + // write to file + line := fmt.Sprintf("%s|%s|%s|%s|%s\n", + fullURL, title, keywords, desc, rank) + + mu.Lock() + file.WriteString(line) + mu.Unlock() + } + }() + } + + // Feed domains into standardCh + go func() { + for _, dom := range domains { + // optionally, if maxPages is relevant, you can track how many have been processed + standardCh <- dom + } + // close the standardCh once all are queued + close(standardCh) + }() + + // Wait for standard workers to finish, then close chromeCh + go func() { + wgStandard.Wait() + close(chromeCh) + }() + + // Wait for chrome workers to finish + wgChrome.Wait() + return nil } diff --git a/go.mod b/go.mod index a293a75..c8200d3 100644 --- a/go.mod +++ b/go.mod @@ -41,13 +41,21 @@ require ( github.com/blevesearch/zapx/v14 v14.3.10 // indirect github.com/blevesearch/zapx/v15 v15.3.17 // indirect github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect + github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect + github.com/chromedp/chromedp v0.11.2 // indirect + github.com/chromedp/sysutil v1.1.0 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect + github.com/gobwas/httphead v0.1.0 // indirect + github.com/gobwas/pool v0.2.1 // indirect + github.com/gobwas/ws v1.4.0 // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect + github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/mschoch/smat v0.2.0 // indirect diff --git a/go.sum b/go.sum index 59414b4..148146f 100644 --- a/go.sum +++ b/go.sum @@ -47,6 +47,12 @@ github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5Y github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI= github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk= github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU= +github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb h1:noKVm2SsG4v0Yd0lHNtFYc9EUxIVvrr4kJ6hM8wvIYU= +github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb/go.mod h1:4XqMl3iIW08jtieURWL6Tt5924w21pxirC6th662XUM= +github.com/chromedp/chromedp v0.11.2 h1:ZRHTh7DjbNTlfIv3NFTbB7eVeu5XCNkgrpcGSpn2oX0= +github.com/chromedp/chromedp v0.11.2/go.mod h1:lr8dFRLKsdTTWb75C/Ttol2vnBKOSnt0BW8R9Xaupi8= +github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM= 
+github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -57,6 +63,12 @@ github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziH github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI= github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w= +github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU= +github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= +github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= +github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I= @@ -68,10 +80,14 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= @@ -137,6 +153,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= From 918e1823dfc6a20127d18a453ab6cbb734cc2190 Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Wed, 1 Jan 2025 23:48:47 +0100 Subject: [PATCH 6/9] added visited sites functionality to crawler --- README.md | 34 +++++++-------- crawler-visited.go | 106 +++++++++++++++++++++++++++++++++++++++++++++ crawler.go | 93 ++++++++++++++++++++------------------- go.mod | 4 +- go.sum | 4 ++ 5 files changed, 178 insertions(+), 63 deletions(-) create mode 100644 crawler-visited.go diff --git a/README.md b/README.md index 23e8bf5..5ad3337 100644 --- a/README.md +++ b/README.md @@ -7,30 +7,30 @@ </p> <p align="center"> -A self-hosted private <a href="https://en.wikipedia.org/wiki/Metasearch_engine">metasearch engine</a> that aims to be more resource-efficient than its competition. +A self-hosted private search engine designed to be scalable and more resource-efficient than its competitors. </p> # Bare in mind that this project is still WIP -## Comparison to other search engines +## Comparison to other open-source search engines -| Feature | Whoogle [1] | Araa-Search | LibreY | 4get | SearchXNG | *QGato* | -| :------------------------- | ------------------ | ------------------------- | ------------------------ | ------------------------ | ------------------------- | ---------------------------------------------------- | -| Works without JavaScript | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Music search | ❓ | ❌ | ❌ | ✅ | ✅ | ✅ | -| Torrent search | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | -| API | ❌ | ❓ [2] | ✅ | ✅ | ✅ | ✅ | -| Scalable | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | -| Not Resource Hungry | ❓ Moderate | ❌ Very resource hungry | ❌ Moderate 200-400mb~ | ❌ Moderate 200-400mb~ | ❌ Moderate 200-300MiB~ | ✅ about 15-20MiB at idle, 17-22MiB when searching | -| Result caching | ❌ | ❌ | ❓ | ❓ | ❓ | ✅ | -| Dynamic Page Loading | ❓ Not specified | ❌ | ❌ | ❌ | ✅ | ✅ | -| User themable | ❌ | ✅ | ❌ | ❌ | ✅[3] | ✅ | -| Unusual logo choice | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| Feature | Whoogle [1] | Araa-Search | LibreY | 4get | SearchXNG | *QGato* | +| :------------------------- | ------------- | ------------------------- | ------------------------ | ------------------------ | ------------------------- | --------------------------------------- | +| Works without JavaScript | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Music search | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | +| Torrent search | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | +| API | ❌ | ❌ [2] | ✅ | ✅ | ✅ | ✅ | +| Scalable | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | +| Not Resource Hungry | ❓ Moderate | ❌ Very resource hungry | ❌ Moderate 200-400mb~ | ❌ Moderate 200-400mb~ | ❌ Moderate 200-300MiB~ | ✅ about 15-30MiB even when searching | +| Result caching | ❓ | ❓ | ❓ | ❓ | ❓ | ✅ | +| Dynamic Page Loading | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | +| User themable | ❌ | ✅ | ❌ | ❌ | ❓[3] | ✅ | +| Unusual logo choice | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | [1]: I was not able to check this since their site does not work, same for the community instances. -[2]: In the project repo they specify that it has API, but It looks like they are no loger supporting it. Or just removed "API" button and documentation, since I was not able to find it anymore. +[2]: In the project repo they specify that it has API, but It looks like they are no longer supporting it. Or just removed "API" button and documentation, since I was not able to find it anymore. 
[3]: It is called 'User Themable' because you want to give the user freedom of choice for their theme, not by hard-setting one theme in the backend and calling it themable. @@ -48,7 +48,7 @@ A self-hosted private <a href="https://en.wikipedia.org/wiki/Metasearch_engine"> ### For Self-Hosting - **Self-hosted option** - Run on your own server for even more privacy. -- **Lightweight** - Low memory footprint (15-22MiB) even during searches. +- **Lightweight** - Low memory footprint (15-30MiB) even during searches. - **Decentralized** - No single point of failure. - **Results caching in RAM** - Faster response times through caching. - **Configurable** - Tweak features via `config.ini`. @@ -67,7 +67,7 @@ A self-hosted private <a href="https://en.wikipedia.org/wiki/Metasearch_engine"> ### Prerequisites -- Go (version 1.18 or higher recommended) +- Go (version 1.23 or higher recommended) - Git (unexpected) - Access to the internet for fetching results (even more unexpected) diff --git a/crawler-visited.go b/crawler-visited.go new file mode 100644 index 0000000..bfa1af9 --- /dev/null +++ b/crawler-visited.go @@ -0,0 +1,106 @@ +package main + +import ( + "bufio" + "fmt" + "os" + "sync" +) + +// VisitedStore handles deduplicating visited URLs with a map and a periodic flush to disk. +type VisitedStore struct { + mu sync.Mutex + visited map[string]bool + toFlush []string + + filePath string + batchSize int // how many new URLs we batch before flushing +} + +// NewVisitedStore creates or loads the visited URLs from filePath. +func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) { + store := &VisitedStore{ + visited: make(map[string]bool), + filePath: filePath, + batchSize: batchSize, + } + + // Attempt to load existing visited URLs (if file exists). + if _, err := os.Stat(filePath); err == nil { + if err := store.loadFromFile(); err != nil { + return nil, fmt.Errorf("loadFromFile error: %w", err) + } + } + return store, nil +} + +// loadFromFile loads visited URLs from the store’s file. One URL per line. +func (s *VisitedStore) loadFromFile() error { + f, err := os.Open(s.filePath) + if err != nil { + return err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + url := scanner.Text() + s.visited[url] = true + } + return scanner.Err() +} + +// AlreadyVisited returns true if the URL is in the store. +func (s *VisitedStore) AlreadyVisited(url string) bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.visited[url] +} + +// MarkVisited adds the URL to the store if not already present, and triggers a flush if batchSize is reached. +func (s *VisitedStore) MarkVisited(url string) (added bool, err error) { + s.mu.Lock() + defer s.mu.Unlock() + + if s.visited[url] { + return false, nil + } + // Mark in memory + s.visited[url] = true + s.toFlush = append(s.toFlush, url) + + // Flush if we have enough new URLs + if len(s.toFlush) >= s.batchSize { + if err := s.flushToFileUnlocked(); err != nil { + return false, err + } + } + return true, nil +} + +// Flush everything in s.toFlush to file, then clear the buffer. +func (s *VisitedStore) Flush() error { + s.mu.Lock() + defer s.mu.Unlock() + return s.flushToFileUnlocked() +} + +// flushToFileUnlocked writes s.toFlush lines to the store file, then clears s.toFlush. 
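+// The caller is expected to hold s.mu; both MarkVisited and Flush invoke it with
+// the lock already held.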
+func (s *VisitedStore) flushToFileUnlocked() error { + if len(s.toFlush) == 0 { + return nil + } + f, err := os.OpenFile(s.filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return err + } + defer f.Close() + + for _, url := range s.toFlush { + if _, err := fmt.Fprintln(f, url); err != nil { + return err + } + } + s.toFlush = nil + return nil +} diff --git a/crawler.go b/crawler.go index 45dc76f..3ddc36b 100644 --- a/crawler.go +++ b/crawler.go @@ -10,13 +10,24 @@ import ( "time" ) +// Create a global or config-level visited store +var visitedStore *VisitedStore + // webCrawlerInit is called during init on program start func webCrawlerInit() { + // Initialize the store with, say, batchSize=50 + store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50) + if err != nil { + printErr("Failed to initialize visited store: %v", err) + } + visitedStore = store + + // Start the periodic crawler go func() { // First run immediately runCrawlerAndIndexer() - // Then run periodically based on CrawlingInterval + // Then run periodically ticker := time.NewTicker(config.CrawlingInterval) for range ticker.C { runCrawlerAndIndexer() @@ -79,25 +90,8 @@ func readDomainsCSV(csvPath string) ([][2]string, error) { // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error { - existingEntries := make(map[string]bool) - var mu sync.Mutex // For existingEntries + file writes - // read existing entries from outFile if it exists - if _, err := os.Stat(outFile); err == nil { - file, err := os.Open(outFile) - if err != nil { - return fmt.Errorf("unable to open %s: %v", outFile, err) - } - defer file.Close() - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := scanner.Text() - parts := strings.SplitN(line, "|", 5) - if len(parts) >= 1 { - existingEntries[parts[0]] = true - } - } - } + var mu sync.Mutex // Open file for writing (truncate if existing) file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) @@ -119,33 +113,38 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error for dom := range standardCh { rank := dom[0] domainName := dom[1] - fullURL := "https://" + domainName - - // Mark domain existing so we don't re-crawl duplicates - mu.Lock() - if domainName == "" || existingEntries[fullURL] { - mu.Unlock() + if domainName == "" { continue } - existingEntries[fullURL] = true - mu.Unlock() + fullURL := "https://" + domainName - // get a standard user agent + // 1. Check if we've already visited this URL + added, err := visitedStore.MarkVisited(fullURL) + if err != nil { + printErr("MarkVisited error for %s: %v", fullURL, err) + continue + } + if !added { + // Already visited + continue + } + + // 2. Standard extraction userAgent, _ := GetUserAgent("crawler-std") title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent) + // If missing, push to Chrome queue if title == "" || desc == "" { - // push to chromeCh chromeCh <- dom continue } - // write to file + // 3. 
Write to file line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, title, keywords, desc, rank) mu.Lock() - file.WriteString(line) + _, _ = file.WriteString(line) mu.Unlock() } }() @@ -160,32 +159,32 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error for dom := range chromeCh { rank := dom[0] domainName := dom[1] - fullURL := "https://" + domainName - - // Mark domain existing if not already - mu.Lock() - if domainName == "" || existingEntries[fullURL] { - mu.Unlock() + if domainName == "" { continue } - existingEntries[fullURL] = true - mu.Unlock() + fullURL := "https://" + domainName - // get a chrome user agent + // We already marked it visited in the standard pass + // but you may re-check if you prefer: + // + // added, err := visitedStore.MarkVisited(fullURL) + // if err != nil { ... } + // if !added { continue } + + // 3. Chromedp fallback extraction userAgent, _ := GetUserAgent("crawler-chrome") title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent) - if title == "" || desc == "" { printWarn("Skipping (Chrome) %s: missing title/desc", fullURL) continue } - // write to file + // 4. Write to file line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, title, keywords, desc, rank) mu.Lock() - file.WriteString(line) + _, _ = file.WriteString(line) mu.Unlock() } }() @@ -194,7 +193,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error // Feed domains into standardCh go func() { for _, dom := range domains { - // optionally, if maxPages is relevant, you can track how many have been processed standardCh <- dom } // close the standardCh once all are queued @@ -210,5 +208,12 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error // Wait for chrome workers to finish wgChrome.Wait() + // Optionally flush the visited store once more + if visitedStore != nil { + if err := visitedStore.Flush(); err != nil { + printErr("visitedStore flush error: %v", err) + } + } + return nil } diff --git a/go.mod b/go.mod index c8200d3..f7d89ad 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,8 @@ require ( require ( github.com/blevesearch/bleve/v2 v2.4.4 + github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb + github.com/chromedp/chromedp v0.11.2 github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f golang.org/x/net v0.33.0 ) @@ -41,8 +43,6 @@ require ( github.com/blevesearch/zapx/v14 v14.3.10 // indirect github.com/blevesearch/zapx/v15 v15.3.17 // indirect github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect - github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect - github.com/chromedp/chromedp v0.11.2 // indirect github.com/chromedp/sysutil v1.1.0 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect diff --git a/go.sum b/go.sum index 148146f..66cede6 100644 --- a/go.sum +++ b/go.sum @@ -84,6 +84,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= 
github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= @@ -96,6 +98,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= From 61266c461ade647fa24b898f0b10430ae09eb56b Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Thu, 2 Jan 2025 12:55:44 +0100 Subject: [PATCH 7/9] changed indexing buffer to save to ram not to file --- config.go | 10 ++--- crawler.go | 74 +++++++++++++++------------------ indexer.go | 120 +++++++++++++++++++++++++++++++++++++++++++++++++---- init.go | 13 +++--- 4 files changed, 155 insertions(+), 62 deletions(-) diff --git a/config.go b/config.go index bdd9ccc..18d83cf 100644 --- a/config.go +++ b/config.go @@ -39,7 +39,7 @@ type Config struct { ConcurrentChromeCrawlers int CrawlingInterval time.Duration // Refres crawled results in... MaxPagesPerDomain int // Max pages to crawl per domain - IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") + IndexBatchSize int DriveCache CacheConfig RamCache CacheConfig @@ -60,7 +60,7 @@ var defaultConfig = Config{ ConcurrentChromeCrawlers: 4, CrawlingInterval: 24 * time.Hour, MaxPagesPerDomain: 10, - IndexRefreshInterval: 2 * time.Minute, + IndexBatchSize: 50, LogLevel: 1, DriveCache: CacheConfig{ Duration: 48 * time.Hour, // Added @@ -255,7 +255,7 @@ func saveConfig(config Config) { indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers)) indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String()) indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain)) - indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String()) + indexerSec.Key("IndexBatchSize").SetValue(strconv.Itoa(config.IndexBatchSize)) // DriveCache section driveSec := cfg.Section("DriveCache") @@ -303,7 +303,7 @@ func loadConfig() Config { concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi) crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration) maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi) - indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration) + indexBatchSize := getConfigValue(cfg.Section("Indexer").Key("IndexBatchSize"), defaultConfig.IndexBatchSize, strconv.Atoi) // 
DriveCache driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration) @@ -334,7 +334,7 @@ func loadConfig() Config { ConcurrentChromeCrawlers: concurrentChromeCrawlers, CrawlingInterval: crawlingInterval, MaxPagesPerDomain: maxPagesPerDomain, - IndexRefreshInterval: indexRefreshInterval, + IndexBatchSize: indexBatchSize, DriveCache: CacheConfig{ Duration: driveDuration, MaxUsageBytes: driveMaxUsage, diff --git a/crawler.go b/crawler.go index 3ddc36b..afa7f9e 100644 --- a/crawler.go +++ b/crawler.go @@ -2,7 +2,6 @@ package main import ( "bufio" - "fmt" "os" "path/filepath" "strings" @@ -45,14 +44,20 @@ func runCrawlerAndIndexer() { } // 2. Crawl each domain and write results to data_to_index.txt - outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") - if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil { + if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil { printErr("Error crawling domains: %v", err) return } - // 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval - startPeriodicIndexing(outFile, config.IndexRefreshInterval) + // After finishing crawling, flush any pending visited-urls + if visitedStore != nil { + if err := visitedStore.Flush(); err != nil { + printErr("Failed to flush visitedStore: %v", err) + } + } + + // 3. Re-index data_to_index.txt based on IndexRefreshInterval + //startPeriodicIndexing(outFile, config.IndexRefreshInterval) printDebug("Crawl + index refresh completed.") } @@ -89,16 +94,10 @@ func readDomainsCSV(csvPath string) ([][2]string, error) { // crawlDomainsToFile does an async pipeline: // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip -func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error { - - var mu sync.Mutex - - // Open file for writing (truncate if existing) - file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) - if err != nil { - return fmt.Errorf("unable to open %s for writing: %v", outFile, err) - } - defer file.Close() +// +// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...). +func crawlDomainsToFile(domains [][2]string, maxPages int) error { + var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.) // Prepare channels standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking @@ -110,6 +109,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error wgStandard.Add(1) go func() { defer wgStandard.Done() + for dom := range standardCh { rank := dom[0] domainName := dom[1] @@ -118,14 +118,17 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error } fullURL := "https://" + domainName - // 1. Check if we've already visited this URL + // Mark visited so we don't re-crawl duplicates + mu.Lock() added, err := visitedStore.MarkVisited(fullURL) + mu.Unlock() + if err != nil { printErr("MarkVisited error for %s: %v", fullURL, err) continue } if !added { - // Already visited + // Already visited, skip continue } @@ -139,13 +142,11 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error continue } - // 3. 
Write to file - line := fmt.Sprintf("%s|%s|%s|%s|%s\n", - fullURL, title, keywords, desc, rank) - - mu.Lock() - _, _ = file.WriteString(line) - mu.Unlock() + // 3. Directly index + err = indexDocImmediately(fullURL, title, keywords, desc, rank) + if err != nil { + printErr("Index error for %s: %v", fullURL, err) + } } }() } @@ -156,6 +157,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error wgChrome.Add(1) go func() { defer wgChrome.Done() + for dom := range chromeCh { rank := dom[0] domainName := dom[1] @@ -164,28 +166,19 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error } fullURL := "https://" + domainName - // We already marked it visited in the standard pass - // but you may re-check if you prefer: - // - // added, err := visitedStore.MarkVisited(fullURL) - // if err != nil { ... } - // if !added { continue } - // 3. Chromedp fallback extraction userAgent, _ := GetUserAgent("crawler-chrome") title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent) if title == "" || desc == "" { - printWarn("Skipping (Chrome) %s: missing title/desc", fullURL) + printWarn("Skipping %s: unable to get title/desc data", fullURL) continue } - // 4. Write to file - line := fmt.Sprintf("%s|%s|%s|%s|%s\n", - fullURL, title, keywords, desc, rank) - - mu.Lock() - _, _ = file.WriteString(line) - mu.Unlock() + // 4. Directly index the doc + err := indexDocImmediately(fullURL, title, keywords, desc, rank) + if err != nil { + printErr("Index error for %s: %v", fullURL, err) + } } }() } @@ -195,7 +188,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error for _, dom := range domains { standardCh <- dom } - // close the standardCh once all are queued close(standardCh) }() @@ -208,7 +200,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error // Wait for chrome workers to finish wgChrome.Wait() - // Optionally flush the visited store once more + // Flush visitedStore if visitedStore != nil { if err := visitedStore.Flush(); err != nil { printErr("visitedStore flush error: %v", err) diff --git a/indexer.go b/indexer.go index 306c28d..73ca9e3 100644 --- a/indexer.go +++ b/indexer.go @@ -8,6 +8,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "time" "github.com/blevesearch/bleve/v2" @@ -26,22 +27,123 @@ type Document struct { var ( // Global Bleve index handle - bleveIndex bleve.Index + bleveIndex bleve.Index + docBuffer []Document + docBufferMu sync.Mutex ) -// startPeriodicIndexing refreshes the index from a file periodically -func startPeriodicIndexing(filePath string, interval time.Duration) { +// // startPeriodicIndexing refreshes the index from a file periodically +// func startPeriodicIndexing(filePath string, interval time.Duration) { +// go func() { +// for { +// printDebug("Refreshing index from %s", filePath) +// if err := IndexFile(filePath); err != nil { +// printErr("Failed to refresh index: %v", err) +// } +// time.Sleep(interval) +// } +// }() +// } + +// indexDocImmediately indexes a single document into the Bleve index. 
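+// Crawler workers call this once per page instead of appending to data_to_index.txt; rank is parsed into the
+// popularity score and the normalized link (see normalizeDomain) is used as the document ID.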
+func indexDocImmediately(link, title, tags, desc, rank string) error { + pop, _ := strconv.ParseInt(rank, 10, 64) + normalized := normalizeDomain(link) + + doc := Document{ + ID: normalized, + Link: link, + Title: title, + Tags: tags, + Description: desc, + Popularity: pop, + } + + // Insert directly into the Bleve index + err := bleveIndex.Index(doc.ID, map[string]interface{}{ + "title": doc.Title, + "description": doc.Description, + "link": doc.Link, + "tags": doc.Tags, + "popularity": doc.Popularity, + }) + if err != nil { + return fmt.Errorf("failed to index doc %s: %v", link, err) + } + return nil +} + +// StartBatchIndexing spawns a goroutine that flushes the buffer every interval. +func StartBatchIndexing() { go func() { - for { - printDebug("Refreshing index from %s", filePath) - if err := IndexFile(filePath); err != nil { - printErr("Failed to refresh index: %v", err) - } - time.Sleep(interval) + ticker := time.NewTicker(config.IndexRefreshInterval) + defer ticker.Stop() + + for range ticker.C { + flushDocBuffer() } }() } +func flushDocBuffer() { + docBufferMu.Lock() + defer docBufferMu.Unlock() + + if len(docBuffer) == 0 { + return + } + + batch := bleveIndex.NewBatch() + for _, doc := range docBuffer { + err := batch.Index(doc.ID, map[string]interface{}{ + "title": doc.Title, + "description": doc.Description, + "link": doc.Link, + "tags": doc.Tags, + "popularity": doc.Popularity, + }) + if err != nil { + printErr("batch index error for %s: %v", doc.Link, err) + } + } + // Attempt to commit the batch + if err := bleveIndex.Batch(batch); err != nil { + printErr("error committing batch: %v", err) + } + + // Clear the buffer + docBuffer = docBuffer[:0] +} + +// indexDocBatch queues a single document into memory, which gets flushed by the ticker. +func indexDocBatch(link, title, tags, desc, rank string) error { + pop, _ := strconv.ParseInt(rank, 10, 64) + normalized := normalizeDomain(link) + + doc := Document{ + ID: normalized, + Link: link, + Title: title, + Tags: tags, + Description: desc, + Popularity: pop, + } + + docBufferMu.Lock() + docBuffer = append(docBuffer, doc) + + // Optional: if we exceed config.IndexBatchSize, flush immediately + if len(docBuffer) >= config.IndexBatchSize { + go func() { + // flush in a separate goroutine to avoid blocking + flushDocBuffer() + }() + } + docBufferMu.Unlock() + + return nil +} + // InitIndex ensures that the Bleve index is created or opened. 
func InitIndex() error { idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve")) diff --git a/init.go b/init.go index 7a6dba2..666d93a 100644 --- a/init.go +++ b/init.go @@ -3,8 +3,6 @@ package main import ( "flag" "os" - "path/filepath" - "time" ) var config Config @@ -109,16 +107,17 @@ func main() { return } - webCrawlerInit() - err := InitIndex() if err != nil { printErr("Failed to initialize index:", err) } - // Start periodic indexing (every 2 minutes) - dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt") - startPeriodicIndexing(dataFilePath, 2*time.Minute) + webCrawlerInit() + + // No longer needed as crawled data are indexed imidietly + // // Start periodic indexing (every 2 minutes) + // dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt") + // startPeriodicIndexing(dataFilePath, 2*time.Minute) printInfo("Indexer is enabled.") } else { From 5ae97da6d086556a86bcaca4a36885735c9fd58b Mon Sep 17 00:00:00 2001 From: partisan <none@noone.no> Date: Sun, 5 Jan 2025 19:23:53 +0100 Subject: [PATCH 8/9] added privacy policy page and about section, improved dir check, fixed crash when idexer is disabled --- crawler.go | 5 +- indexer.go | 26 +++-- init.go | 18 ++- main.go | 45 ++++++++ static/css/style-imageviewer.css | 18 +-- static/css/style-menu.css | 104 +++++++++++++++++ static/css/style-privacy.css | 95 ++++++++++++++++ static/js/imageviewer.js | 2 +- static/js/minimenu.js | 9 ++ templates/files.html | 23 +++- templates/forums.html | 23 +++- templates/images.html | 23 +++- templates/map.html | 23 +++- templates/privacy.html | 133 ++++++++++++++++++++++ templates/search.html | 23 +++- templates/text.html | 23 +++- templates/videos.html | 23 +++- user-settings.go | 189 ++++++++++++++++++++----------- 18 files changed, 698 insertions(+), 107 deletions(-) create mode 100644 static/css/style-privacy.css create mode 100644 templates/privacy.html diff --git a/crawler.go b/crawler.go index afa7f9e..8caa073 100644 --- a/crawler.go +++ b/crawler.go @@ -14,8 +14,7 @@ var visitedStore *VisitedStore // webCrawlerInit is called during init on program start func webCrawlerInit() { - // Initialize the store with, say, batchSize=50 - store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50) + store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), config.IndexBatchSize) if err != nil { printErr("Failed to initialize visited store: %v", err) } @@ -170,7 +169,7 @@ func crawlDomainsToFile(domains [][2]string, maxPages int) error { userAgent, _ := GetUserAgent("crawler-chrome") title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent) if title == "" || desc == "" { - printWarn("Skipping %s: unable to get title/desc data", fullURL) + printDebug("Skipping %s: unable to get title/desc data", fullURL) // Here is print for all domains that fail to be crawled continue } diff --git a/indexer.go b/indexer.go index 73ca9e3..c8cf6fe 100644 --- a/indexer.go +++ b/indexer.go @@ -9,7 +9,6 @@ import ( "strconv" "strings" "sync" - "time" "github.com/blevesearch/bleve/v2" "golang.org/x/net/publicsuffix" @@ -73,17 +72,17 @@ func indexDocImmediately(link, title, tags, desc, rank string) error { return nil } -// StartBatchIndexing spawns a goroutine that flushes the buffer every interval. 
-func StartBatchIndexing() { - go func() { - ticker := time.NewTicker(config.IndexRefreshInterval) - defer ticker.Stop() +// // StartBatchIndexing spawns a goroutine that flushes the buffer every interval. +// func StartBatchIndexing() { +// go func() { +// ticker := time.NewTicker(config.IndexRefreshInterval) +// defer ticker.Stop() - for range ticker.C { - flushDocBuffer() - } - }() -} +// for range ticker.C { +// flushDocBuffer() +// } +// }() +// } func flushDocBuffer() { docBufferMu.Lock() @@ -264,6 +263,11 @@ func IndexFile(filePath string) error { // SearchIndex performs a full-text search on the indexed data. func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) { + // Check if the indexer is enabled + if !config.IndexerEnabled { + return nil, fmt.Errorf("indexer is disabled") + } + exactMatch := bleve.NewMatchQuery(queryStr) // Exact match fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match fuzzyMatch.Fuzziness = 2 diff --git a/init.go b/init.go index 666d93a..bf0d220 100644 --- a/init.go +++ b/init.go @@ -3,6 +3,7 @@ package main import ( "flag" "os" + "path/filepath" ) var config Config @@ -77,9 +78,18 @@ func main() { // Check if the cache directory exists when caching is enabled if config.DriveCacheEnabled { - if _, err := os.Stat(config.DriveCache.Path); os.IsNotExist(err) { - printErr("Error: Drive cache is enabled, but cache directory '%s' does not exist.\n", config.DriveCache.Path) - os.Exit(1) // Exit with a non-zero status to indicate an error + cacheDir := config.DriveCache.Path + imagesDir := filepath.Join(cacheDir, "images") + + // Check if the directory already exists + if _, err := os.Stat(imagesDir); os.IsNotExist(err) { + // Try to create the directory since it doesn't exist + if err := os.MkdirAll(imagesDir, os.ModePerm); err != nil { + printErr("Error: Failed to create cache or images directory '%s': %v", imagesDir, err) + os.Exit(1) // Exit with a non-zero status to indicate an error + } + // Print a warning if the directory had to be created + printWarn("Warning: Created missing directory '%s'.", imagesDir) } } @@ -109,7 +119,7 @@ func main() { err := InitIndex() if err != nil { - printErr("Failed to initialize index:", err) + printErr("Failed to initialize index: %v", err) } webCrawlerInit() diff --git a/main.go b/main.go index cc6b8c3..12c2381 100755 --- a/main.go +++ b/main.go @@ -221,6 +221,7 @@ func runServer() { http.HandleFunc("/save-settings", handleSaveSettings) http.HandleFunc("/image/", handleImageServe) http.HandleFunc("/image_status", handleImageStatus) + http.HandleFunc("/privacy", handlePrivacyPage) http.HandleFunc("/opensearch.xml", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/opensearchdescription+xml") http.ServeFile(w, r, "static/opensearch.xml") @@ -235,6 +236,7 @@ func runServer() { http.HandleFunc("/save-settings", handleWebsiteDisabled) http.HandleFunc("/image/", handleWebsiteDisabled) http.HandleFunc("/image_status", handleWebsiteDisabled) + http.HandleFunc("/privacy", handleWebsiteDisabled) http.HandleFunc("/opensearch.xml", handleWebsiteDisabled) printInfo("Website functionality disabled.") } @@ -252,3 +254,46 @@ func handleWebsiteDisabled(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("The website functionality is currently disabled.")) } + +func handlePrivacyPage(w http.ResponseWriter, r *http.Request) { + settings := loadUserSettings(w, r) + iconPathSVG, iconPathPNG := GetIconPath() + + // Define 
the data structure for the template + data := struct { + Theme string + IconPathSVG string + IconPathPNG string + IsThemeDark bool + CookieRows []CookieRow + CurrentLang string + Safe string + LanguageOptions []LanguageOption + }{ + Theme: settings.Theme, + IconPathSVG: iconPathSVG, + IconPathPNG: iconPathPNG, + IsThemeDark: settings.IsThemeDark, + CookieRows: generateCookieTable(r), + CurrentLang: settings.SiteLanguage, + Safe: settings.SafeSearch, + LanguageOptions: languageOptions, + } + + // Parse the template + tmpl, err := template.New("privacy.html").ParseFiles("templates/privacy.html") + if err != nil { + log.Printf("Error parsing template: %v", err) + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + return + } + + // Set the response content type + w.Header().Set("Content-Type", "text/html; charset=utf-8") + + // Execute the template + if err := tmpl.Execute(w, data); err != nil { + log.Printf("Error executing template: %v", err) + http.Error(w, "Internal Server Error", http.StatusInternalServerError) + } +} diff --git a/static/css/style-imageviewer.css b/static/css/style-imageviewer.css index 4c0696f..ac6874a 100644 --- a/static/css/style-imageviewer.css +++ b/static/css/style-imageviewer.css @@ -60,13 +60,6 @@ gap: 5px; /* Add spacing between buttons */ } -.image-view-close .btn-nostyle { - background-color: inherit; - border: none; - padding: 0px; - cursor: pointer; -} - #viewer-close-button, #viewer-prev-button, #viewer-next-button { @@ -128,6 +121,7 @@ .full-size:hover, .proxy-size:hover { + transition: all 0.3s ease; text-decoration: underline; } @@ -136,15 +130,6 @@ visibility: visible; } -/* Button No Style */ -.btn-nostyle { - background-color: inherit; - border: none; - padding: 0px; - width: fit-content; - cursor: pointer; -} - /* Image Navigation Icons */ .image-close, .image-next, @@ -163,6 +148,7 @@ .image-close:hover, .image-next:hover, .image-before:hover { + transition: all 0.3s ease; background-color: var(--image-select); } diff --git a/static/css/style-menu.css b/static/css/style-menu.css index d85810b..95be6cf 100644 --- a/static/css/style-menu.css +++ b/static/css/style-menu.css @@ -1,3 +1,5 @@ +/* ------------------ Mini-Menu Styles ------------------ */ + .settings-search-div-search { right: 20px; top: 25px; @@ -140,4 +142,106 @@ margin-right: 0; border-radius: 0; } +} + +/* ------------------ About QGato Modal Styles ------------------ */ + +#aboutQGatoModal { + display: none; + position: fixed; + /* Center modal */ + top: 50%; + left: 50%; + transform: translate(-50%, -50%); + + /* Keep it on top */ + z-index: 999; + + /* Match mini-menu background style */ + background-color: var(--html-bg); + box-shadow: 0 1px 3px rgba(0, 0, 0, 0.12), 0 1px 2px rgba(0, 0, 0, 0.24); + border: 1px solid var(--border); + border-radius: 12px; + + /* Spacing & sizing */ + padding: 32px; + max-width: 600px; /* Increased width */ + max-height: 80vh; /* Optional: restrict height to 80% of viewport */ + overflow-y: auto; /* Enable scrolling if content exceeds height */ + color: var(--font-fg); +} + +#aboutQGatoModal #close-button { + position: absolute; + top: 12px; + right: 12px; /* Moved close button to top-right */ +} + +#aboutQGatoModal .modal-content { + text-align: center; + margin-top: 20px; /* Adjusted spacing */ +} + +/* Logo */ +#aboutQGatoModal .modal-content img { + width: 100px; /* Increased logo size */ + margin-bottom: 16px; +} + +/* Headings, paragraphs, etc. 
*/ +#aboutQGatoModal .modal-content h2 { + font-size: 2rem; /* Larger heading */ + margin: 8px 0; +} + +#aboutQGatoModal .modal-content p { + font-size: 1.1rem; /* Larger paragraph text */ + margin: 12px 0; +} + +/* Container for the Source Code / Privacy Policy buttons */ +#aboutQGatoModal .button-container { + margin-top: 16px; + display: flex; + justify-content: center; + gap: 16px; +} + +/* Match mini-menu button style as closely as possible */ +#aboutQGatoModal .button-container button { + background-color: var(--button); + color: var(--font-fg); + border: 1px solid var(--border); + border-radius: 6px; + padding: 12px 16px; /* Larger button padding */ + font-size: 1rem; /* Larger button text */ + cursor: pointer; + transition: border 0.3s ease, background-color 0.3s ease, color 0.3s ease; +} + +#aboutQGatoModal .button-container button:hover { + border: 1px solid var(--font-fg); +} + +/* Close Button Style */ +.cloase-btn { + font-size: 1.5rem; /* Larger close button */ + color: var(--search-button); + border-radius: 50%; + padding: 8px; +} + +.cloase-btn:hover { + transition: all 0.3s ease; + background-color: var(--image-select); +} + +/* ------------------ Common Button No Style ------------------ */ + +.btn-nostyle { + background-color: inherit; + border: none; + padding: 0px; + width: fit-content; + cursor: pointer; } \ No newline at end of file diff --git a/static/css/style-privacy.css b/static/css/style-privacy.css new file mode 100644 index 0000000..5cfef4b --- /dev/null +++ b/static/css/style-privacy.css @@ -0,0 +1,95 @@ +/* Main content wrapper */ +.privacy-content-wrapper { + max-width: 800px; + margin: 80px auto 40px auto; + padding: 0 20px; +} + +/* Header section */ +.privacy-header { + text-align: center; + margin-bottom: 30px; +} + +.privacy-header h1 { + font-size: 2rem; + margin: 0; + color: var(--font-fg); +} + +.privacy-header p { + color: var(--fg); + margin-top: 10px; + font-size: 1.1rem; +} + +/* Section headings */ +.privacy-section h2 { + font-size: 1.5rem; + margin-bottom: 8px; + color: var(--font-fg); + border-bottom: 1px solid var(--border); + padding-bottom: 4px; +} + +/* Section text */ +.privacy-section p { + font-size: 1rem; + line-height: 1.6; + margin-bottom: 20px; + color: var(--fg); +} + +/* Footer */ +.privacy-footer { + text-align: center; + padding: 10px 0; + border-top: 1px solid var(--border); + color: var(--fg); + background-color: var(--html-bg); +} + +/* Links */ +.privacy-section a { + color: var(--link); + text-decoration: none; +} + +.privacy-section a:hover { + text-decoration: underline; +} + +/* Table styling */ +.cookie-table { + width: 100%; + margin: 20px auto; + border-collapse: collapse; + text-align: left; + font-size: 1rem; + color: var(--fg); + background-color: var(--html-bg); + border: 1px solid var(--border); +} + +.cookie-table th, +.cookie-table td { + padding: 12px 15px; + border: 1px solid var(--border); +} + +.cookie-table th { + background-color: var(--search-bg); + color: var(--font-fg); + text-align: center; + font-weight: bold; +} + +.cookie-table tr:nth-child(even) { + background-color: var(--snip-background); +} + +/* Center the table within its section */ +.privacy-section .cookie-table { + margin-left: auto; + margin-right: auto; +} diff --git a/static/js/imageviewer.js b/static/js/imageviewer.js index a68f0e2..4bd667f 100644 --- a/static/js/imageviewer.js +++ b/static/js/imageviewer.js @@ -13,7 +13,7 @@ document.addEventListener('DOMContentLoaded', function() { // Set the innerHTML of viewerOverlay 
viewerOverlay.innerHTML = ` <div id="image-viewer" class="image_view image_hide"> - <div class="image-view-close"> + <div class="btn-nostyle"> <button class="btn-nostyle" id="viewer-prev-button"> <div class="material-icons-round icon_visibility clickable image-before"></div> <!-- navigate_before --> </button> diff --git a/static/js/minimenu.js b/static/js/minimenu.js index c1c8a39..4044edb 100644 --- a/static/js/minimenu.js +++ b/static/js/minimenu.js @@ -44,4 +44,13 @@ document.addEventListener('DOMContentLoaded', function () { document.getElementById('languageSelect').addEventListener('change', function () { updateSettings('lang', this.value); }); + + // Show/Hide About QGato + document.getElementById('aboutQGatoBtn').addEventListener('click', function() { + document.getElementById('aboutQGatoModal').style.display = 'block'; + }); + + document.getElementById('close-button').addEventListener('click', function() { + document.getElementById('aboutQGatoModal').style.display = 'none'; + }); }); \ No newline at end of file diff --git a/templates/files.html b/templates/files.html index 0d9c7c4..a47bf4e 100755 --- a/templates/files.html +++ b/templates/files.html @@ -43,7 +43,7 @@ <option value="{{.Code}}" {{if eq .Code $.CurrentLang}}selected{{end}}>{{.Name}}</option> {{end}} </select> - <!-- <button id="settingsButton" onclick="window.location.href='/about'">About QGato</button> --> + <button id="aboutQGatoBtn">About QGato</button> </div> </div> </div> @@ -53,6 +53,27 @@ </div> </noscript> + <!-- Popup Modal for QGato --> + <div id="aboutQGatoModal"> + <!-- Close Button --> + <button class="btn-nostyle" id="close-button"> + <div class="material-icons-round icon_visibility clickable cloase-btn"></div> + </button> + + <div class="modal-content"> + <img + src="/static/images/icon.svg" + alt="QGato" + > + <h2>QGato</h2> + <p>A open-source private search engine.</p> + <div class="button-container"> + <button onclick="window.location.href='https://weforge.xyz/Spitfire/Search'">Source Code</button> + <button onclick="window.location.href='/privacy'">Privacy policy</button> + </div> + </div> + </div> + <form action="/search" id="prev-next-form" class="results-search-container" method="GET" autocomplete="off"> <h1 class="logomobile"> <div class="logo-container" herf="/"> diff --git a/templates/forums.html b/templates/forums.html index f5d91f8..7b9d6dd 100755 --- a/templates/forums.html +++ b/templates/forums.html @@ -43,7 +43,7 @@ <option value="{{.Code}}" {{if eq .Code $.CurrentLang}}selected{{end}}>{{.Name}}</option> {{end}} </select> - <!-- <button id="settingsButton" onclick="window.location.href='/about'">About QGato</button> --> + <button id="aboutQGatoBtn">About QGato</button> </div> </div> </div> @@ -53,6 +53,27 @@ </div> </noscript> + <!-- Popup Modal for QGato --> + <div id="aboutQGatoModal"> + <!-- Close Button --> + <button class="btn-nostyle" id="close-button"> + <div class="material-icons-round icon_visibility clickable cloase-btn"></div> + </button> + + <div class="modal-content"> + <img + src="/static/images/icon.svg" + alt="QGato" + > + <h2>QGato</h2> + <p>A open-source private search engine.</p> + <div class="button-container"> + <button onclick="window.location.href='https://weforge.xyz/Spitfire/Search'">Source Code</button> + <button onclick="window.location.href='/privacy'">Privacy policy</button> + </div> + </div> + </div> + <form action="/search" id="prev-next-form" class="results-search-container" method="GET" autocomplete="off"> <h1 class="logomobile"> <div 
class="logo-container" herf="/"> diff --git a/templates/images.html b/templates/images.html index cfdcdea..1bb91b7 100755 --- a/templates/images.html +++ b/templates/images.html @@ -52,7 +52,7 @@ <option value="{{.Code}}" {{if eq .Code $.CurrentLang}}selected{{end}}>{{.Name}}</option> {{end}} </select> - <!-- <button id="settingsButton" onclick="window.location.href='/about'">About QGato</button> --> + <button id="aboutQGatoBtn">About QGato</button> </div> </div> </div> @@ -61,6 +61,27 @@ <a href="/settings" class="material-icons-round clickable settings-icon-link settings-icon-link-search"></a> </div> </noscript> + + <!-- Popup Modal for QGato --> + <div id="aboutQGatoModal"> + <!-- Close Button --> + <button class="btn-nostyle" id="close-button"> + <div class="material-icons-round icon_visibility clickable cloase-btn"></div> + </button> + + <div class="modal-content"> + <img + src="/static/images/icon.svg" + alt="QGato" + > + <h2>QGato</h2> + <p>A open-source private search engine.</p> + <div class="button-container"> + <button onclick="window.location.href='https://weforge.xyz/Spitfire/Search'">Source Code</button> + <button onclick="window.location.href='/privacy'">Privacy policy</button> + </div> + </div> + </div> <form action="/search" id="prev-next-form" class="results-search-container" method="GET" autocomplete="off"> <h1 class="logomobile"> diff --git a/templates/map.html b/templates/map.html index b75f915..054f910 100644 --- a/templates/map.html +++ b/templates/map.html @@ -58,7 +58,7 @@ <option value="{{.Code}}" {{if eq .Code $.CurrentLang}}selected{{end}}>{{.Name}}</option> {{end}} </select> - <!-- <button id="settingsButton" onclick="window.location.href='/about'">About QGato</button> --> + <button id="aboutQGatoBtn">About QGato</button> </div> </div> </div> @@ -68,6 +68,27 @@ </div> </noscript> + <!-- Popup Modal for QGato --> + <div id="aboutQGatoModal"> + <!-- Close Button --> + <button class="btn-nostyle" id="close-button"> + <div class="material-icons-round icon_visibility clickable cloase-btn"></div> + </button> + + <div class="modal-content"> + <img + src="/static/images/icon.svg" + alt="QGato" + > + <h2>QGato</h2> + <p>A open-source private search engine.</p> + <div class="button-container"> + <button onclick="window.location.href='https://weforge.xyz/Spitfire/Search'">Source Code</button> + <button onclick="window.location.href='/privacy'">Privacy policy</button> + </div> + </div> + </div> + <form action="/search" id="prev-next-form" class="results-search-container" method="GET" autocomplete="off"> <h1 class="logomobile"> <div class="logo-container" herf="/"> diff --git a/templates/privacy.html b/templates/privacy.html new file mode 100644 index 0000000..ca55401 --- /dev/null +++ b/templates/privacy.html @@ -0,0 +1,133 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title>Privacy Policy + + + + + + + + + + + + + + + +
[Body of templates/privacy.html — the HTML markup was stripped during extraction; only the text content below is recoverable. The surviving fragments show the standard settings menu ("Settings", "Current theme: {{.Theme}}", "Dark Theme" / "Light Theme") followed by the policy sections:]

+    Privacy Policy
+    Your privacy is important to us. This page outlines our practices.
+
+    Introduction
+    This website is a Free and Open Source Software (FOSS) project licensed under the AGPL-3.0 license. The project is committed to providing a private and secure experience for all users.
+
+    Data Collection
+    Our servers do not collect any user data, including IP addresses, browsing history, or any other identifiable information. We respect your privacy and ensure that no user information is logged or stored on our servers.
+
+    Cookies Used
+    Our cookies are not used to track users or sell user data; they are only used to save your settings.
+    The following cookies are used by this site:

[A cookie table rendered with {{ range .CookieRows }} follows, listing each cookie's Name, Value, Description, and Expiration.]
[Closing markup of templates/privacy.html lost in extraction.]

[The hunks for templates/search.html, templates/text.html, and templates/videos.html were also stripped of their HTML. Per the diffstat and the surviving hunk headers, each of these templates receives the same change already shown above for files.html, forums.html, images.html, and map.html: the commented-out settings button is replaced with the "About QGato" button, and the "About QGato" popup modal linking to the source code and the /privacy page is added.]
diff --git a/user-settings.go b/user-settings.go index a18478d..a872f11 100755 --- a/user-settings.go +++ b/user-settings.go @@ -18,44 +18,44 @@ func loadUserSettings(w http.ResponseWriter, r *http.Request) UserSettings { var settings UserSettings saveRequired := false - // Load theme - if cookie, err := r.Cookie("theme"); err == nil { - settings.Theme = cookie.Value - } else { - settings.Theme = "dark" - saveRequired = true - } - - // Determine if the selected theme is dark - settings.IsThemeDark = settings.Theme == "dark" || settings.Theme == "night" || settings.Theme == "black" || settings.Theme == "latte" - - // Load site language - if cookie, err := r.Cookie("site_language"); err == nil { - settings.SiteLanguage = cookie.Value - } else { - // If no site language is set, use Accept-Language or default to "en" - acceptLang := r.Header.Get("Accept-Language") - if acceptLang != "" { - settings.SiteLanguage = normalizeLangCode(strings.Split(acceptLang, ",")[0]) + for _, cd := range AllCookies { + // Attempt to read the cookie + if cookie, err := r.Cookie(cd.Name); err == nil { + // Use SetValue to update the correct UserSettings field + cd.SetValue(&settings, cookie.Value) } else { - settings.SiteLanguage = "en" // Default language + // If cookie is missing and you want a default value, set it here + switch cd.Name { + case "theme": + // Default theme to "dark" if missing + cd.SetValue(&settings, "dark") + saveRequired = true + case "site_language": + // Fallback to Accept-Language or "en" + acceptLang := r.Header.Get("Accept-Language") + if acceptLang != "" { + cd.SetValue(&settings, normalizeLangCode(acceptLang)) + } else { + cd.SetValue(&settings, "en") + } + saveRequired = true + case "safe": + // Default safe to "" + cd.SetValue(&settings, "") + saveRequired = true + // etc. 
for other cookies if needed + } } - saveRequired = true } - // Load search language (can be empty) - if cookie, err := r.Cookie("search_language"); err == nil { - settings.SearchLanguage = cookie.Value - } - - // Load safe search - if cookie, err := r.Cookie("safe"); err == nil { - settings.SafeSearch = cookie.Value - } else { - settings.SafeSearch = "" - saveRequired = true - } + // If theme was set, update IsThemeDark just to be sure + // Alternatively do it inside SetValue for "theme" + settings.IsThemeDark = settings.Theme == "dark" || + settings.Theme == "night" || + settings.Theme == "black" || + settings.Theme == "latte" + // Save any new default cookies that might have been triggered if saveRequired { saveUserSettings(w, settings) } @@ -66,38 +66,16 @@ func loadUserSettings(w http.ResponseWriter, r *http.Request) UserSettings { func saveUserSettings(w http.ResponseWriter, settings UserSettings) { expiration := time.Now().Add(90 * 24 * time.Hour) - http.SetCookie(w, &http.Cookie{ - Name: "theme", - Value: settings.Theme, - Path: "/", - Expires: expiration, - Secure: true, - SameSite: http.SameSiteStrictMode, - }) - http.SetCookie(w, &http.Cookie{ - Name: "site_language", - Value: settings.SiteLanguage, - Path: "/", - Expires: expiration, - Secure: true, - SameSite: http.SameSiteStrictMode, - }) - http.SetCookie(w, &http.Cookie{ - Name: "search_language", - Value: settings.SearchLanguage, - Path: "/", - Expires: expiration, - Secure: true, - SameSite: http.SameSiteStrictMode, - }) - http.SetCookie(w, &http.Cookie{ - Name: "safe", - Value: settings.SafeSearch, - Path: "/", - Expires: expiration, - Secure: true, - SameSite: http.SameSiteStrictMode, - }) + for _, cd := range AllCookies { + http.SetCookie(w, &http.Cookie{ + Name: cd.Name, + Value: cd.GetValue(settings), + Path: "/", + Expires: expiration, + Secure: true, + SameSite: http.SameSiteStrictMode, + }) + } printDebug("settings saved: %v", settings) } @@ -193,3 +171,84 @@ func isValidLangCode(lang string) bool { } return false } + +// CookieDefinition describes how a single cookie is handled +type CookieDefinition struct { + Name string + // GetValue extracts the corresponding field from UserSettings + GetValue func(UserSettings) string + // SetValue updates the corresponding field in UserSettings + SetValue func(*UserSettings, string) + // Description used in privacy table or docs + Description string +} + +// AllCookies defines every cookie we handle in a single slice. +// Add or remove entries here, and the rest updates automatically. 
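+// loadUserSettings, saveUserSettings, and generateCookieTable all range over this slice, so adding a new
+// cookie only requires an entry here.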
+var AllCookies = []CookieDefinition{ + { + Name: "theme", + Description: "Stores the selected theme (dark, light, etc.)", + GetValue: func(s UserSettings) string { + return s.Theme + }, + SetValue: func(s *UserSettings, val string) { + s.Theme = val + s.IsThemeDark = (val == "dark" || val == "night" || val == "black" || val == "latte") + }, + }, + { + Name: "site_language", + Description: "Stores the preferred site language.", + GetValue: func(s UserSettings) string { + return s.SiteLanguage + }, + SetValue: func(s *UserSettings, val string) { + s.SiteLanguage = val + }, + }, + { + Name: "search_language", + Description: "Stores the preferred language for search results.", + GetValue: func(s UserSettings) string { + return s.SearchLanguage + }, + SetValue: func(s *UserSettings, val string) { + s.SearchLanguage = val + }, + }, + { + Name: "safe", + Description: "Stores the Safe Search setting.", + GetValue: func(s UserSettings) string { + return s.SafeSearch + }, + SetValue: func(s *UserSettings, val string) { + s.SafeSearch = val + }, + }, +} + +type CookieRow struct { + Name string + Value string + Description string + Expiration string +} + +func generateCookieTable(r *http.Request) []CookieRow { + var rows []CookieRow + for _, cd := range AllCookies { + value := "[Not Set]" + if cookie, err := r.Cookie(cd.Name); err == nil { + value = cookie.Value + } + rows = append(rows, CookieRow{ + Name: cd.Name, + Value: value, + Description: cd.Description, + Expiration: "90 days", + }) + } + return rows +} From 87000358933f8f91f6c3109785fbab0485a19d41 Mon Sep 17 00:00:00 2001 From: partisan Date: Sun, 5 Jan 2025 20:27:13 +0100 Subject: [PATCH 9/9] fixed 'no more results' text --- files.go | 2 +- forums.go | 3 ++- static/css/style.css | 7 +++++-- templates/files.html | 8 +++++--- templates/forums.html | 7 +++++-- templates/images.html | 7 +++++-- templates/text.html | 7 +++++-- templates/videos.html | 11 ++++++++--- video.go | 1 + 9 files changed, 37 insertions(+), 16 deletions(-) diff --git a/files.go b/files.go index 1755143..d0c1ff1 100755 --- a/files.go +++ b/files.go @@ -56,7 +56,7 @@ func handleFileSearch(w http.ResponseWriter, settings UserSettings, query string "Category": "all", "Sort": "seed", "Page": page, - "HasPrevPage": page > 1, + "HasPrevPage": page >= 1, "HasNextPage": len(combinedResults) > 0, "LanguageOptions": languageOptions, "CurrentLang": settings.SearchLanguage, diff --git a/forums.go b/forums.go index 973c070..bd57e55 100755 --- a/forums.go +++ b/forums.go @@ -118,7 +118,8 @@ func handleForumsSearch(w http.ResponseWriter, settings UserSettings, query stri "Page": page, "Fetched": fmt.Sprintf("%.2f %s", elapsedTime.Seconds(), Translate("seconds")), // Time for fetching results "HasPrevPage": page > 1, - "HasNextPage": len(results) == 25, // Assuming 25 results per page + "HasNextPage": len(results) >= 25, + "NoResults": len(results) == 0, "LanguageOptions": languageOptions, "CurrentLang": settings.SearchLanguage, "Theme": settings.Theme, diff --git a/static/css/style.css b/static/css/style.css index 32ad383..e4b1cd6 100644 --- a/static/css/style.css +++ b/static/css/style.css @@ -1158,17 +1158,20 @@ p { border-radius: 8px; position: relative; border: 1px solid var(--snip-border); - margin-left: 175px; color: var(--fg); width: 530px; padding: 15px; margin-bottom: 627px; - margin-top: -20px; + margin-top: 20px; font-size: 14px; line-height: 1.58; letter-spacing: normal; } +.no-results-found-offset { + margin-left: 175px; +} + /* MAP */ .message { diff --git 
a/templates/files.html b/templates/files.html index a47bf4e..ff35355 100755 --- a/templates/files.html +++ b/templates/files.html @@ -173,12 +173,14 @@ {{ end }}
- {{ else }} -
+ {{else if .NoResults}} +
{{ translate "no_results_found" .Query }}
{{ translate "suggest_rephrase" }}
- {{ end }} + {{else}} +
{{ translate "no_more_results" }}
+ {{end}}