diff --git a/README.md b/README.md
index 23e8bf5..5ad3337 100644
--- a/README.md
+++ b/README.md
@@ -7,30 +7,30 @@

-A self-hosted private metasearch engine that aims to be more resource-efficient than its competition.
+A self-hosted private search engine designed to be scalable and more resource-efficient than its competitors.

 # Bare in mind that this project is still WIP
 
-## Comparison to other search engines
+## Comparison to other open-source search engines
 
-| Feature                    | Whoogle [1] | Araa-Search | LibreY | 4get | SearchXNG | *QGato* |
-| :------------------------- | ------------------ | ------------------------- | ------------------------ | ------------------------ | ------------------------- | ---------------------------------------------------- |
-| Works without JavaScript   | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| Music search               | ❓ | ❌ | ❌ | ✅ | ✅ | ✅ |
-| Torrent search             | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ |
-| API                        | ❌ | ❓ [2] | ✅ | ✅ | ✅ | ✅ |
-| Scalable                   | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ |
-| Not Resource Hungry        | ❓ Moderate | ❌ Very resource hungry | ❌ Moderate 200-400mb~ | ❌ Moderate 200-400mb~ | ❌ Moderate 200-300MiB~ | ✅ about 15-20MiB at idle, 17-22MiB when searching |
-| Result caching             | ❌ | ❌ | ❓ | ❓ | ❓ | ✅ |
-| Dynamic Page Loading       | ❓ Not specified | ❌ | ❌ | ❌ | ✅ | ✅ |
-| User themable              | ❌ | ✅ | ❌ | ❌ | ✅[3] | ✅ |
-| Unusual logo choice        | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Feature                    | Whoogle [1] | Araa-Search | LibreY | 4get | SearchXNG | *QGato* |
+| :------------------------- | ------------- | ------------------------- | ------------------------ | ------------------------ | ------------------------- | ---------------------------------------- |
+| Works without JavaScript   | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| Music search               | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ |
+| Torrent search             | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ |
+| API                        | ❌ | ❌ [2] | ✅ | ✅ | ✅ | ✅ |
+| Scalable                   | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ |
+| Not Resource Hungry        | ❓ Moderate | ❌ Very resource hungry | ❌ Moderate 200-400mb~ | ❌ Moderate 200-400mb~ | ❌ Moderate 200-300MiB~ | ✅ about 15-30MiB even when searching |
+| Result caching             | ❓ | ❓ | ❓ | ❓ | ❓ | ✅ |
+| Dynamic Page Loading       | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ |
+| User themable              | ❌ | ✅ | ❌ | ❌ | ❓[3] | ✅ |
+| Unusual logo choice        | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
 
 [1]: I was not able to check this since their site does not work, same for the community instances.
 
-[2]: In the project repo they specify that it has API, but It looks like they are no loger supporting it. Or just removed "API" button and documentation, since I was not able to find it anymore.
+[2]: In the project repo they specify that it has an API, but it looks like they are no longer supporting it, or they just removed the "API" button and documentation, since I was not able to find it anymore.
 
 [3]: It is called 'User Themable' because you want to give the user freedom of choice for their theme, not by hard-setting one theme in the backend and calling it themable.
 
@@ -48,7 +48,7 @@ A self-hosted private
 ### For Self-Hosting
 
 - **Self-hosted option** - Run on your own server for even more privacy.
-- **Lightweight** - Low memory footprint (15-22MiB) even during searches.
+- **Lightweight** - Low memory footprint (15-30MiB) even during searches.
 - **Decentralized** - No single point of failure.
 - **Results caching in RAM** - Faster response times through caching.
 - **Configurable** - Tweak features via `config.ini`.
@@ -67,7 +67,7 @@ A self-hosted private
 ### Prerequisites
 
-- Go (version 1.18 or higher recommended)
+- Go (version 1.23 or higher recommended)
 - Git (unexpected)
 - Access to the internet for fetching results (even more unexpected)
 
diff --git a/crawler-visited.go b/crawler-visited.go
new file mode 100644
index 0000000..bfa1af9
--- /dev/null
+++ b/crawler-visited.go
@@ -0,0 +1,106 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"sync"
+)
+
+// VisitedStore handles deduplicating visited URLs with a map and a periodic flush to disk.
+type VisitedStore struct {
+	mu      sync.Mutex
+	visited map[string]bool
+	toFlush []string
+
+	filePath  string
+	batchSize int // how many new URLs we batch before flushing
+}
+
+// NewVisitedStore creates or loads the visited URLs from filePath.
+func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
+	store := &VisitedStore{
+		visited:   make(map[string]bool),
+		filePath:  filePath,
+		batchSize: batchSize,
+	}
+
+	// Attempt to load existing visited URLs (if file exists).
+	if _, err := os.Stat(filePath); err == nil {
+		if err := store.loadFromFile(); err != nil {
+			return nil, fmt.Errorf("loadFromFile error: %w", err)
+		}
+	}
+	return store, nil
+}
+
+// loadFromFile loads visited URLs from the store’s file. One URL per line.
+func (s *VisitedStore) loadFromFile() error {
+	f, err := os.Open(s.filePath)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		url := scanner.Text()
+		s.visited[url] = true
+	}
+	return scanner.Err()
+}
+
+// AlreadyVisited returns true if the URL is in the store.
+func (s *VisitedStore) AlreadyVisited(url string) bool {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.visited[url]
+}
+
+// MarkVisited adds the URL to the store if not already present, and triggers a flush if batchSize is reached.
+func (s *VisitedStore) MarkVisited(url string) (added bool, err error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.visited[url] {
+		return false, nil
+	}
+	// Mark in memory
+	s.visited[url] = true
+	s.toFlush = append(s.toFlush, url)
+
+	// Flush if we have enough new URLs
+	if len(s.toFlush) >= s.batchSize {
+		if err := s.flushToFileUnlocked(); err != nil {
+			return false, err
+		}
+	}
+	return true, nil
+}
+
+// Flush everything in s.toFlush to file, then clear the buffer.
+func (s *VisitedStore) Flush() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.flushToFileUnlocked()
+}
+
+// flushToFileUnlocked writes s.toFlush lines to the store file, then clears s.toFlush.
+func (s *VisitedStore) flushToFileUnlocked() error {
+	if len(s.toFlush) == 0 {
+		return nil
+	}
+	f, err := os.OpenFile(s.filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	for _, url := range s.toFlush {
+		if _, err := fmt.Fprintln(f, url); err != nil {
+			return err
+		}
+	}
+	s.toFlush = nil
+	return nil
+}
diff --git a/crawler.go b/crawler.go
index 45dc76f..3ddc36b 100644
--- a/crawler.go
+++ b/crawler.go
@@ -10,13 +10,24 @@ import (
 	"time"
 )
 
+// Create a global or config-level visited store
+var visitedStore *VisitedStore
+
 // webCrawlerInit is called during init on program start
 func webCrawlerInit() {
+	// Initialize the store with, say, batchSize=50
+	store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50)
+	if err != nil {
+		printErr("Failed to initialize visited store: %v", err)
+	}
+	visitedStore = store
+
+	// Start the periodic crawler
 	go func() {
 		// First run immediately
 		runCrawlerAndIndexer()
 
-		// Then run periodically based on CrawlingInterval
+		// Then run periodically
 		ticker := time.NewTicker(config.CrawlingInterval)
 		for range ticker.C {
 			runCrawlerAndIndexer()
@@ -79,25 +90,8 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
 func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-	existingEntries := make(map[string]bool)
-	var mu sync.Mutex // For existingEntries + file writes
-
-	// read existing entries from outFile if it exists
-	if _, err := os.Stat(outFile); err == nil {
-		file, err := os.Open(outFile)
-		if err != nil {
-			return fmt.Errorf("unable to open %s: %v", outFile, err)
-		}
-		defer file.Close()
-		scanner := bufio.NewScanner(file)
-		for scanner.Scan() {
-			line := scanner.Text()
-			parts := strings.SplitN(line, "|", 5)
-			if len(parts) >= 1 {
-				existingEntries[parts[0]] = true
-			}
-		}
-	}
+	var mu sync.Mutex
 
 	// Open file for writing (truncate if existing)
 	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
@@ -119,33 +113,38 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			for dom := range standardCh {
 				rank := dom[0]
 				domainName := dom[1]
-				fullURL := "https://" + domainName
-
-				// Mark domain existing so we don't re-crawl duplicates
-				mu.Lock()
-				if domainName == "" || existingEntries[fullURL] {
-					mu.Unlock()
+				if domainName == "" {
 					continue
 				}
-				existingEntries[fullURL] = true
-				mu.Unlock()
+				fullURL := "https://" + domainName
 
-				// get a standard user agent
+				// 1. Check if we've already visited this URL
+				added, err := visitedStore.MarkVisited(fullURL)
+				if err != nil {
+					printErr("MarkVisited error for %s: %v", fullURL, err)
+					continue
+				}
+				if !added {
+					// Already visited
+					continue
+				}
+
+				// 2. Standard extraction
 				userAgent, _ := GetUserAgent("crawler-std")
 				title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
 
+				// If missing, push to Chrome queue
 				if title == "" || desc == "" {
-					// push to chromeCh
 					chromeCh <- dom
 					continue
 				}
 
-				// write to file
+				// 3. Write to file
 				line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, title, keywords, desc, rank)
 				mu.Lock()
-				file.WriteString(line)
+				_, _ = file.WriteString(line)
 				mu.Unlock()
 			}
 		}()
@@ -160,32 +159,32 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			for dom := range chromeCh {
 				rank := dom[0]
 				domainName := dom[1]
-				fullURL := "https://" + domainName
-
-				// Mark domain existing if not already
-				mu.Lock()
-				if domainName == "" || existingEntries[fullURL] {
-					mu.Unlock()
+				if domainName == "" {
 					continue
 				}
-				existingEntries[fullURL] = true
-				mu.Unlock()
+				fullURL := "https://" + domainName
 
-				// get a chrome user agent
+				// We already marked it visited in the standard pass
+				// but you may re-check if you prefer:
+				//
+				// added, err := visitedStore.MarkVisited(fullURL)
+				// if err != nil { ... }
+				// if !added { continue }
+
+				// 3. Chromedp fallback extraction
 				userAgent, _ := GetUserAgent("crawler-chrome")
 				title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
-
 				if title == "" || desc == "" {
 					printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
 					continue
 				}
 
-				// write to file
+				// 4. Write to file
 				line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, title, keywords, desc, rank)
 				mu.Lock()
-				file.WriteString(line)
+				_, _ = file.WriteString(line)
 				mu.Unlock()
 			}
 		}()
@@ -194,7 +193,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Feed domains into standardCh
 	go func() {
 		for _, dom := range domains {
-			// optionally, if maxPages is relevant, you can track how many have been processed
 			standardCh <- dom
 		}
 		// close the standardCh once all are queued
@@ -210,5 +208,12 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
+	// Optionally flush the visited store once more
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("visitedStore flush error: %v", err)
+		}
+	}
+
 	return nil
 }
diff --git a/go.mod b/go.mod
index c8200d3..f7d89ad 100644
--- a/go.mod
+++ b/go.mod
@@ -15,6 +15,8 @@ require (
 
 require (
 	github.com/blevesearch/bleve/v2 v2.4.4
+	github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb
+	github.com/chromedp/chromedp v0.11.2
 	github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
 	golang.org/x/net v0.33.0
 )
@@ -41,8 +43,6 @@ require (
 	github.com/blevesearch/zapx/v14 v14.3.10 // indirect
 	github.com/blevesearch/zapx/v15 v15.3.17 // indirect
 	github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
-	github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect
-	github.com/chromedp/chromedp v0.11.2 // indirect
 	github.com/chromedp/sysutil v1.1.0 // indirect
 	github.com/go-ole/go-ole v1.3.0 // indirect
 	github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
diff --git a/go.sum b/go.sum
index 148146f..66cede6 100644
--- a/go.sum
+++ b/go.sum
@@ -84,6 +84,8 @@ github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8Hm
 github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
 github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
@@ -96,6 +98,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
 github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
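
For reviewers: below the diff is a minimal sketch, not part of the patch, of how the new `VisitedStore` API from `crawler-visited.go` could be exercised in isolation. The file path, batch size, and URLs here are arbitrary placeholders, and the function name `exampleVisitedStore` is hypothetical; it would need to be compiled alongside `crawler-visited.go` in package `main`.

```go
package main

import (
	"fmt"
	"log"
)

// exampleVisitedStore demonstrates the VisitedStore API added in this PR.
func exampleVisitedStore() {
	// Placeholder path and batch size, not the crawler's real configuration.
	store, err := NewVisitedStore("/tmp/visited-urls.txt", 2)
	if err != nil {
		log.Fatalf("NewVisitedStore: %v", err)
	}

	// The duplicate URL at the end should report added=false.
	for _, u := range []string{"https://example.com", "https://example.org", "https://example.com"} {
		added, err := store.MarkVisited(u)
		if err != nil {
			log.Fatalf("MarkVisited: %v", err)
		}
		fmt.Printf("%s added=%v\n", u, added)
	}

	// Flush whatever is still buffered (batches smaller than batchSize).
	if err := store.Flush(); err != nil {
		log.Fatalf("Flush: %v", err)
	}
}
```

Because `NewVisitedStore` reloads the file on startup, URLs flushed in one run are treated as already visited in the next, which is how the crawler avoids re-crawling domains across restarts.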