changed indexing buffer to save to ram not to file
parent 918e1823df
commit 61266c461a
4 changed files with 155 additions and 62 deletions
crawler.go (74 changed lines)
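The crawler.go hunks below stop writing pipe-delimited lines to data_to_index.txt and instead hand each crawled page straight to indexDocImmediately(...), which, per the new comment in the diff, feeds the document into Bleve. That helper is defined in one of the other changed files and is not shown here; the following is only a minimal sketch of what direct, in-RAM Bleve indexing of the same five fields could look like. The package-level index idx, the mem-only setup, and the field names are assumptions for illustration, not the project's actual code.

package main

import (
	"github.com/blevesearch/bleve/v2"
)

// idx is assumed to be opened once at startup. A mem-only index keeps all
// indexed documents in RAM, which matches the intent of this commit.
var idx bleve.Index

func openIndex() error {
	var err error
	idx, err = bleve.NewMemOnly(bleve.NewIndexMapping())
	return err
}

// indexDocImmediately is a hypothetical stand-in for the helper this diff calls.
func indexDocImmediately(link, title, tags, desc, rank string) error {
	doc := map[string]interface{}{
		"title":       title,
		"keywords":    tags,
		"description": desc,
		"rank":        rank,
	}
	// Bleve's Index(id, data) analyzes and stores the document under the URL as its ID.
	return idx.Index(link, doc)
}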
@@ -2,7 +2,6 @@ package main
 import (
 	"bufio"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
@@ -45,14 +44,20 @@ func runCrawlerAndIndexer() {
 	}
 
 	// 2. Crawl each domain and write results to data_to_index.txt
-	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
+	if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil {
 		printErr("Error crawling domains: %v", err)
 		return
 	}
 
-	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
-	startPeriodicIndexing(outFile, config.IndexRefreshInterval)
+	// After finishing crawling, flush any pending visited-urls
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("Failed to flush visitedStore: %v", err)
+		}
+	}
+
+	// 3. Re-index data_to_index.txt based on IndexRefreshInterval
+	//startPeriodicIndexing(outFile, config.IndexRefreshInterval)
 
 	printDebug("Crawl + index refresh completed.")
 }
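With documents now indexed as soon as they are crawled, the periodic re-index of data_to_index.txt is disabled above rather than removed. startPeriodicIndexing itself is not part of this diff; a loop of roughly this shape is what the commented-out call would have kept running. This is a sketch only: the reindex callback, the duration parameter, and the use of the standard library's time and log packages are assumptions.

import (
	"log"
	"time"
)

// startPeriodicIndexingSketch shows the general shape of a ticker-driven
// re-index loop; it is not the project's implementation.
func startPeriodicIndexingSketch(outFile string, interval time.Duration, reindex func(string) error) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C {
			if err := reindex(outFile); err != nil {
				log.Printf("re-index of %s failed: %v", outFile, err)
			}
		}
	}()
}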
@@ -89,16 +94,10 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 // crawlDomainsToFile does an async pipeline:
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
-func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-
-	var mu sync.Mutex
-
-	// Open file for writing (truncate if existing)
-	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
-	if err != nil {
-		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
-	}
-	defer file.Close()
+//
+// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...).
+func crawlDomainsToFile(domains [][2]string, maxPages int) error {
+	var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.)
 
 	// Prepare channels
 	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
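The comment block kept at the top of crawlDomainsToFile is the clearest description of the crawl pipeline: a buffered standardCh feeds cheap extraction workers, and anything they cannot handle is pushed to chromeCh for the chromedp workers. The later hunks only show fragments of that flow, so here is a condensed skeleton of the same wiring and shutdown order (queue domains, close standardCh, wait, close chromeCh, wait). The worker count and the two try* helpers are placeholders, not the project's code; only sync from the standard library is used.

import "sync"

// pipelineSketch is a skeleton of the two-stage worker pipeline described by the comments above.
func pipelineSketch(domains [][2]string, workers int) {
	standardCh := make(chan [2]string, 1000)
	chromeCh := make(chan [2]string, 1000)

	var wgStandard, wgChrome sync.WaitGroup

	// Stage 1: cheap standard extraction; failures fall through to chromeCh.
	for i := 0; i < workers; i++ {
		wgStandard.Add(1)
		go func() {
			defer wgStandard.Done()
			for dom := range standardCh {
				if !tryStandardExtract(dom) {
					chromeCh <- dom
				}
			}
		}()
	}

	// Stage 2: heavier chromedp extraction for whatever stage 1 rejected.
	for i := 0; i < workers; i++ {
		wgChrome.Add(1)
		go func() {
			defer wgChrome.Done()
			for dom := range chromeCh {
				tryChromeExtract(dom)
			}
		}()
	}

	// Feed the pipeline, then close each stage in order so workers drain and exit.
	for _, dom := range domains {
		standardCh <- dom
	}
	close(standardCh)
	wgStandard.Wait()
	close(chromeCh)
	wgChrome.Wait()
}

// Placeholder extractors standing in for the real fetchPageMetadata* helpers.
func tryStandardExtract(dom [2]string) bool { return false }
func tryChromeExtract(dom [2]string)        {}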
@@ -110,6 +109,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgStandard.Add(1)
 		go func() {
 			defer wgStandard.Done()
+
 			for dom := range standardCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -118,14 +118,17 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			}
 			fullURL := "https://" + domainName
 
-			// 1. Check if we've already visited this URL
+			// Mark visited so we don't re-crawl duplicates
+			mu.Lock()
 			added, err := visitedStore.MarkVisited(fullURL)
+			mu.Unlock()
+
 			if err != nil {
 				printErr("MarkVisited error for %s: %v", fullURL, err)
 				continue
 			}
 			if !added {
-				// Already visited
+				// Already visited, skip
 				continue
 			}
 
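The hunk above wraps visitedStore.MarkVisited in the worker-shared mutex, which is what the new comment on var mu refers to. The store itself (MarkVisited, Flush) is defined outside this file; below is a rough sketch of a store with that interface, assuming it keeps visited URLs in memory, only touches disk on Flush, and leaves all locking to its callers as the diff does. Type and field names are hypothetical.

import (
	"fmt"
	"os"
)

// visitedSketch is a hypothetical stand-in for the project's visitedStore.
type visitedSketch struct {
	seen    map[string]bool
	pending []string
	path    string
}

// MarkVisited reports true when the URL is new; callers hold a mutex, so no locking here.
func (s *visitedSketch) MarkVisited(u string) (bool, error) {
	if s.seen[u] {
		return false, nil
	}
	s.seen[u] = true
	s.pending = append(s.pending, u)
	return true, nil
}

// Flush appends the buffered URLs to the backing file and clears the buffer.
func (s *visitedSketch) Flush() error {
	f, err := os.OpenFile(s.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	for _, u := range s.pending {
		if _, err := fmt.Fprintln(f, u); err != nil {
			return err
		}
	}
	s.pending = s.pending[:0]
	return nil
}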
@@ -139,13 +142,11 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 				continue
 			}
 
-			// 3. Write to file
-			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-				fullURL, title, keywords, desc, rank)
-
-			mu.Lock()
-			_, _ = file.WriteString(line)
-			mu.Unlock()
+			// 3. Directly index
+			err = indexDocImmediately(fullURL, title, keywords, desc, rank)
+			if err != nil {
+				printErr("Index error for %s: %v", fullURL, err)
+			}
 		}
 	}()
 }
@@ -156,6 +157,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgChrome.Add(1)
 		go func() {
 			defer wgChrome.Done()
+
 			for dom := range chromeCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -164,28 +166,19 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			}
 			fullURL := "https://" + domainName
 
-			// We already marked it visited in the standard pass
-			// but you may re-check if you prefer:
-			//
-			// added, err := visitedStore.MarkVisited(fullURL)
-			// if err != nil { ... }
-			// if !added { continue }
-
 			// 3. Chromedp fallback extraction
 			userAgent, _ := GetUserAgent("crawler-chrome")
 			title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
 			if title == "" || desc == "" {
-				printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
+				printWarn("Skipping %s: unable to get title/desc data", fullURL)
 				continue
 			}
 
-			// 4. Write to file
-			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-				fullURL, title, keywords, desc, rank)
-
-			mu.Lock()
-			_, _ = file.WriteString(line)
-			mu.Unlock()
+			// 4. Directly index the doc
+			err := indexDocImmediately(fullURL, title, keywords, desc, rank)
+			if err != nil {
+				printErr("Index error for %s: %v", fullURL, err)
+			}
 		}
 	}()
 }
@@ -195,7 +188,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for _, dom := range domains {
 			standardCh <- dom
 		}
-		// close the standardCh once all are queued
 		close(standardCh)
 	}()
 
@@ -208,7 +200,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
-	// Optionally flush the visited store once more
+	// Flush visitedStore
 	if visitedStore != nil {
 		if err := visitedStore.Flush(); err != nil {
 			printErr("visitedStore flush error: %v", err)