added visited sites functionality to crawler
parent c71808aa1e
commit 918e1823df
5 changed files with 178 additions and 63 deletions
crawler.go (93 changes)
@@ -10,13 +10,24 @@ import (
 	"time"
 )
 
+// Create a global or config-level visited store
+var visitedStore *VisitedStore
+
 // webCrawlerInit is called during init on program start
 func webCrawlerInit() {
+	// Initialize the store with, say, batchSize=50
+	store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50)
+	if err != nil {
+		printErr("Failed to initialize visited store: %v", err)
+	}
+	visitedStore = store
+
 	// Start the periodic crawler
 	go func() {
 		// First run immediately
 		runCrawlerAndIndexer()
 
-		// Then run periodically based on CrawlingInterval
+		// Then run periodically
 		ticker := time.NewTicker(config.CrawlingInterval)
 		for range ticker.C {
 			runCrawlerAndIndexer()
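NewVisitedStore, MarkVisited, and Flush come from another of the five changed files in this commit, which is not shown on this page. The sketch below is only an illustration of what such a store could look like, assuming an in-memory set backed by an append-only text file with writes batched per batchSize; every name and detail that does not appear in the diff is an assumption, not the commit's actual implementation. A short usage example follows the last hunk.

// Hypothetical sketch of the VisitedStore API used in the diff; the real
// implementation lives in another changed file and may differ.
package main

import (
	"bufio"
	"os"
	"sync"
)

type VisitedStore struct {
	mu        sync.Mutex
	path      string
	batchSize int             // flush to disk after this many new URLs (assumed meaning of the second argument)
	seen      map[string]bool // URLs already visited
	pending   []string        // visited URLs not yet written to disk
}

// NewVisitedStore loads previously recorded URLs from filePath so duplicates
// are detected across runs; a missing file just means a first run.
func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
	s := &VisitedStore{path: filePath, batchSize: batchSize, seen: make(map[string]bool)}
	f, err := os.Open(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return s, nil
		}
		return nil, err
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		s.seen[sc.Text()] = true
	}
	return s, sc.Err()
}

// MarkVisited reports whether url was newly added (true) or already known
// (false), and buffers new URLs until batchSize of them accumulate.
func (s *VisitedStore) MarkVisited(url string) (bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.seen[url] {
		return false, nil
	}
	s.seen[url] = true
	s.pending = append(s.pending, url)
	if len(s.pending) >= s.batchSize {
		return true, s.flushLocked()
	}
	return true, nil
}

// Flush writes any still-buffered URLs to disk.
func (s *VisitedStore) Flush() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.flushLocked()
}

// flushLocked appends pending URLs to the file; callers must hold s.mu.
func (s *VisitedStore) flushLocked() error {
	if len(s.pending) == 0 {
		return nil
	}
	f, err := os.OpenFile(s.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	w := bufio.NewWriter(f)
	for _, u := range s.pending {
		if _, err := w.WriteString(u + "\n"); err != nil {
			return err
		}
	}
	if err := w.Flush(); err != nil {
		return err
	}
	s.pending = s.pending[:0]
	return nil
}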
@@ -79,25 +90,8 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
 func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-	existingEntries := make(map[string]bool)
-	var mu sync.Mutex // For existingEntries + file writes
-
-	// read existing entries from outFile if it exists
-	if _, err := os.Stat(outFile); err == nil {
-		file, err := os.Open(outFile)
-		if err != nil {
-			return fmt.Errorf("unable to open %s: %v", outFile, err)
-		}
-		defer file.Close()
-		scanner := bufio.NewScanner(file)
-		for scanner.Scan() {
-			line := scanner.Text()
-			parts := strings.SplitN(line, "|", 5)
-			if len(parts) >= 1 {
-				existingEntries[parts[0]] = true
-			}
-		}
-	}
+	var mu sync.Mutex
 
 	// Open file for writing (truncate if existing)
 	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
@@ -119,33 +113,38 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for dom := range standardCh {
 			rank := dom[0]
 			domainName := dom[1]
-			fullURL := "https://" + domainName
-
-			// Mark domain existing so we don't re-crawl duplicates
-			mu.Lock()
-			if domainName == "" || existingEntries[fullURL] {
-				mu.Unlock()
+			if domainName == "" {
 				continue
 			}
-			existingEntries[fullURL] = true
-			mu.Unlock()
+			fullURL := "https://" + domainName
 
-			// get a standard user agent
+			// 1. Check if we've already visited this URL
+			added, err := visitedStore.MarkVisited(fullURL)
+			if err != nil {
+				printErr("MarkVisited error for %s: %v", fullURL, err)
+				continue
+			}
+			if !added {
+				// Already visited
+				continue
+			}
+
+			// 2. Standard extraction
 			userAgent, _ := GetUserAgent("crawler-std")
 			title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
 
 			// If missing, push to Chrome queue
 			if title == "" || desc == "" {
 				// push to chromeCh
 				chromeCh <- dom
 				continue
 			}
 
-			// write to file
+			// 3. Write to file
 			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
 				fullURL, title, keywords, desc, rank)
 
 			mu.Lock()
-			file.WriteString(line)
+			_, _ = file.WriteString(line)
 			mu.Unlock()
 		}
 	}()
@@ -160,32 +159,32 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for dom := range chromeCh {
 			rank := dom[0]
 			domainName := dom[1]
-			fullURL := "https://" + domainName
-
-			// Mark domain existing if not already
-			mu.Lock()
-			if domainName == "" || existingEntries[fullURL] {
-				mu.Unlock()
+			if domainName == "" {
 				continue
 			}
-			existingEntries[fullURL] = true
-			mu.Unlock()
+			fullURL := "https://" + domainName
 
-			// get a chrome user agent
+			// We already marked it visited in the standard pass
+			// but you may re-check if you prefer:
+			//
+			// added, err := visitedStore.MarkVisited(fullURL)
+			// if err != nil { ... }
+			// if !added { continue }
+
+			// 3. Chromedp fallback extraction
 			userAgent, _ := GetUserAgent("crawler-chrome")
 			title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
 
 			if title == "" || desc == "" {
 				printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
 				continue
 			}
 
-			// write to file
+			// 4. Write to file
 			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
 				fullURL, title, keywords, desc, rank)
 
 			mu.Lock()
-			file.WriteString(line)
+			_, _ = file.WriteString(line)
 			mu.Unlock()
 		}
 	}()
@@ -194,7 +193,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Feed domains into standardCh
 	go func() {
 		for _, dom := range domains {
-			// optionally, if maxPages is relevant, you can track how many have been processed
 			standardCh <- dom
 		}
 		// close the standardCh once all are queued
@@ -210,5 +208,12 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
+	// Optionally flush the visited store once more
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("visitedStore flush error: %v", err)
+		}
+	}
+
 	return nil
 }
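For completeness, a quick usage example of the hypothetical VisitedStore sketched after the first hunk, showing the test-and-set behaviour the workers above rely on: the first MarkVisited call for a URL reports added=true, a repeat reports false, and Flush persists whatever is still buffered. The file name here is illustrative only.

package main

import "fmt"

func main() {
	// "visited-urls.txt" is just an example path for the sketch above.
	store, err := NewVisitedStore("visited-urls.txt", 50)
	if err != nil {
		fmt.Println("init error:", err)
		return
	}
	for _, u := range []string{"https://example.com", "https://example.com"} {
		added, err := store.MarkVisited(u)
		if err != nil {
			fmt.Println("MarkVisited error:", err)
			continue
		}
		fmt.Printf("%s added=%v\n", u, added) // true on the first call, false on the repeat
	}
	if err := store.Flush(); err != nil {
		fmt.Println("flush error:", err)
	}
}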