improved crawler data extraction (added chromedp)

partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions

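The chromedp-based extraction this commit adds lives in fetchPageMetadataChrome, which the new pipeline below calls but which is defined in one of the other changed files. A minimal sketch of what such a fetch could look like, assuming chromedp's exec allocator, a 30-second timeout, and meta-tag lookups via JavaScript (only the function name and the user-agent parameter come from the diff; everything else here is an assumption):

package main

import (
	"context"
	"time"

	"github.com/chromedp/chromedp"
)

// Hypothetical sketch: fetch title, description and keywords with headless
// Chrome. Allocator options, timeout and JS expressions are all assumed.
func fetchPageMetadataChrome(pageURL, userAgent string) (title, desc, keywords string) {
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.UserAgent(userAgent),
	)
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancelAlloc()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	// Bound the whole navigation so a stuck page cannot block a chrome worker forever.
	ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
	defer cancelTimeout()

	err := chromedp.Run(ctx,
		chromedp.Navigate(pageURL),
		chromedp.Title(&title),
		chromedp.Evaluate(`document.querySelector('meta[name="description"]')?.content || ''`, &desc),
		chromedp.Evaluate(`document.querySelector('meta[name="keywords"]')?.content || ''`, &keywords),
	)
	if err != nil {
		return "", "", ""
	}
	return title, desc, keywords
}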

@@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
// 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
printErr("Error crawling domains: %v", err)
return
}
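The call site above no longer passes config.ConcurrentCrawlers; the worker counts are now read from config inside crawlDomainsToFile (config.ConcurrentStandardCrawlers and config.ConcurrentChromeCrawlers in the last hunk). A sketch of the config shape this implies, with field types and grouping assumed (only the field names appear in the diff):

// Assumed config shape; only the field names are taken from this diff.
type Config struct {
	DriveCache struct {
		Path string // cache directory that data_to_index.txt is written into
	}
	MaxPagesPerDomain          int // still passed to crawlDomainsToFile, not enforced in this hunk
	ConcurrentStandardCrawlers int // goroutines doing plain-HTTP extraction
	ConcurrentChromeCrawlers   int // goroutines doing chromedp fallback extraction
}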
@@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
return result, scanner.Err()
}
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
// crawlDomainsToFile runs an async pipeline:
// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
existingEntries := make(map[string]bool)
var mu sync.Mutex // Mutex to protect access to the map
var mu sync.Mutex // For existingEntries + file writes
// read existing entries from outFile if it exists
if _, err := os.Stat(outFile); err == nil {
file, err := os.Open(outFile)
if err != nil {
return fmt.Errorf("unable to open %s: %v", outFile, err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
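The first stage of the pipeline described in the new comment calls fetchPageMetadataStandard (visible in the next hunk), which is likewise defined elsewhere in this commit. A rough sketch of a plain-HTTP implementation using net/http and golang.org/x/net/html; the timeout and all parsing details are assumptions, only the function name and the user-agent parameter come from the diff:

package main

import (
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// Hypothetical sketch of the plain-HTTP extraction stage.
func fetchPageMetadataStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		return "", "", ""
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return "", "", ""
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return "", "", ""
	}

	// Single DOM walk picking up <title> and the description/keywords meta tags.
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "title":
				if n.FirstChild != nil && title == "" {
					title = strings.TrimSpace(n.FirstChild.Data)
				}
			case "meta":
				var name, content string
				for _, a := range n.Attr {
					switch strings.ToLower(a.Key) {
					case "name":
						name = strings.ToLower(a.Val)
					case "content":
						content = a.Val
					}
				}
				switch name {
				case "description":
					desc = content
				case "keywords":
					keywords = content
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return title, desc, keywords
}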
@@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
}
defer file.Close()
semaphore := make(chan struct{}, concurrentCrawlers)
var wg sync.WaitGroup
// Prepare channels
standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
chromeCh := make(chan [2]string, 1000)
for _, d := range domains {
wg.Add(1)
semaphore <- struct{}{}
go func(domain [2]string) {
defer wg.Done()
defer func() { <-semaphore }()
// 1) Spawn standard workers
var wgStandard sync.WaitGroup
for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
wgStandard.Add(1)
go func() {
defer wgStandard.Done()
for dom := range standardCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
rank := domain[0]
domainName := domain[1]
fullURL := "https://" + domainName
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
// Mark domain existing so we don't re-crawl duplicates
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a standard user agent
userAgent, _ := GetUserAgent("crawler-std")
title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
if title == "" || desc == "" {
// push to chromeCh
chromeCh <- dom
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
return
}
existingEntries[fullURL] = true
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
// Skip saving if title or description is missing
if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL)
return
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
title,
keywords,
desc,
rank,
)
file.WriteString(line)
}(d)
}()
}
wg.Wait()
// 2) Spawn chrome workers
var wgChrome sync.WaitGroup
for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
wgChrome.Add(1)
go func() {
defer wgChrome.Done()
for dom := range chromeCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
// Mark domain existing if not already
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a chrome user agent
userAgent, _ := GetUserAgent("crawler-chrome")
title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
if title == "" || desc == "" {
printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
}
}()
}
// Feed domains into standardCh
go func() {
for _, dom := range domains {
// maxPages is not enforced here yet; a per-domain page counter could be tracked at this point
standardCh <- dom
}
// close the standardCh once all are queued
close(standardCh)
}()
// Wait for standard workers to finish, then close chromeCh
go func() {
wgStandard.Wait()
close(chromeCh)
}()
// Wait for chrome workers to finish
wgChrome.Wait()
return nil
}
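A note on the shutdown ordering above: the standard workers are the only senders on chromeCh, so it is only safe to close(chromeCh) after wgStandard.Wait() returns, which is why that pair runs in its own goroutine. Waiting on wgChrome alone is then enough for the function to return, because the chrome workers cannot exit their range loop until chromeCh is closed, and chromeCh is not closed until every standard worker has finished. As the comment in the feeder goroutine notes, maxPages is accepted but not yet enforced anywhere in this pipeline.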