improved crawler data extraction (added chromedp)

partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions

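The chromedp-based extraction this commit adds lives in fetchPageMetadataChrome, which the new pipeline below calls but which is defined in one of the other changed files. A minimal sketch of what such a fetch could look like, assuming chromedp's exec allocator, a 30-second timeout, and meta-tag lookups via JavaScript (only the function name and the user-agent parameter come from the diff; everything else here is an assumption):

package main

import (
	"context"
	"time"

	"github.com/chromedp/chromedp"
)

// Hypothetical sketch: fetch title, description and keywords with headless
// Chrome. Allocator options, timeout and JS expressions are all assumed.
func fetchPageMetadataChrome(pageURL, userAgent string) (title, desc, keywords string) {
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.UserAgent(userAgent),
	)
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancelAlloc()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	// Bound the whole navigation so a stuck page cannot block a chrome worker forever.
	ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
	defer cancelTimeout()

	err := chromedp.Run(ctx,
		chromedp.Navigate(pageURL),
		chromedp.Title(&title),
		chromedp.Evaluate(`document.querySelector('meta[name="description"]')?.content || ''`, &desc),
		chromedp.Evaluate(`document.querySelector('meta[name="keywords"]')?.content || ''`, &keywords),
	)
	if err != nil {
		return "", "", ""
	}
	return title, desc, keywords
}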

@@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
// 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
printErr("Error crawling domains: %v", err)
return
}
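The call site above no longer passes config.ConcurrentCrawlers; the worker counts are now read from config inside crawlDomainsToFile (config.ConcurrentStandardCrawlers and config.ConcurrentChromeCrawlers in the last hunk). A sketch of the config shape this implies, with field types and grouping assumed (only the field names appear in the diff):

// Assumed config shape; only the field names are taken from this diff.
type Config struct {
	DriveCache struct {
		Path string // cache directory that data_to_index.txt is written into
	}
	MaxPagesPerDomain          int // still passed to crawlDomainsToFile, not enforced in this hunk
	ConcurrentStandardCrawlers int // goroutines doing plain-HTTP extraction
	ConcurrentChromeCrawlers   int // goroutines doing chromedp fallback extraction
}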
@@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
return result, scanner.Err()
}
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
// crawlDomainsToFile runs an async pipeline:
// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
existingEntries := make(map[string]bool)
var mu sync.Mutex // Mutex to protect access to the map
var mu sync.Mutex // For existingEntries + file writes
// read existing entries from outFile if it exists
if _, err := os.Stat(outFile); err == nil {
file, err := os.Open(outFile)
if err != nil {
return fmt.Errorf("unable to open %s: %v", outFile, err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
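The first stage of the pipeline described in the new comment calls fetchPageMetadataStandard (visible in the next hunk), which is likewise defined elsewhere in this commit. A rough sketch of a plain-HTTP implementation using net/http and golang.org/x/net/html; the timeout and all parsing details are assumptions, only the function name and the user-agent parameter come from the diff:

package main

import (
	"net/http"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// Hypothetical sketch of the plain-HTTP extraction stage.
func fetchPageMetadataStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		return "", "", ""
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return "", "", ""
	}
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return "", "", ""
	}

	// Single DOM walk picking up <title> and the description/keywords meta tags.
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "title":
				if n.FirstChild != nil && title == "" {
					title = strings.TrimSpace(n.FirstChild.Data)
				}
			case "meta":
				var name, content string
				for _, a := range n.Attr {
					switch strings.ToLower(a.Key) {
					case "name":
						name = strings.ToLower(a.Val)
					case "content":
						content = a.Val
					}
				}
				switch name {
				case "description":
					desc = content
				case "keywords":
					keywords = content
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return title, desc, keywords
}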
@@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
}
defer file.Close()
semaphore := make(chan struct{}, concurrentCrawlers)
var wg sync.WaitGroup
// Prepare channels
standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
chromeCh := make(chan [2]string, 1000)
for _, d := range domains {
wg.Add(1)
semaphore <- struct{}{}
go func(domain [2]string) {
defer wg.Done()
defer func() { <-semaphore }()
// 1) Spawn standard workers
var wgStandard sync.WaitGroup
for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
wgStandard.Add(1)
go func() {
defer wgStandard.Done()
for dom := range standardCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
rank := domain[0]
domainName := domain[1]
fullURL := "https://" + domainName
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
// Mark domain existing so we don't re-crawl duplicates
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a standard user agent
userAgent, _ := GetUserAgent("crawler-std")
title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
if title == "" || desc == "" {
// push to chromeCh
chromeCh <- dom
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
return
}
existingEntries[fullURL] = true
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
// Skip saving if title or description is missing
if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL)
return
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
title,
keywords,
desc,
rank,
)
file.WriteString(line)
}(d)
}()
}
wg.Wait()
// 2) Spawn chrome workers
var wgChrome sync.WaitGroup
for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
wgChrome.Add(1)
go func() {
defer wgChrome.Done()
for dom := range chromeCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
// Mark domain existing if not already
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a chrome user agent
userAgent, _ := GetUserAgent("crawler-chrome")
title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
if title == "" || desc == "" {
printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
}
}()
}
// Feed domains into standardCh
go func() {
for _, dom := range domains {
// maxPages is not enforced here yet; a per-domain page counter could be tracked at this point
standardCh <- dom
}
// close the standardCh once all are queued
close(standardCh)
}()
// Wait for standard workers to finish, then close chromeCh
go func() {
wgStandard.Wait()
close(chromeCh)
}()
// Wait for chrome workers to finish
wgChrome.Wait()
return nil
}
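A note on the shutdown ordering above: the standard workers are the only senders on chromeCh, so it is only safe to close(chromeCh) after wgStandard.Wait() returns, which is why that pair runs in its own goroutine. Waiting on wgChrome alone is then enough for the function to return, because the chrome workers cannot exit their range loop until chromeCh is closed, and chromeCh is not closed until every standard worker has finished. As the comment in the feeder goroutine notes, maxPages is accepted but not yet enforced anywhere in this pipeline.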