improved crawler data extraction (added chromedp)
commit c71808aa1e
parent 3494457336
6 changed files with 305 additions and 166 deletions

crawler.go | 146
@@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
     // 2. Crawl each domain and write results to data_to_index.txt
     outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
+    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
         printErr("Error crawling domains: %v", err)
         return
     }

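The call site above drops the old config.ConcurrentCrawlers argument; the rewritten function in the hunks below reads config.ConcurrentStandardCrawlers and config.ConcurrentChromeCrawlers instead. Only the field names appear in this diff (the config type itself is defined elsewhere in the repository), so the following is just a hypothetical sketch of the assumed shape:

// Hypothetical sketch of the config fields this commit relies on; only the
// field names appear in the diff, the struct layout and types are assumed.
type Config struct {
    MaxPagesPerDomain          int // still passed through to crawlDomainsToFile
    ConcurrentStandardCrawlers int // size of the plain-HTTP worker pool
    ConcurrentChromeCrawlers   int // size of the chromedp (headless Chrome) worker pool
}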
@@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
     return result, scanner.Err()
 }

-// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
-func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
+// crawlDomainsToFile does an async pipeline:
+// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
+// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
+func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
     existingEntries := make(map[string]bool)
-    var mu sync.Mutex // Mutex to protect access to the map
+    var mu sync.Mutex // For existingEntries + file writes
     // read existing entries from outFile if it exists
     if _, err := os.Stat(outFile); err == nil {
         file, err := os.Open(outFile)
         if err != nil {
             return fmt.Errorf("unable to open %s: %v", outFile, err)
         }
         defer file.Close()

         scanner := bufio.NewScanner(file)
         for scanner.Scan() {
             line := scanner.Text()
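The new doc comment describes a two-stage fallback pipeline, which the next hunk implements in full. Stripped of the dedupe map and the file writing, the channel and WaitGroup wiring looks roughly like this; a minimal sketch under assumed stand-in fetch functions, not the commit's code:

package main

import "sync"

// twoStagePipeline sketches the fallback flow: standard workers try a cheap
// fetch first and hand failures to a smaller pool of headless-Chrome workers.
func twoStagePipeline(domains [][2]string, nStd, nChrome int,
    fetchStd, fetchChrome func([2]string) bool) {

    standardCh := make(chan [2]string, 1000)
    chromeCh := make(chan [2]string, 1000)

    var wgStandard sync.WaitGroup
    for i := 0; i < nStd; i++ {
        wgStandard.Add(1)
        go func() {
            defer wgStandard.Done()
            for dom := range standardCh {
                if !fetchStd(dom) {
                    chromeCh <- dom // cheap fetch failed: fall back to Chrome
                }
            }
        }()
    }

    var wgChrome sync.WaitGroup
    for i := 0; i < nChrome; i++ {
        wgChrome.Add(1)
        go func() {
            defer wgChrome.Done()
            for dom := range chromeCh {
                fetchChrome(dom) // last resort; failures are simply skipped
            }
        }()
    }

    for _, d := range domains {
        standardCh <- d
    }
    close(standardCh)

    wgStandard.Wait() // no standard worker can queue fallbacks anymore
    close(chromeCh)   // safe to close only after the wait above
    wgChrome.Wait()
}

func main() {
    domains := [][2]string{{"1", "example.com"}, {"2", "example.org"}}
    twoStagePipeline(domains, 4, 1,
        func(d [2]string) bool { return len(d[1])%2 == 0 }, // stub "standard" fetch
        func(d [2]string) bool { return true },             // stub "chrome" fetch
    )
}

The ordering is the important part: chromeCh may only be closed after every standard worker has exited, otherwise a late fallback send would panic on a closed channel.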
@@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
     }
     defer file.Close()

-    semaphore := make(chan struct{}, concurrentCrawlers)
-    var wg sync.WaitGroup
-    for _, d := range domains {
-        wg.Add(1)
-        semaphore <- struct{}{}
-        go func(domain [2]string) {
-            defer wg.Done()
-            defer func() { <-semaphore }()
-
-            rank := domain[0]
-            domainName := domain[1]
-            fullURL := "https://" + domainName
-
-            mu.Lock()
-            if domainName == "" || existingEntries[fullURL] {
-                mu.Unlock()
-                return
-            }
-            existingEntries[fullURL] = true
-            mu.Unlock()
-
-            title, desc, keywords := fetchPageMetadata(fullURL)
-
-            // Skip saving if title or description is missing
-            if title == "" || desc == "" {
-                printDebug("Skipping %s: missing title or description", fullURL)
-                return
-            }
-
-            line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-                fullURL,
-                title,
-                keywords,
-                desc,
-                rank,
-            )
-            file.WriteString(line)
-        }(d)
-    }
-
-    wg.Wait()
+    // Prepare channels
+    standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
+    chromeCh := make(chan [2]string, 1000)
+
+    // 1) Spawn standard workers
+    var wgStandard sync.WaitGroup
+    for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
+        wgStandard.Add(1)
+        go func() {
+            defer wgStandard.Done()
+            for dom := range standardCh {
+                rank := dom[0]
+                domainName := dom[1]
+                fullURL := "https://" + domainName
+
+                // Mark domain existing so we don't re-crawl duplicates
+                mu.Lock()
+                if domainName == "" || existingEntries[fullURL] {
+                    mu.Unlock()
+                    continue
+                }
+                existingEntries[fullURL] = true
+                mu.Unlock()
+
+                // get a standard user agent
+                userAgent, _ := GetUserAgent("crawler-std")
+                title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
+
+                if title == "" || desc == "" {
+                    // push to chromeCh
+                    chromeCh <- dom
+                    continue
+                }
+
+                // write to file
+                line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                    fullURL, title, keywords, desc, rank)
+
+                mu.Lock()
+                file.WriteString(line)
+                mu.Unlock()
+            }
+        }()
+    }
+
+    // 2) Spawn chrome workers
+    var wgChrome sync.WaitGroup
+    for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
+        wgChrome.Add(1)
+        go func() {
+            defer wgChrome.Done()
+            for dom := range chromeCh {
+                rank := dom[0]
+                domainName := dom[1]
+                fullURL := "https://" + domainName
+
+                // Mark domain existing if not already
+                mu.Lock()
+                if domainName == "" || existingEntries[fullURL] {
+                    mu.Unlock()
+                    continue
+                }
+                existingEntries[fullURL] = true
+                mu.Unlock()
+
+                // get a chrome user agent
+                userAgent, _ := GetUserAgent("crawler-chrome")
+                title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
+
+                if title == "" || desc == "" {
+                    printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
+                    continue
+                }
+
+                // write to file
+                line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                    fullURL, title, keywords, desc, rank)
+
+                mu.Lock()
+                file.WriteString(line)
+                mu.Unlock()
+            }
+        }()
+    }
+
+    // Feed domains into standardCh
+    go func() {
+        for _, dom := range domains {
+            // optionally, if maxPages is relevant, you can track how many have been processed
+            standardCh <- dom
+        }
+        // close the standardCh once all are queued
+        close(standardCh)
+    }()
+
+    // Wait for standard workers to finish, then close chromeCh
+    go func() {
+        wgStandard.Wait()
+        close(chromeCh)
+    }()
+
+    // Wait for chrome workers to finish
+    wgChrome.Wait()

     return nil
 }
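fetchPageMetadataStandard and fetchPageMetadataChrome are defined in the other changed files of this commit and are not shown in this hunk. For orientation only, a chromedp-based fetcher matching the call above could look roughly like the sketch below; the signature mirrors the call in the diff, but the body, the 30-second timeout, and the meta-tag selectors are assumptions, not the commit's implementation. It needs a local Chrome/Chromium binary to run.

package main

import (
    "context"
    "fmt"
    "time"

    "github.com/chromedp/chromedp"
)

// fetchPageMetadataChrome: hypothetical sketch of a chromedp-based fetcher with
// the same signature as the call in the diff. The real implementation lives in
// another file of this commit and may differ.
func fetchPageMetadataChrome(pageURL, userAgent string) (title, desc, keywords string) {
    // Launch headless Chrome with the user agent chosen by the caller.
    opts := append(chromedp.DefaultExecAllocatorOptions[:], chromedp.UserAgent(userAgent))
    allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
    defer cancelAlloc()

    ctx, cancel := chromedp.NewContext(allocCtx)
    defer cancel()

    // Bound navigation plus extraction; slow pages count as failures.
    ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
    defer cancelTimeout()

    err := chromedp.Run(ctx,
        chromedp.Navigate(pageURL),
        chromedp.Title(&title),
        chromedp.Evaluate(`document.querySelector('meta[name="description"]')?.content || ""`, &desc),
        chromedp.Evaluate(`document.querySelector('meta[name="keywords"]')?.content || ""`, &keywords),
    )
    if err != nil {
        // Empty strings make the chrome worker skip the domain, matching the
        // `if title == "" || desc == ""` check in the diff above.
        return "", "", ""
    }
    return title, desc, keywords
}

func main() {
    t, d, k := fetchPageMetadataChrome("https://example.com", "Mozilla/5.0")
    fmt.Println(t, d, k)
}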