added visited sites functionality to crawler
parent c71808aa1e
commit 918e1823df
5 changed files with 178 additions and 63 deletions
crawler.go (93 changes)
@@ -10,13 +10,24 @@ import (
 	"time"
 )
 
+// Create a global or config-level visited store
+var visitedStore *VisitedStore
+
 // webCrawlerInit is called during init on program start
 func webCrawlerInit() {
+	// Initialize the store with, say, batchSize=50
+	store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50)
+	if err != nil {
+		printErr("Failed to initialize visited store: %v", err)
+	}
+	visitedStore = store
+
 	// Start the periodic crawler
 	go func() {
 		// First run immediately
 		runCrawlerAndIndexer()
 
-		// Then run periodically based on CrawlingInterval
+		// Then run periodically
 		ticker := time.NewTicker(config.CrawlingInterval)
 		for range ticker.C {
 			runCrawlerAndIndexer()
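NewVisitedStore, MarkVisited, and Flush come from another of the five changed files in this commit, which is not shown on this page. The sketch below is only an illustration of what such a store could look like, assuming an in-memory set backed by an append-only text file with writes batched per batchSize; every name and detail that does not appear in the diff is an assumption, not the commit's actual implementation. A short usage example follows the last hunk.

// Hypothetical sketch of the VisitedStore API used in the diff; the real
// implementation lives in another changed file and may differ.
package main

import (
	"bufio"
	"os"
	"sync"
)

type VisitedStore struct {
	mu        sync.Mutex
	path      string
	batchSize int             // flush to disk after this many new URLs (assumed meaning of the second argument)
	seen      map[string]bool // URLs already visited
	pending   []string        // visited URLs not yet written to disk
}

// NewVisitedStore loads previously recorded URLs from filePath so duplicates
// are detected across runs; a missing file just means a first run.
func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
	s := &VisitedStore{path: filePath, batchSize: batchSize, seen: make(map[string]bool)}
	f, err := os.Open(filePath)
	if err != nil {
		if os.IsNotExist(err) {
			return s, nil
		}
		return nil, err
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		s.seen[sc.Text()] = true
	}
	return s, sc.Err()
}

// MarkVisited reports whether url was newly added (true) or already known
// (false), and buffers new URLs until batchSize of them accumulate.
func (s *VisitedStore) MarkVisited(url string) (bool, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.seen[url] {
		return false, nil
	}
	s.seen[url] = true
	s.pending = append(s.pending, url)
	if len(s.pending) >= s.batchSize {
		return true, s.flushLocked()
	}
	return true, nil
}

// Flush writes any still-buffered URLs to disk.
func (s *VisitedStore) Flush() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.flushLocked()
}

// flushLocked appends pending URLs to the file; callers must hold s.mu.
func (s *VisitedStore) flushLocked() error {
	if len(s.pending) == 0 {
		return nil
	}
	f, err := os.OpenFile(s.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	w := bufio.NewWriter(f)
	for _, u := range s.pending {
		if _, err := w.WriteString(u + "\n"); err != nil {
			return err
		}
	}
	if err := w.Flush(); err != nil {
		return err
	}
	s.pending = s.pending[:0]
	return nil
}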
@@ -79,25 +90,8 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
 func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-	existingEntries := make(map[string]bool)
-	var mu sync.Mutex // For existingEntries + file writes
-
-	// read existing entries from outFile if it exists
-	if _, err := os.Stat(outFile); err == nil {
-		file, err := os.Open(outFile)
-		if err != nil {
-			return fmt.Errorf("unable to open %s: %v", outFile, err)
-		}
-		defer file.Close()
-		scanner := bufio.NewScanner(file)
-		for scanner.Scan() {
-			line := scanner.Text()
-			parts := strings.SplitN(line, "|", 5)
-			if len(parts) >= 1 {
-				existingEntries[parts[0]] = true
-			}
-		}
-	}
+	var mu sync.Mutex
 
 	// Open file for writing (truncate if existing)
 	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
@@ -119,33 +113,38 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for dom := range standardCh {
 			rank := dom[0]
 			domainName := dom[1]
-			fullURL := "https://" + domainName
-
-			// Mark domain existing so we don't re-crawl duplicates
-			mu.Lock()
-			if domainName == "" || existingEntries[fullURL] {
-				mu.Unlock()
+			if domainName == "" {
 				continue
 			}
-			existingEntries[fullURL] = true
-			mu.Unlock()
+			fullURL := "https://" + domainName
 
-			// get a standard user agent
+			// 1. Check if we've already visited this URL
+			added, err := visitedStore.MarkVisited(fullURL)
+			if err != nil {
+				printErr("MarkVisited error for %s: %v", fullURL, err)
+				continue
+			}
+			if !added {
+				// Already visited
+				continue
+			}
+
+			// 2. Standard extraction
 			userAgent, _ := GetUserAgent("crawler-std")
 			title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
 
 			// If missing, push to Chrome queue
 			if title == "" || desc == "" {
 				// push to chromeCh
 				chromeCh <- dom
 				continue
 			}
 
-			// write to file
+			// 3. Write to file
 			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
 				fullURL, title, keywords, desc, rank)
 
 			mu.Lock()
-			file.WriteString(line)
+			_, _ = file.WriteString(line)
 			mu.Unlock()
 		}
 	}()
@@ -160,32 +159,32 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for dom := range chromeCh {
 			rank := dom[0]
 			domainName := dom[1]
-			fullURL := "https://" + domainName
-
-			// Mark domain existing if not already
-			mu.Lock()
-			if domainName == "" || existingEntries[fullURL] {
-				mu.Unlock()
+			if domainName == "" {
 				continue
 			}
-			existingEntries[fullURL] = true
-			mu.Unlock()
+			fullURL := "https://" + domainName
 
-			// get a chrome user agent
+			// We already marked it visited in the standard pass
+			// but you may re-check if you prefer:
+			//
+			// added, err := visitedStore.MarkVisited(fullURL)
+			// if err != nil { ... }
+			// if !added { continue }
+
+			// 3. Chromedp fallback extraction
 			userAgent, _ := GetUserAgent("crawler-chrome")
 			title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
 
 			if title == "" || desc == "" {
 				printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
 				continue
 			}
 
-			// write to file
+			// 4. Write to file
 			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
 				fullURL, title, keywords, desc, rank)
 
 			mu.Lock()
-			file.WriteString(line)
+			_, _ = file.WriteString(line)
 			mu.Unlock()
 		}
 	}()
@@ -194,7 +193,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Feed domains into standardCh
 	go func() {
 		for _, dom := range domains {
-			// optionally, if maxPages is relevant, you can track how many have been processed
 			standardCh <- dom
 		}
 		// close the standardCh once all are queued
@@ -210,5 +208,12 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
+	// Optionally flush the visited store once more
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("visitedStore flush error: %v", err)
+		}
+	}
+
 	return nil
 }
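For completeness, a quick usage example of the hypothetical VisitedStore sketched after the first hunk, showing the test-and-set behaviour the workers above rely on: the first MarkVisited call for a URL reports added=true, a repeat reports false, and Flush persists whatever is still buffered. The file name here is illustrative only.

package main

import "fmt"

func main() {
	// "visited-urls.txt" is just an example path for the sketch above.
	store, err := NewVisitedStore("visited-urls.txt", 50)
	if err != nil {
		fmt.Println("init error:", err)
		return
	}
	for _, u := range []string{"https://example.com", "https://example.com"} {
		added, err := store.MarkVisited(u)
		if err != nil {
			fmt.Println("MarkVisited error:", err)
			continue
		}
		fmt.Printf("%s added=%v\n", u, added) // true on the first call, false on the repeat
	}
	if err := store.Flush(); err != nil {
		fmt.Println("flush error:", err)
	}
}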