package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
)
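
// This file relies on package-level helpers and config fields defined elsewhere
// in the package: config.CrawlingInterval, config.DriveCache.Path,
// config.MaxPagesPerDomain, config.IndexRefreshInterval,
// config.ConcurrentStandardCrawlers, config.ConcurrentChromeCrawlers, plus
// GetUserAgent, fetchPageMetadataStandard, fetchPageMetadataChrome,
// startPeriodicIndexing and the print* logging helpers.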

// webCrawlerInit is called during init on program start.
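//
// Illustrative wiring only (assumed; the actual init hook lives elsewhere in
// the package):
//
//	func init() {
//		webCrawlerInit()
//	}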
func webCrawlerInit() {
	go func() {
		// First run immediately
		runCrawlerAndIndexer()

		// Then run periodically based on CrawlingInterval
		ticker := time.NewTicker(config.CrawlingInterval)
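		// The ticker is intentionally never stopped: this goroutine is expected
		// to run for the lifetime of the process.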
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes.
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
	startPeriodicIndexing(outFile, config.IndexRefreshInterval)

	printDebug("Crawl + index refresh completed.")
}

// readDomainsCSV returns a slice of (rank, domain) pairs read from a local CSV file.
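// Each data row is expected to look roughly like (illustrative values):
//
//	"1","example.com","12345"
//
// i.e. rank, domain and an optional popularity column, with or without quotes.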
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)
	// Skip header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()
		// Split by commas, not tabs
		fields := strings.SplitN(line, ",", 3) // splits into up to 3 parts (rank, domain, popularity)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}
		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}

// crawlDomainsToFile runs an async two-stage pipeline:
//  1. "standard" goroutines read from standardCh -> attempt standard extraction -> on failure, push to chromeCh
//  2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> on failure, skip the domain
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
	existingEntries := make(map[string]bool)
	var mu sync.Mutex // for existingEntries + file writes

	// Read existing entries from outFile if it exists
	if _, err := os.Stat(outFile); err == nil {
		file, err := os.Open(outFile)
		if err != nil {
			return fmt.Errorf("unable to open %s: %v", outFile, err)
		}
		defer file.Close()
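		// Note: this deferred Close runs when crawlDomainsToFile returns,
		// not at the end of this if block.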
		scanner := bufio.NewScanner(file)
		for scanner.Scan() {
			line := scanner.Text()
			parts := strings.SplitN(line, "|", 5)
			if len(parts) >= 1 {
				existingEntries[parts[0]] = true
			}
		}
	}

	// Open file for writing (truncate if existing)
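	// Previously read entries are used only for de-duplication; because the
	// file is truncated here, earlier lines are not carried over.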
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()

	// Prepare channels
	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
	chromeCh := make(chan [2]string, 1000)

	// 1) Spawn standard workers
	var wgStandard sync.WaitGroup
	for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
		wgStandard.Add(1)
		go func() {
			defer wgStandard.Done()
			for dom := range standardCh {
				rank := dom[0]
				domainName := dom[1]
				fullURL := "https://" + domainName

				// Mark the domain as seen so we don't re-crawl duplicates
				mu.Lock()
				if domainName == "" || existingEntries[fullURL] {
					mu.Unlock()
					continue
				}
				existingEntries[fullURL] = true
				mu.Unlock()

				// Get a standard user agent
				userAgent, _ := GetUserAgent("crawler-std")
				title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)

				if title == "" || desc == "" {
					// Standard extraction failed; hand the domain to the Chrome workers
					chromeCh <- dom
					continue
				}

				// Write to file
				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
					fullURL, title, keywords, desc, rank)

				mu.Lock()
				file.WriteString(line)
				mu.Unlock()
			}
		}()
	}

	// 2) Spawn chrome workers
	var wgChrome sync.WaitGroup
	for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
		wgChrome.Add(1)
		go func() {
			defer wgChrome.Done()
			for dom := range chromeCh {
				rank := dom[0]
				domainName := dom[1]
				fullURL := "https://" + domainName

				// No de-duplication check here: the standard worker already marked
				// this URL in existingEntries before handing it off, so re-checking
				// would always skip the domain and defeat the Chrome fallback.

				// Get a chrome user agent
				userAgent, _ := GetUserAgent("crawler-chrome")
				title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)

				if title == "" || desc == "" {
					printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
					continue
				}

				// Write to file
				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
					fullURL, title, keywords, desc, rank)

				mu.Lock()
				file.WriteString(line)
				mu.Unlock()
			}
		}()
	}

	// Feed domains into standardCh
	go func() {
		for _, dom := range domains {
			// Note: maxPages (config.MaxPagesPerDomain) is not applied here yet;
			// if a per-domain page limit is needed, this is where to track it.
			standardCh <- dom
		}
		// Close standardCh once all domains are queued
		close(standardCh)
	}()

	// Wait for standard workers to finish, then close chromeCh
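	// (closing chromeCh only after every standard worker has exited guarantees
	// that no worker can send on a closed channel)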
	go func() {
		wgStandard.Wait()
		close(chromeCh)
	}()

	// Wait for chrome workers to finish
	wgChrome.Wait()

	return nil
}