added more config values for indexing + fixed value handling when it's missing in the config file
parent 047cccd19f
commit 13e1d6119b
3 changed files with 153 additions and 94 deletions
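Only the crawler.go hunks are shown below; the config-loading change that supplies fallbacks for values missing from the config file lives in one of the other changed files and is not part of this excerpt. As a rough sketch of that fallback pattern — field names taken from how crawler.go uses them, while the function name, Config type, and default values are assumptions for illustration only:

// Hypothetical sketch: apply defaults for indexing/crawling values that are
// absent from the config file. Not the repository's actual implementation.
func applyIndexingDefaults(cfg *Config) {
	if cfg.CrawlingInterval == 0 {
		cfg.CrawlingInterval = 24 * time.Hour // assumed default
	}
	if cfg.IndexRefreshInterval == 0 {
		cfg.IndexRefreshInterval = 2 * time.Minute // assumed default
	}
	if cfg.MaxPagesPerDomain == 0 {
		cfg.MaxPagesPerDomain = 10 // assumed default
	}
	if cfg.ConcurrentCrawlers == 0 {
		cfg.ConcurrentCrawlers = 5 // assumed default
	}
}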
crawler.go (84 changed lines)
@@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"
 
 	"golang.org/x/net/html"
@@ -18,8 +19,8 @@ func webCrawlerInit() {
 	// First run immediately
 	runCrawlerAndIndexer()
 
-	// Then every 24h (adjust as needed)
-	ticker := time.NewTicker(24 * time.Hour)
+	// Then run periodically based on CrawlingInterval
+	ticker := time.NewTicker(config.CrawlingInterval)
 	for range ticker.C {
 		runCrawlerAndIndexer()
 	}
@@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
 
 	// 2. Crawl each domain and write results to data_to_index.txt
 	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-	if err := crawlDomainsToFile(domains, outFile); err != nil {
+	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
 		printErr("Error crawling domains: %v", err)
 		return
 	}
 
-	// 3. Re-index data_to_index.txt
-	if err := IndexFile(outFile); err != nil {
-		printErr("Error indexing data_to_index.txt: %v", err)
-		return
-	}
+	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
+	startPeriodicIndexing(outFile, config.IndexRefreshInterval)
 
 	printDebug("Crawl + index refresh completed.")
 }
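startPeriodicIndexing is introduced by this commit but its body is not in the crawler.go hunks shown here; judging only from the call site, it takes the output file path and config.IndexRefreshInterval. A minimal sketch of what such a helper could look like, assuming it simply re-runs IndexFile on a ticker (not the repository's actual implementation):

// Sketch only: re-index the crawl output on a fixed interval in the background.
func startPeriodicIndexing(filePath string, interval time.Duration) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			if err := IndexFile(filePath); err != nil {
				printErr("Error indexing %s: %v", filePath, err)
			}
			<-ticker.C
		}
	}()
}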
@@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 }
 
 // crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
-func crawlDomainsToFile(domains [][2]string, outFile string) error {
-	// Read existing data_to_index.txt into a map to prevent duplicates
+func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
 	existingEntries := make(map[string]bool)
-	if _, err := os.Stat(outFile); err == nil { // File exists
+	var mu sync.Mutex // Mutex to protect access to the map
+
+	if _, err := os.Stat(outFile); err == nil {
 		file, err := os.Open(outFile)
 		if err != nil {
 			return fmt.Errorf("unable to open %s: %v", outFile, err)
@@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
 		line := scanner.Text()
 		parts := strings.SplitN(line, "|", 5)
 		if len(parts) >= 1 {
-			existingEntries[parts[0]] = true // Mark existing domain
+			existingEntries[parts[0]] = true
 		}
 	}
 }
@@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
 	}
 	defer file.Close()
 
+	semaphore := make(chan struct{}, concurrentCrawlers)
+	var wg sync.WaitGroup
+
 	for _, d := range domains {
-		rank := d[0]
-		domain := d[1]
-		if domain == "" || existingEntries["https://"+domain] {
-			continue
-		}
+		wg.Add(1)
+		semaphore <- struct{}{}
+		go func(domain [2]string) {
+			defer wg.Done()
+			defer func() { <-semaphore }()
 
-		fullURL := "https://" + domain
-		title, desc, keywords := fetchPageMetadata(fullURL)
-		if title == "" {
-			title = "Unknown Title"
-		}
-		if desc == "" {
-			desc = "No Description"
-		}
+			rank := domain[0]
+			domainName := domain[1]
+			fullURL := "https://" + domainName
 
-		// Write unique domain to file
-		line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-			fullURL,
-			sanitize(title),
-			sanitize(keywords),
-			sanitize(desc),
-			rank,
-		)
-		if _, err := file.WriteString(line); err != nil {
-			return err
-		}
+			mu.Lock()
+			if domainName == "" || existingEntries[fullURL] {
+				mu.Unlock()
+				return
+			}
+			existingEntries[fullURL] = true
+			mu.Unlock()
 
-		existingEntries[fullURL] = true
+			title, desc, keywords := fetchPageMetadata(fullURL)
+			if title == "" {
+				title = "Unknown Title"
+			}
+			if desc == "" {
+				desc = "No Description"
+			}
 
+			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+				fullURL,
+				sanitize(title),
+				sanitize(keywords),
+				sanitize(desc),
+				rank,
+			)
+			file.WriteString(line)
+		}(d)
 	}
 
+	wg.Wait()
 	return nil
 }
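The rewritten loop bounds concurrency with a buffered channel used as a semaphore plus a sync.WaitGroup, and guards the shared existingEntries map with the mutex declared earlier. A self-contained illustration of that pattern, simplified and not taken from the repository:

package main

import (
	"fmt"
	"sync"
)

func main() {
	concurrentWorkers := 3 // plays the role of config.ConcurrentCrawlers
	semaphore := make(chan struct{}, concurrentWorkers)
	var wg sync.WaitGroup
	var mu sync.Mutex
	seen := make(map[string]bool) // plays the role of existingEntries

	for _, item := range []string{"a", "b", "a", "c"} {
		wg.Add(1)
		semaphore <- struct{}{} // blocks once concurrentWorkers goroutines are in flight
		go func(item string) {
			defer wg.Done()
			defer func() { <-semaphore }()

			mu.Lock()
			if seen[item] {
				mu.Unlock()
				return // skip duplicates
			}
			seen[item] = true
			mu.Unlock()

			fmt.Println("processing", item)
		}(item)
	}
	wg.Wait()
}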