Search/crawler.go


package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
)

// webCrawlerInit is called during init on program start
func webCrawlerInit() {
	go func() {
		// First run immediately
		runCrawlerAndIndexer()

		// Then run periodically based on CrawlingInterval
		ticker := time.NewTicker(config.CrawlingInterval)
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
	startPeriodicIndexing(outFile, config.IndexRefreshInterval)

	printDebug("Crawl + index refresh completed.")
}

// readDomainsCSV returns a slice of (rank,domain) from a local CSV file
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)

	// Skip header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()
		// Split by commas, not tabs
		fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}
		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}

// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
	// Track URLs already seen (from a previous outFile and within this run) so they are not written twice
	existingEntries := make(map[string]bool)
	var mu sync.Mutex // Mutex to protect access to the map
	if _, err := os.Stat(outFile); err == nil {
		file, err := os.Open(outFile)
		if err != nil {
			return fmt.Errorf("unable to open %s: %v", outFile, err)
		}
		defer file.Close()
		scanner := bufio.NewScanner(file)
		for scanner.Scan() {
			line := scanner.Text()
			parts := strings.SplitN(line, "|", 5)
			if len(parts) >= 1 {
				existingEntries[parts[0]] = true
			}
		}
	}

	// Open file for writing (truncate if existing)
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()
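
	// A buffered channel serves as a counting semaphore, capping the number of
	// crawler goroutines running at once at concurrentCrawlers.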
	semaphore := make(chan struct{}, concurrentCrawlers)
	var wg sync.WaitGroup

	for _, d := range domains {
		wg.Add(1)
		semaphore <- struct{}{}
		go func(domain [2]string) {
			defer wg.Done()
			defer func() { <-semaphore }()

			rank := domain[0]
			domainName := domain[1]
			fullURL := "https://" + domainName

			mu.Lock()
			if domainName == "" || existingEntries[fullURL] {
				mu.Unlock()
				return
			}
			existingEntries[fullURL] = true
			mu.Unlock()
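
			// fetchPageMetadata is defined elsewhere in the package (not shown here);
			// it is expected to fetch the page at fullURL and return its title,
			// description and keywords.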
			title, desc, keywords := fetchPageMetadata(fullURL)

			// Skip saving if title or description is missing
			if title == "" || desc == "" {
				printDebug("Skipping %s: missing title or description", fullURL)
				return
			}

			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
				fullURL,
				title,
				keywords,
				desc,
				rank,
			)

			// Serialize writes so concurrent goroutines don't interleave partial lines in outFile
			mu.Lock()
			if _, err := file.WriteString(line); err != nil {
				printErr("Error writing %s to %s: %v", fullURL, outFile, err)
			}
			mu.Unlock()
		}(d)
	}

	wg.Wait()
	return nil
}