// Search/crawler.go

package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
)
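
// The crawler reads several values from the shared config struct, which is
// defined elsewhere in the package. The fields assumed by this file (inferred
// from the call sites below) are roughly:
//
//	config.DriveCache.Path            string        // cache dir for domains.csv, visited-urls.txt, data_to_index.txt
//	config.CrawlingInterval           time.Duration // how often to re-run the crawl
//	config.IndexRefreshInterval       time.Duration // how often to re-index data_to_index.txt
//	config.MaxPagesPerDomain          int           // per-domain crawl budget
//	config.ConcurrentStandardCrawlers int           // workers doing plain HTTP metadata extraction
//	config.ConcurrentChromeCrawlers   int           // workers doing chromedp fallback extraction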

// Global visited-URL store shared by all crawler workers.
var visitedStore *VisitedStore
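
// VisitedStore is defined elsewhere in the package; the API assumed from the
// call sites in this file is roughly:
//
//	NewVisitedStore(path string, batchSize int) (*VisitedStore, error)
//	(*VisitedStore).MarkVisited(url string) (added bool, err error)
//	(*VisitedStore).Flush() error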

// webCrawlerInit is called during init on program start
func webCrawlerInit() {
	// Initialize the store with, say, batchSize=50
	store, err := NewVisitedStore(filepath.Join(config.DriveCache.Path, "visited-urls.txt"), 50)
	if err != nil {
		// Without a working visited store the workers below would dereference
		// a nil pointer, so don't start the crawler at all.
		printErr("Failed to initialize visited store: %v", err)
		return
	}
	visitedStore = store

	// Start the periodic crawler
	go func() {
		// First run immediately
		runCrawlerAndIndexer()
		// Then run periodically
		ticker := time.NewTicker(config.CrawlingInterval)
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}
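
// Expected wiring (an assumption; the actual call site lives elsewhere in the
// repo, per the comment above that this runs "during init on program start"):
//
//	func init() {
//		webCrawlerInit()
//	}
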
// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
	// (startPeriodicIndexing is assumed to guard against spawning a duplicate
	// indexer when this runs again on the next crawl cycle).
	startPeriodicIndexing(outFile, config.IndexRefreshInterval)

	printDebug("Crawl + index refresh completed.")
}

// readDomainsCSV returns a slice of (rank,domain) from a local CSV file
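// The expected layout (an assumption inferred from the parsing below) is a
// header row followed by optionally quoted rank,domain,popularity rows, e.g.
//
//	"rank","domain","popularity"
//	"1","example.com","987654"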
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)

	// Skip header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()
		// Split by commas, not tabs
		fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}
		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}

// crawlDomainsToFile does an async pipeline:
// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
// NOTE: maxPages is accepted for future use but not applied yet; only the
// root page of each domain is fetched below.
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
	var mu sync.Mutex // guards writes to the shared output file

	// Open file for writing (truncate if existing)
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()

	// Prepare channels
	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
	chromeCh := make(chan [2]string, 1000)

	// 1) Spawn standard workers
	var wgStandard sync.WaitGroup
	for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
		wgStandard.Add(1)
		go func() {
			defer wgStandard.Done()
			for dom := range standardCh {
				rank := dom[0]
				domainName := dom[1]
				if domainName == "" {
					continue
				}
				fullURL := "https://" + domainName

				// 1. Check if we've already visited this URL
				added, err := visitedStore.MarkVisited(fullURL)
				if err != nil {
					printErr("MarkVisited error for %s: %v", fullURL, err)
					continue
				}
				if !added {
					// Already visited
					continue
				}

				// 2. Standard extraction
				userAgent, _ := GetUserAgent("crawler-std")
				title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
				// If missing, push to Chrome queue
				if title == "" || desc == "" {
					chromeCh <- dom
					continue
				}

				// 3. Write to file
				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
					fullURL, title, keywords, desc, rank)
				mu.Lock()
				if _, err := file.WriteString(line); err != nil {
					printErr("Failed to write %s to %s: %v", fullURL, outFile, err)
				}
				mu.Unlock()
			}
		}()
	}

	// 2) Spawn chrome workers
	var wgChrome sync.WaitGroup
	for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
		wgChrome.Add(1)
		go func() {
			defer wgChrome.Done()
			for dom := range chromeCh {
				rank := dom[0]
				domainName := dom[1]
				if domainName == "" {
					continue
				}
				fullURL := "https://" + domainName

				// The URL was already marked visited in the standard pass,
				// but you may re-check here if you prefer:
				//
				//	added, err := visitedStore.MarkVisited(fullURL)
				//	if err != nil { ... }
				//	if !added { continue }

				// 3. Chromedp fallback extraction
				userAgent, _ := GetUserAgent("crawler-chrome")
				title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
				if title == "" || desc == "" {
					printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
					continue
				}

				// 4. Write to file
				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
					fullURL, title, keywords, desc, rank)
				mu.Lock()
				if _, err := file.WriteString(line); err != nil {
					printErr("Failed to write %s to %s: %v", fullURL, outFile, err)
				}
				mu.Unlock()
			}
		}()
	}

	// Feed domains into standardCh
	go func() {
		for _, dom := range domains {
			standardCh <- dom
		}
		// close the standardCh once all are queued
		close(standardCh)
	}()

	// Wait for standard workers to finish, then close chromeCh
	go func() {
		wgStandard.Wait()
		close(chromeCh)
	}()

	// Wait for chrome workers to finish
	wgChrome.Wait()

	// Optionally flush the visited store once more
	if visitedStore != nil {
		if err := visitedStore.Flush(); err != nil {
			printErr("visitedStore flush error: %v", err)
		}
	}
	return nil
}
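
// The helpers referenced above are defined elsewhere in the package; the
// signatures assumed from the call sites in this file are roughly:
//
//	GetUserAgent(role string) (string, error)
//	fetchPageMetadataStandard(pageURL, userAgent string) (title, desc, keywords string)
//	fetchPageMetadataChrome(pageURL, userAgent string) (title, desc, keywords string)
//	startPeriodicIndexing(indexFile string, interval time.Duration)
//	printDebug / printWarn / printErr (printf-style logging helpers)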