Search/crawler.go

package main

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"
)

// webCrawlerInit starts the background crawl loop and returns immediately.
//
// The loop runs in its own goroutine: it performs one crawl/index pass right
// away and then repeats the pass on every tick of a ticker configured with
// config.CrawlingInterval. The goroutine has no exit path, so it lives for the
// remainder of the process.
func webCrawlerInit() {
	crawlLoop := func() {
		// Initial pass, executed before the first interval has elapsed.
		runCrawlerAndIndexer()

		// Every later pass is driven by the ticker.
		ticker := time.NewTicker(config.CrawlingInterval)
		for {
			<-ticker.C
			runCrawlerAndIndexer()
		}
	}

	go crawlLoop()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
	startPeriodicIndexing(outFile, config.IndexRefreshInterval)

	printDebug("Crawl + index refresh completed.")
}

// readDomainsCSV loads (rank, domain) pairs from the CSV file at csvPath.
//
// The first line is treated as a header and discarded. Each following line is
// split on commas into at most three parts (rank, domain, remainder) and only
// the first two are kept. Surrounding double quotes and whitespace are stripped
// from both kept fields, so a row such as `2, "example.org" ,5` yields the
// domain "example.org" rather than a value with stray spaces. Blank lines are
// ignored; non-blank lines with fewer than two fields are reported through
// printDebug and skipped.
//
// It returns the collected pairs together with the error from opening the file
// or, if reading stopped early, the scanner's error (in which case the pairs
// read so far are still returned).
func readDomainsCSV(csvPath string) ([][2]string, error) {
	// Characters removed from both ends of a field: quotes plus whitespace that
	// commonly surrounds values in hand-edited or exported CSV files.
	const fieldCutset = "\" \t\r"

	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)
	// Skip header line. A false return (empty file or read error) also makes
	// the loop below a no-op, and the error surfaces via scanner.Err().
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "" {
			// Blank separator/trailing lines are not worth a debug message.
			continue
		}

		// Split by commas into up to 3 parts (rank, domain, popularity).
		fields := strings.SplitN(line, ",", 3)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}

		rank := strings.Trim(fields[0], fieldCutset)
		domain := strings.Trim(fields[1], fieldCutset)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}

// crawlDomainsToFile fetches metadata for every domain that is not already
// recorded in outFile and appends one line per successfully crawled domain.
//
// Parameters:
//   - domains: (rank, domain) pairs; entries with an empty domain are skipped.
//   - outFile: path of the result file. Each line has the form
//     "url|title|keywords|description|rank". The first field of every existing
//     line is used to decide which URLs were crawled before.
//   - maxPages: accepted for interface compatibility; not used by this function.
//   - concurrentCrawlers: maximum number of simultaneous fetches. Values below 1
//     are treated as 1, because a zero-capacity semaphore would block the
//     dispatch loop forever and a negative capacity would panic.
//
// Existing lines in outFile are preserved: the file is opened in append mode.
// (Truncating it, while also skipping the URLs it contained, would erase every
// previously crawled entry on each run.)
//
// Pages for which fetchPageMetadata returns an empty title or description are
// not written. The function returns an error if outFile cannot be read, opened,
// written or closed; fetch failures are not errors.
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
	seen, err := loadCrawledURLs(outFile)
	if err != nil {
		return err
	}

	// Append so that entries from earlier runs survive this one.
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %w", outFile, err)
	}

	if concurrentCrawlers < 1 {
		concurrentCrawlers = 1
	}
	semaphore := make(chan struct{}, concurrentCrawlers)

	var (
		wg       sync.WaitGroup
		mu       sync.Mutex // guards writeErr
		writeErr error      // first write failure, if any
	)

	for _, d := range domains {
		rank, domainName := d[0], d[1]
		if domainName == "" {
			continue
		}
		fullURL := "https://" + domainName

		// De-duplicate here, on the dispatching goroutine, so the map is only
		// ever touched by one goroutine and no worker is spawned needlessly.
		if seen[fullURL] {
			continue
		}
		seen[fullURL] = true

		wg.Add(1)
		semaphore <- struct{}{} // blocks while concurrentCrawlers fetches are in flight
		go func(rank, fullURL string) {
			defer wg.Done()
			defer func() { <-semaphore }()

			title, desc, keywords := fetchPageMetadata(fullURL)

			// Skip saving if title or description is missing
			if title == "" || desc == "" {
				printDebug("Skipping %s: missing title or description", fullURL)
				return
			}

			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
				fullURL,
				title,
				keywords,
				desc,
				rank,
			)
			// Each line goes out in a single WriteString call on the shared
			// *os.File; only the error bookkeeping needs the mutex.
			if _, err := file.WriteString(line); err != nil {
				mu.Lock()
				if writeErr == nil {
					writeErr = fmt.Errorf("unable to write to %s: %w", outFile, err)
				}
				mu.Unlock()
			}
		}(rank, fullURL)
	}

	wg.Wait()

	// Close explicitly (not via defer) so a failed flush-on-close is reported.
	if err := file.Close(); err != nil && writeErr == nil {
		writeErr = fmt.Errorf("unable to close %s: %w", outFile, err)
	}
	return writeErr
}

// loadCrawledURLs returns the set of URLs already stored in path, taken from
// the first "|"-separated field of each line. A missing file yields an empty set.
func loadCrawledURLs(path string) (map[string]bool, error) {
	seen := make(map[string]bool)

	f, err := os.Open(path)
	if err != nil {
		if os.IsNotExist(err) {
			return seen, nil
		}
		return nil, fmt.Errorf("unable to open %s: %w", path, err)
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	// Descriptions can be long; allow lines up to 1 MiB instead of the 64 KiB default.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
	for scanner.Scan() {
		url := strings.SplitN(scanner.Text(), "|", 2)[0]
		if url != "" {
			seen[url] = true
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("unable to read %s: %w", path, err)
	}
	return seen, nil
}
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`package main`

			`import (`
			`"bufio"`
			`"fmt"`
			`"os"`
			`"path/filepath"`
			`"strings"`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`"sync"`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`"time"`
			`)`

			`// webCrawlerInit is called during init on program start`
			`func webCrawlerInit() {`
			`go func() {`
			`// First run immediately`
			`runCrawlerAndIndexer()`

added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`// Then run periodically based on CrawlingInterval`
			`ticker := time.NewTicker(config.CrawlingInterval)`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`for range ticker.C {`
			`runCrawlerAndIndexer()`
			`}`
			`}()`
			`}`

			`// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes`
			`func runCrawlerAndIndexer() {`
			`// 1. Read domains.csv`
			`domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))`
			`if err != nil {`
			`printErr("Error reading domains.csv: %v", err)`
			`return`
			`}`

			`// 2. Crawl each domain and write results to data_to_index.txt`
			`outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`printErr("Error crawling domains: %v", err)`
			`return`
			`}`

added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval`
			`startPeriodicIndexing(outFile, config.IndexRefreshInterval)`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00
			`printDebug("Crawl + index refresh completed.")`
			`}`

			`// readDomainsCSV returns a slice of (rank,domain) from a local CSV file`
			`func readDomainsCSV(csvPath string) ([][2]string, error) {`
			`f, err := os.Open(csvPath)`
			`if err != nil {`
			`return nil, err`
			`}`
			`defer f.Close()`

			`var result [][2]string`
			`scanner := bufio.NewScanner(f)`
			`// Skip header line`
			`scanner.Scan()`

			`for scanner.Scan() {`
			`line := scanner.Text()`
			`// Split by commas, not tabs`
			`fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)`
			`if len(fields) < 2 {`
			`printDebug("Skipping malformed line: %s", line)`
			`continue`
			`}`
			`// Remove quotes around fields, if present`
			rank := strings.Trim(fields[0], `"`)
			domain := strings.Trim(fields[1], `"`)
			`result = append(result, [2]string{rank, domain})`
			`}`
			`return result, scanner.Err()`
			`}`

			`// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`existingEntries := make(map[string]bool)`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`var mu sync.Mutex // Mutex to protect access to the map`

			`if _, err := os.Stat(outFile); err == nil {`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`file, err := os.Open(outFile)`
			`if err != nil {`
			`return fmt.Errorf("unable to open %s: %v", outFile, err)`
			`}`
			`defer file.Close()`

			`scanner := bufio.NewScanner(file)`
			`for scanner.Scan() {`
			`line := scanner.Text()`
			`parts := strings.SplitN(line, "\|", 5)`
			`if len(parts) >= 1 {`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`existingEntries[parts[0]] = true`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`}`
			`}`
			`}`

			`// Open file for writing (truncate if existing)`
			`file, err := os.OpenFile(outFile, os.O_CREATE\|os.O_WRONLY\|os.O_TRUNC, 0644)`
			`if err != nil {`
			`return fmt.Errorf("unable to open %s for writing: %v", outFile, err)`
			`}`
			`defer file.Close()`

added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`semaphore := make(chan struct{}, concurrentCrawlers)`
			`var wg sync.WaitGroup`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`for _, d := range domains {`
			`wg.Add(1)`
			`semaphore <- struct{}{}`
			`go func(domain [2]string) {`
			`defer wg.Done()`
			`defer func() { <-semaphore }()`

			`rank := domain[0]`
			`domainName := domain[1]`
			`fullURL := "https://" + domainName`

			`mu.Lock()`
			`if domainName == "" \|\| existingEntries[fullURL] {`
			`mu.Unlock()`
			`return`
			`}`
			`existingEntries[fullURL] = true`
			`mu.Unlock()`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`title, desc, keywords := fetchPageMetadata(fullURL)`
improved crawler data extraction 2025-01-01 13:49:16 +01:00
			`// Skip saving if title or description is missing`
			`if title == "" \|\| desc == "" {`
			`printDebug("Skipping %s: missing title or description", fullURL)`
			`return`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`}`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`line := fmt.Sprintf("%s\|%s\|%s\|%s\|%s\n",`
			`fullURL,`
improved crawler data extraction 2025-01-01 13:49:16 +01:00			`title,`
			`keywords,`
			`desc,`
added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`rank,`
			`)`
			`file.WriteString(line)`
			`}(d)`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`}`

added more config values for indexing + fixed value handling when Its missing in config file 2024-12-30 17:19:20 +01:00			`wg.Wait()`
added website crawling and indexing crawled results 2024-12-29 22:54:55 +01:00			`return nil`
			`}`