added more config values for indexing + fixed handling of values that are missing from the config file

This commit is contained in:
partisan 2024-12-30 17:19:20 +01:00
parent 047cccd19f
commit 13e1d6119b
3 changed files with 153 additions and 94 deletions
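
The diff below reads four new settings off the config object (CrawlingInterval, IndexRefreshInterval, MaxPagesPerDomain, ConcurrentCrawlers) alongside the existing DriveCache.Path. A rough sketch of the fields this implies, assuming a plain struct-based config; the struct itself lives in one of the changed files not shown here, so everything except those five names is an assumption:

    package config // hypothetical package name

    import "time"

    // Sketch only: the real definition lives in the project's config handling.
    type Config struct {
        DriveCache struct {
            Path string // existing cache path, used for data_to_index.txt
        }
        CrawlingInterval     time.Duration // how often runCrawlerAndIndexer is kicked off
        IndexRefreshInterval time.Duration // how often data_to_index.txt is re-indexed
        MaxPagesPerDomain    int           // page budget per crawled domain
        ConcurrentCrawlers   int           // number of domains crawled in parallel
    }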

@@ -7,6 +7,7 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"time"
"golang.org/x/net/html"
@@ -18,8 +19,8 @@ func webCrawlerInit() {
// First run immediately
runCrawlerAndIndexer()
// Then every 24h (adjust as needed)
ticker := time.NewTicker(24 * time.Hour)
// Then run periodically based on CrawlingInterval
ticker := time.NewTicker(config.CrawlingInterval)
for range ticker.C {
runCrawlerAndIndexer()
}
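
For the "handling of values that are missing" part of the commit message, the natural reading is that missing or zero config values now fall back to defaults instead of producing a non-positive interval (time.NewTicker panics when the duration is <= 0). A hedged sketch of that kind of guard after loading the config; only the 24h fallback is taken from the old code, the other defaults are invented for illustration:

    // Assumed post-load defaults; 24h mirrors the previously hardcoded interval,
    // the remaining values are placeholders, not taken from this commit.
    if config.CrawlingInterval <= 0 {
        config.CrawlingInterval = 24 * time.Hour
    }
    if config.IndexRefreshInterval <= 0 {
        config.IndexRefreshInterval = 2 * time.Hour
    }
    if config.ConcurrentCrawlers <= 0 {
        config.ConcurrentCrawlers = 5
    }
    if config.MaxPagesPerDomain <= 0 {
        config.MaxPagesPerDomain = 10
    }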
@@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
// 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
if err := crawlDomainsToFile(domains, outFile); err != nil {
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
printErr("Error crawling domains: %v", err)
return
}
// 3. Re-index data_to_index.txt
if err := IndexFile(outFile); err != nil {
printErr("Error indexing data_to_index.txt: %v", err)
return
}
// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
startPeriodicIndexing(outFile, config.IndexRefreshInterval)
printDebug("Crawl + index refresh completed.")
}
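
startPeriodicIndexing replaces the one-shot IndexFile call but is not part of this hunk; given its arguments it presumably re-runs IndexFile on a timer. A guess at its shape, not taken from the diff (the goroutine and the error handling are assumptions):

    // Hypothetical reconstruction of startPeriodicIndexing.
    func startPeriodicIndexing(filePath string, interval time.Duration) {
        go func() {
            for {
                if err := IndexFile(filePath); err != nil {
                    printErr("Error indexing %s: %v", filePath, err)
                }
                time.Sleep(interval)
            }
        }()
    }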
@@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
}
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string) error {
// Read existing data_to_index.txt into a map to prevent duplicates
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
existingEntries := make(map[string]bool)
if _, err := os.Stat(outFile); err == nil { // File exists
var mu sync.Mutex // Mutex to protect access to the map
if _, err := os.Stat(outFile); err == nil {
file, err := os.Open(outFile)
if err != nil {
return fmt.Errorf("unable to open %s: %v", outFile, err)
@@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
line := scanner.Text()
parts := strings.SplitN(line, "|", 5)
if len(parts) >= 1 {
existingEntries[parts[0]] = true // Mark existing domain
existingEntries[parts[0]] = true
}
}
}
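
The duplicate check above keys on the first pipe-delimited field of each existing line; the format it parses is the one written further down by fmt.Sprintf: url|title|keywords|description|rank. A tiny self-contained illustration (the record contents are made up):

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        // One made-up record in the data_to_index.txt format.
        line := "https://example.com|Example Title|example,keywords|An example description|42"
        parts := strings.SplitN(line, "|", 5)
        fmt.Println(parts[0]) // the URL, which the dedup map keys on
    }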
@@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
}
defer file.Close()
semaphore := make(chan struct{}, concurrentCrawlers)
var wg sync.WaitGroup
for _, d := range domains {
rank := d[0]
domain := d[1]
if domain == "" || existingEntries["https://"+domain] {
continue
}
wg.Add(1)
semaphore <- struct{}{}
go func(domain [2]string) {
defer wg.Done()
defer func() { <-semaphore }()
fullURL := "https://" + domain
title, desc, keywords := fetchPageMetadata(fullURL)
if title == "" {
title = "Unknown Title"
}
if desc == "" {
desc = "No Description"
}
rank := domain[0]
domainName := domain[1]
fullURL := "https://" + domainName
// Write unique domain to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
sanitize(title),
sanitize(keywords),
sanitize(desc),
rank,
)
if _, err := file.WriteString(line); err != nil {
return err
}
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
return
}
existingEntries[fullURL] = true
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
if title == "" {
title = "Unknown Title"
}
if desc == "" {
desc = "No Description"
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
sanitize(title),
sanitize(keywords),
sanitize(desc),
rank,
)
file.WriteString(line)
}(d)
}
wg.Wait()
return nil
}
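
The rewritten loop bounds parallelism with a buffered channel used as a semaphore plus a sync.WaitGroup, capped by ConcurrentCrawlers. A stripped-down, runnable illustration of the same pattern; the item list, the worker body, and the limit of 4 are placeholders:

    package main

    import (
        "fmt"
        "sync"
    )

    func main() {
        items := []string{"a", "b", "c", "d", "e"} // stand-ins for the crawl targets
        semaphore := make(chan struct{}, 4)        // one slot per concurrent worker, like ConcurrentCrawlers
        var wg sync.WaitGroup

        for _, it := range items {
            wg.Add(1)
            semaphore <- struct{}{} // blocks while all slots are taken
            go func(it string) {
                defer wg.Done()
                defer func() { <-semaphore }() // free the slot when done
                fmt.Println("processing", it)  // placeholder for the real crawl work
            }(it)
        }

        wg.Wait() // wait for every worker, as crawlDomainsToFile does
    }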