package main

import (
	"bufio"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
)

// webCrawlerInit is called during init on program start
func webCrawlerInit() {
	go func() {
		// First run immediately
		runCrawlerAndIndexer()

		// Then run periodically based on CrawlingInterval
		ticker := time.NewTicker(config.CrawlingInterval)
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
	startPeriodicIndexing(outFile, config.IndexRefreshInterval)

	printDebug("Crawl + index refresh completed.")
}

// readDomainsCSV returns a slice of (rank, domain) pairs from a local CSV file
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)

	// Skip the header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()

		// Split by commas into up to 3 parts (rank, domain, popularity)
		fields := strings.SplitN(line, ",", 3)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}

		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}

// crawlDomainsToFile visits each domain, extracts minimal metadata, and writes
// one pipe-delimited line per domain to outFile. maxPages is accepted for
// future use but is currently unused: only each domain's root page is fetched.
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
	existingEntries := make(map[string]bool)
	var mu sync.Mutex // protects existingEntries and serializes writes to the output file

	// Load URLs already present in a previous output file so they are not re-crawled
	if _, err := os.Stat(outFile); err == nil {
		file, err := os.Open(outFile)
		if err != nil {
			return fmt.Errorf("unable to open %s: %v", outFile, err)
		}
		defer file.Close()

		scanner := bufio.NewScanner(file)
		for scanner.Scan() {
			line := scanner.Text()
			parts := strings.SplitN(line, "|", 5)
			if len(parts) >= 1 {
				existingEntries[parts[0]] = true
			}
		}
	}

	// Open the file for writing (truncate if it already exists)
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()

	semaphore := make(chan struct{}, concurrentCrawlers)
	var wg sync.WaitGroup

	for _, d := range domains {
		wg.Add(1)
		semaphore <- struct{}{}
		go func(domain [2]string) {
			defer wg.Done()
			defer func() { <-semaphore }()

			rank := domain[0]
			domainName := domain[1]
			fullURL := "https://" + domainName

			mu.Lock()
			if domainName == "" || existingEntries[fullURL] {
				mu.Unlock()
				return
			}
			existingEntries[fullURL] = true
			mu.Unlock()

			title, desc, keywords := fetchPageMetadata(fullURL)
			if title == "" {
				title = "Unknown Title"
			}
			if desc == "" {
				desc = "No Description"
			}

			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
				fullURL,
				sanitize(title),
				sanitize(keywords),
				sanitize(desc),
				rank,
			)

			// Serialize writes so lines from concurrent goroutines cannot interleave
			mu.Lock()
			file.WriteString(line)
			mu.Unlock()
		}(d)
	}

	wg.Wait()
	return nil
}
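// Each record written above is a single pipe-delimited line of the form
// url|title|keywords|description|rank. As a rough sketch of how a consumer of
// data_to_index.txt could decode such a record, the helper below splits a line
// back into its fields; the indexRecord type and parseIndexLine name are
// hypothetical and not part of the original crawler.
type indexRecord struct {
	URL, Title, Keywords, Description, Rank string
}

func parseIndexLine(line string) (indexRecord, bool) {
	parts := strings.SplitN(line, "|", 5)
	if len(parts) < 5 {
		return indexRecord{}, false // malformed or truncated record
	}
	return indexRecord{
		URL:         parts[0],
		Title:       parts[1],
		Keywords:    parts[2],
		Description: parts[3],
		Rank:        parts[4],
	}, true
}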
// fetchPageMetadata does a simple GET and parses the <title> tag,
// meta[name=description], and meta[name=keywords] from the response body.
func fetchPageMetadata(pageURL string) (string, string, string) {
	// Generate a User-Agent using the GetUserAgent helper
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printWarn("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printWarn("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}

	// Set the dynamically generated User-Agent
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		printWarn("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Skip forbidden/unauthorized pages and reject any other non-2xx response
	if resp.StatusCode == 403 || resp.StatusCode == 401 {
		printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
		return "", "", ""
	} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// Parse the HTML
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printWarn("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	var title, desc, keywords string
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		if n.Type == html.ElementNode && n.Data == "meta" {
			var nameVal, contentVal string
			for _, attr := range n.Attr {
				switch strings.ToLower(attr.Key) {
				case "name":
					nameVal = strings.ToLower(attr.Val)
				case "content":
					contentVal = attr.Val
				}
			}
			if nameVal == "description" {
				desc = contentVal
			} else if nameVal == "keywords" {
				keywords = contentVal
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)

	return title, desc, keywords
}

// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	input = strings.TrimSpace(input)
	return input
}
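// crawlSingleDomain is a hypothetical debugging helper, not part of the
// original file, sketching how the pieces above fit together for one domain:
// fetch the root page's metadata, apply the same fallbacks and sanitization
// used in crawlDomainsToFile, and return the record line that would be written.
func crawlSingleDomain(domainName, rank string) string {
	fullURL := "https://" + domainName
	title, desc, keywords := fetchPageMetadata(fullURL)
	if title == "" {
		title = "Unknown Title"
	}
	if desc == "" {
		desc = "No Description"
	}
	return fmt.Sprintf("%s|%s|%s|%s|%s",
		fullURL, sanitize(title), sanitize(keywords), sanitize(desc), rank)
}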