Search/crawler.go

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// webCrawlerInit is called during init on program start
func webCrawlerInit() {
	go func() {
		// First run immediately
		runCrawlerAndIndexer()

		// Then every 24h (adjust as needed)
		ticker := time.NewTicker(24 * time.Hour)
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt
	if err := IndexFile(outFile); err != nil {
		printErr("Error indexing data_to_index.txt: %v", err)
		return
	}

	printDebug("Crawl + index refresh completed.")
}
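
// The domains list is expected to be a CSV with a header row followed by
// "rank,domain,popularity" rows; fields may or may not be quoted.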
// readDomainsCSV returns a slice of (rank,domain) from a local CSV file
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)

	// Skip header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()

		// Split by commas, not tabs
		fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}

		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}
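
// Each line written to data_to_index.txt is pipe-delimited:
//
//	url|title|keywords|description|rank
//
// sanitize strips "|" and newlines from the free-text fields so the format
// stays unambiguous.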
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string) error {
	// Read existing data_to_index.txt into a map so domains that were already
	// written are not crawled again
	existingEntries := make(map[string]bool)
	if _, err := os.Stat(outFile); err == nil { // File exists
		existingFile, err := os.Open(outFile)
		if err != nil {
			return fmt.Errorf("unable to open %s: %v", outFile, err)
		}
		scanner := bufio.NewScanner(existingFile)
		for scanner.Scan() {
			line := scanner.Text()
			parts := strings.SplitN(line, "|", 5)
			if len(parts) > 0 && parts[0] != "" {
				existingEntries[parts[0]] = true // Mark existing domain
			}
		}
		existingFile.Close()
	}
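
	// NOTE: the file is truncated below, so the entries collected above are only
	// used to skip re-crawling; previously written lines are not copied into the
	// new file.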
	// Open file for writing (truncate if existing)
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()

	for _, d := range domains {
		rank := d[0]
		domain := d[1]
		if domain == "" || existingEntries["https://"+domain] {
			continue
		}

		fullURL := "https://" + domain
		title, desc, keywords := fetchPageMetadata(fullURL)
		if title == "" {
			title = "Unknown Title"
		}
		if desc == "" {
			desc = "No Description"
		}

		// Write unique domain to file
		line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
			fullURL,
			sanitize(title),
			sanitize(keywords),
			sanitize(desc),
			rank,
		)
		if _, err := file.WriteString(line); err != nil {
			return err
		}
		existingEntries[fullURL] = true
	}
	return nil
}

// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
func fetchPageMetadata(pageURL string) (string, string, string) {
	// Generate a User-Agent via the GetUserAgent helper
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printWarn("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printWarn("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}

	// Set the dynamically generated User-Agent
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		printWarn("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Handle non-2xx responses
	if resp.StatusCode == 403 || resp.StatusCode == 401 {
		printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
		return "", "", ""
	} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printWarn("Non-2xx response for %s: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// Parse HTML
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printWarn("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}
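
	// Collect the <title> text and the description/keywords meta values with a
	// depth-first walk over the parsed document.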
	var title, desc, keywords string
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		if n.Type == html.ElementNode && n.Data == "meta" {
			var nameVal, contentVal string
			for _, attr := range n.Attr {
				switch strings.ToLower(attr.Key) {
				case "name":
					nameVal = strings.ToLower(attr.Val)
				case "content":
					contentVal = attr.Val
				}
			}
			if nameVal == "description" {
				desc = contentVal
			} else if nameVal == "keywords" {
				keywords = contentVal
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)

	return title, desc, keywords
}

// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	input = strings.TrimSpace(input)
	return input
}