Search/crawler.go

package main

import (
	"bufio"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"time"

	"golang.org/x/net/html"
)

// webCrawlerInit is called during init on program start
func webCrawlerInit() {
	go func() {
		// First run immediately
		runCrawlerAndIndexer()

		// Then every 24h (adjust as needed)
		ticker := time.NewTicker(24 * time.Hour)
		for range ticker.C {
			runCrawlerAndIndexer()
		}
	}()
}

// runCrawlerAndIndexer reads domains.csv -> crawls -> writes to data_to_index.txt -> reindexes
func runCrawlerAndIndexer() {
	// 1. Read domains.csv
	domains, err := readDomainsCSV(filepath.Join(config.DriveCache.Path, "domains.csv"))
	if err != nil {
		printErr("Error reading domains.csv: %v", err)
		return
	}

	// 2. Crawl each domain and write results to data_to_index.txt
	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
	if err := crawlDomainsToFile(domains, outFile); err != nil {
		printErr("Error crawling domains: %v", err)
		return
	}

	// 3. Re-index data_to_index.txt
	if err := IndexFile(outFile); err != nil {
		printErr("Error indexing data_to_index.txt: %v", err)
		return
	}

	printDebug("Crawl + index refresh completed.")
}
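
// The domains list is expected to be a CSV with a header row followed by
// "rank,domain,popularity" rows; fields may or may not be quoted.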
// readDomainsCSV returns a slice of (rank,domain) from a local CSV file
func readDomainsCSV(csvPath string) ([][2]string, error) {
	f, err := os.Open(csvPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	var result [][2]string
	scanner := bufio.NewScanner(f)

	// Skip header line
	scanner.Scan()

	for scanner.Scan() {
		line := scanner.Text()

		// Split by commas, not tabs
		fields := strings.SplitN(line, ",", 3) // Splits into up to 3 parts (rank, domain, popularity)
		if len(fields) < 2 {
			printDebug("Skipping malformed line: %s", line)
			continue
		}

		// Remove quotes around fields, if present
		rank := strings.Trim(fields[0], `"`)
		domain := strings.Trim(fields[1], `"`)
		result = append(result, [2]string{rank, domain})
	}
	return result, scanner.Err()
}
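
// Each line written to data_to_index.txt is pipe-delimited:
//
//	url|title|keywords|description|rank
//
// sanitize strips "|" and newlines from the free-text fields so the format
// stays unambiguous.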
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string) error {
	// Read existing data_to_index.txt into a map so domains that were already
	// written are not crawled again
	existingEntries := make(map[string]bool)
	if _, err := os.Stat(outFile); err == nil { // File exists
		existingFile, err := os.Open(outFile)
		if err != nil {
			return fmt.Errorf("unable to open %s: %v", outFile, err)
		}
		scanner := bufio.NewScanner(existingFile)
		for scanner.Scan() {
			line := scanner.Text()
			parts := strings.SplitN(line, "|", 5)
			if len(parts) > 0 && parts[0] != "" {
				existingEntries[parts[0]] = true // Mark existing domain
			}
		}
		existingFile.Close()
	}
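
	// NOTE: the file is truncated below, so the entries collected above are only
	// used to skip re-crawling; previously written lines are not copied into the
	// new file.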
	// Open file for writing (truncate if existing)
	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
	}
	defer file.Close()

	for _, d := range domains {
		rank := d[0]
		domain := d[1]
		if domain == "" || existingEntries["https://"+domain] {
			continue
		}

		fullURL := "https://" + domain
		title, desc, keywords := fetchPageMetadata(fullURL)
		if title == "" {
			title = "Unknown Title"
		}
		if desc == "" {
			desc = "No Description"
		}

		// Write unique domain to file
		line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
			fullURL,
			sanitize(title),
			sanitize(keywords),
			sanitize(desc),
			rank,
		)
		if _, err := file.WriteString(line); err != nil {
			return err
		}
		existingEntries[fullURL] = true
	}
	return nil
}

// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
func fetchPageMetadata(pageURL string) (string, string, string) {
	// Generate a User-Agent via the GetUserAgent helper
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printWarn("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printWarn("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}

	// Set the dynamically generated User-Agent
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		printWarn("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Handle non-2xx responses
	if resp.StatusCode == 403 || resp.StatusCode == 401 {
		printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
		return "", "", ""
	} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printWarn("Non-2xx response for %s: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// Parse HTML
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printWarn("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}
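
	// Collect the <title> text and the description/keywords meta values with a
	// depth-first walk over the parsed document.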
	var title, desc, keywords string
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		if n.Type == html.ElementNode && n.Data == "meta" {
			var nameVal, contentVal string
			for _, attr := range n.Attr {
				switch strings.ToLower(attr.Key) {
				case "name":
					nameVal = strings.ToLower(attr.Val)
				case "content":
					contentVal = attr.Val
				}
			}
			if nameVal == "description" {
				desc = contentVal
			} else if nameVal == "keywords" {
				keywords = contentVal
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)

	return title, desc, keywords
}

// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	input = strings.TrimSpace(input)
	return input
}