improved crawler data extraction

This commit is contained in:
partisan 2025-01-01 13:49:16 +01:00
parent a9a6948a44
commit 3494457336
4 changed files with 231 additions and 92 deletions


@@ -3,14 +3,11 @@ package main
import (
"bufio"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"
"golang.org/x/net/html"
)
// webCrawlerInit is called during init on program start
@@ -130,18 +127,18 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
if title == "" {
title = "Unknown Title"
}
if desc == "" {
desc = "No Description"
// Skip saving if title or description is missing
if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL)
return
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
-			sanitize(title),
-			sanitize(keywords),
-			sanitize(desc),
+			title,
+			keywords,
+			desc,
rank,
)
file.WriteString(line)
@@ -151,84 +148,3 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
wg.Wait()
return nil
}
// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
func fetchPageMetadata(pageURL string) (string, string, string) {
// Generate a User-Agent using your GetUserAgent function
userAgent, err := GetUserAgent("crawler")
if err != nil {
printWarn("Failed to generate User-Agent: %v", err)
return "", "", ""
}
client := &http.Client{Timeout: 15 * time.Second}
req, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
printWarn("Failed to create request for %s: %v", pageURL, err)
return "", "", ""
}
// Set the dynamically generated User-Agent
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
printWarn("Failed to GET %s: %v", pageURL, err)
return "", "", ""
}
defer resp.Body.Close()
// Handle non-200 responses
if resp.StatusCode == 403 || resp.StatusCode == 401 {
printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
return "", "", ""
} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode)
return "", "", ""
}
// Parse HTML
doc, err := html.Parse(resp.Body)
if err != nil {
printWarn("HTML parse error for %s: %v", pageURL, err)
return "", "", ""
}
var title, desc, keywords string
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
title = n.FirstChild.Data
}
if n.Type == html.ElementNode && n.Data == "meta" {
var nameVal, contentVal string
for _, attr := range n.Attr {
switch strings.ToLower(attr.Key) {
case "name":
nameVal = strings.ToLower(attr.Val)
case "content":
contentVal = attr.Val
}
}
if nameVal == "description" {
desc = contentVal
} else if nameVal == "keywords" {
keywords = contentVal
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
return title, desc, keywords
}
// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
input = strings.ReplaceAll(input, "|", " ")
input = strings.ReplaceAll(input, "\n", " ")
input = strings.TrimSpace(input)
return input
}