improved crawler data extraction

This commit is contained in:
partisan 2025-01-01 13:49:16 +01:00
parent a9a6948a44
commit 3494457336
4 changed files with 231 additions and 92 deletions


@@ -3,14 +3,11 @@ package main
import (
"bufio"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"time"
"golang.org/x/net/html"
)
// webCrawlerInit is called during init on program start
@@ -130,18 +127,18 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
if title == "" {
title = "Unknown Title"
}
if desc == "" {
desc = "No Description"
// Skip saving if title or description is missing
if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL)
return
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
-			sanitize(title),
-			sanitize(keywords),
-			sanitize(desc),
+			title,
+			keywords,
+			desc,
rank,
)
file.WriteString(line)
@@ -151,84 +148,3 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
wg.Wait()
return nil
}
// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
func fetchPageMetadata(pageURL string) (string, string, string) {
// Generate a User-Agent using your GetUserAgent function
userAgent, err := GetUserAgent("crawler")
if err != nil {
printWarn("Failed to generate User-Agent: %v", err)
return "", "", ""
}
client := &http.Client{Timeout: 15 * time.Second}
req, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
printWarn("Failed to create request for %s: %v", pageURL, err)
return "", "", ""
}
// Set the dynamically generated User-Agent
req.Header.Set("User-Agent", userAgent)
resp, err := client.Do(req)
if err != nil {
printWarn("Failed to GET %s: %v", pageURL, err)
return "", "", ""
}
defer resp.Body.Close()
// Handle non-200 responses
if resp.StatusCode == 403 || resp.StatusCode == 401 {
printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
return "", "", ""
} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode)
return "", "", ""
}
// Parse HTML
doc, err := html.Parse(resp.Body)
if err != nil {
printWarn("HTML parse error for %s: %v", pageURL, err)
return "", "", ""
}
var title, desc, keywords string
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
title = n.FirstChild.Data
}
if n.Type == html.ElementNode && n.Data == "meta" {
var nameVal, contentVal string
for _, attr := range n.Attr {
switch strings.ToLower(attr.Key) {
case "name":
nameVal = strings.ToLower(attr.Val)
case "content":
contentVal = attr.Val
}
}
if nameVal == "description" {
desc = contentVal
} else if nameVal == "keywords" {
keywords = contentVal
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
return title, desc, keywords
}
// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
input = strings.ReplaceAll(input, "|", " ")
input = strings.ReplaceAll(input, "\n", " ")
input = strings.TrimSpace(input)
return input
}