improved crawler data extraction
This commit is contained in:
parent a9a6948a44
commit 3494457336
4 changed files with 231 additions and 92 deletions
crawler.go
@@ -3,14 +3,11 @@ package main

import (
	"bufio"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
)

// webCrawlerInit is called during init on program start
@@ -130,18 +127,18 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
			mu.Unlock()

			title, desc, keywords := fetchPageMetadata(fullURL)
			if title == "" {
				title = "Unknown Title"
			}
			if desc == "" {
				desc = "No Description"
			}

			// Skip saving if title or description is missing
			if title == "" || desc == "" {
				printDebug("Skipping %s: missing title or description", fullURL)
				return
			}

			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
				fullURL,
				sanitize(title),
				sanitize(keywords),
				sanitize(desc),
				title,
				keywords,
				desc,
				rank,
			)
			file.WriteString(line)
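Note on the record format (not part of the diff): each crawled page is written as a single line of five "|"-separated fields, so any field containing the delimiter or a newline would corrupt the record. The sanitize helper in this file exists for exactly that reason. A minimal standalone sketch, with a made-up URL and rank, illustrating the difference:

package main

import (
	"fmt"
	"strings"
)

// sanitize mirrors the helper from this diff: strip the "|" separator and
// newlines so each record stays one line with a fixed field count.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	return strings.TrimSpace(input)
}

func main() {
	title := "Foo | Bar\nBaz" // hypothetical title containing the delimiter and a newline
	raw := fmt.Sprintf("%s|%s|%d\n", "https://example.com", title, 1)
	clean := fmt.Sprintf("%s|%s|%d\n", "https://example.com", sanitize(title), 1)
	fmt.Print(raw)   // spills onto two lines and gains an extra field
	fmt.Print(clean) // one line, fixed number of fields
}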
@@ -151,84 +148,3 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
	wg.Wait()
	return nil
}

// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
func fetchPageMetadata(pageURL string) (string, string, string) {
	// Generate a User-Agent using your GetUserAgent function
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printWarn("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printWarn("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}

	// Set the dynamically generated User-Agent
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		printWarn("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Handle non-200 responses
	if resp.StatusCode == 403 || resp.StatusCode == 401 {
		printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
		return "", "", ""
	} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// Parse HTML
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printWarn("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	var title, desc, keywords string
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		if n.Type == html.ElementNode && n.Data == "meta" {
			var nameVal, contentVal string
			for _, attr := range n.Attr {
				switch strings.ToLower(attr.Key) {
				case "name":
					nameVal = strings.ToLower(attr.Val)
				case "content":
					contentVal = attr.Val
				}
			}
			if nameVal == "description" {
				desc = contentVal
			} else if nameVal == "keywords" {
				keywords = contentVal
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)

	return title, desc, keywords
}

// sanitize is a quick helper to remove newlines/pipes from fields
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	input = strings.TrimSpace(input)
	return input
}
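For reference, the recursive x/net/html walk used by fetchPageMetadata can be exercised without any HTTP request by parsing a fixed string. A standalone sketch (not from this commit) of the same traversal pattern:

package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	// Fixed document standing in for a fetched page body.
	const page = `<html><head>
		<title>Example Domain</title>
		<meta name="description" content="An example page">
		<meta name="keywords" content="example, demo">
	</head><body></body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		panic(err)
	}

	var title, desc, keywords string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		// <title> text is the first child of the title element.
		if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
			title = n.FirstChild.Data
		}
		// <meta name=... content=...> carries description and keywords.
		if n.Type == html.ElementNode && n.Data == "meta" {
			var name, content string
			for _, attr := range n.Attr {
				switch strings.ToLower(attr.Key) {
				case "name":
					name = strings.ToLower(attr.Val)
				case "content":
					content = attr.Val
				}
			}
			switch name {
			case "description":
				desc = content
			case "keywords":
				keywords = content
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	fmt.Printf("title=%q desc=%q keywords=%q\n", title, desc, keywords)
}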