improved crawler data extraction
This commit is contained in:
parent
a9a6948a44
commit
3494457336
4 changed files with 231 additions and 92 deletions
204
crawler-extraction.go
Normal file
204
crawler-extraction.go
Normal file
|
@ -0,0 +1,204 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-shiori/go-readability"
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
|
||||
// OG, Twitter, then falls back to go-readability if needed. If after all that we
|
||||
// still have no title or no description, we return ("", "", "") so the caller
|
||||
// can skip saving it.
|
||||
//
|
||||
// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
|
||||
// 2. <meta property="og:title">, <meta property="og:description">
|
||||
// 3. <meta name="twitter:title">, <meta name="twitter:description">
|
||||
// 4. go-readability fallback (if title or description is still missing)
|
||||
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
|
||||
func fetchPageMetadata(pageURL string) (string, string, string) {
|
||||
userAgent, err := GetUserAgent("crawler")
|
||||
if err != nil {
|
||||
printDebug("Failed to generate User-Agent: %v", err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
client := &http.Client{Timeout: 15 * time.Second}
|
||||
req, err := http.NewRequest("GET", pageURL, nil)
|
||||
if err != nil {
|
||||
printDebug("Failed to create request for %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
// Force English content when possible
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
printDebug("Failed to GET %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Skip non-2xx
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
// First pass: standard HTML parse
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
printDebug("HTML parse error for %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
var (
|
||||
title, desc, keywords string
|
||||
ogTitle, ogDesc string
|
||||
twTitle, twDesc string
|
||||
foundTitle, foundDesc bool
|
||||
)
|
||||
|
||||
var walk func(*html.Node)
|
||||
walk = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
switch strings.ToLower(n.Data) {
|
||||
case "title":
|
||||
if n.FirstChild != nil {
|
||||
title = n.FirstChild.Data
|
||||
foundTitle = true
|
||||
}
|
||||
case "meta":
|
||||
var metaName, metaProperty, contentVal string
|
||||
for _, attr := range n.Attr {
|
||||
switch strings.ToLower(attr.Key) {
|
||||
case "name":
|
||||
metaName = strings.ToLower(attr.Val)
|
||||
case "property":
|
||||
metaProperty = strings.ToLower(attr.Val)
|
||||
case "content":
|
||||
contentVal = attr.Val
|
||||
}
|
||||
}
|
||||
|
||||
// Standard meta tags
|
||||
switch metaName {
|
||||
case "description":
|
||||
desc = contentVal
|
||||
foundDesc = true
|
||||
case "keywords":
|
||||
keywords = contentVal
|
||||
case "twitter:title":
|
||||
twTitle = contentVal
|
||||
case "twitter:description":
|
||||
twDesc = contentVal
|
||||
}
|
||||
|
||||
// Open Graph tags
|
||||
switch metaProperty {
|
||||
case "og:title":
|
||||
ogTitle = contentVal
|
||||
case "og:description":
|
||||
ogDesc = contentVal
|
||||
}
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
walk(c)
|
||||
}
|
||||
}
|
||||
walk(doc)
|
||||
|
||||
// Fallback to OG or Twitter if <title>/description are missing
|
||||
if !foundTitle {
|
||||
if ogTitle != "" {
|
||||
title = ogTitle
|
||||
} else if twTitle != "" {
|
||||
title = twTitle
|
||||
}
|
||||
}
|
||||
if !foundDesc {
|
||||
if ogDesc != "" {
|
||||
desc = ogDesc
|
||||
} else if twDesc != "" {
|
||||
desc = twDesc
|
||||
}
|
||||
}
|
||||
|
||||
// If still missing title or desc, fallback to go-readability
|
||||
if title == "" || desc == "" {
|
||||
parsedURL, parseErr := url.Parse(pageURL)
|
||||
if parseErr != nil {
|
||||
printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
|
||||
// We must skip if we can't parse the URL for readability
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
readResp, readErr := client.Get(pageURL)
|
||||
if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
|
||||
defer readResp.Body.Close()
|
||||
|
||||
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
|
||||
if rdErr == nil {
|
||||
// If we still have no title, try from readability
|
||||
if title == "" && article.Title != "" {
|
||||
title = article.Title
|
||||
}
|
||||
// If we still have no description, try article.Excerpt
|
||||
if desc == "" && article.Excerpt != "" {
|
||||
desc = article.Excerpt
|
||||
} else if desc == "" && len(article.Content) > 0 {
|
||||
// If excerpt is empty, use a snippet from article.Content
|
||||
snippet := article.Content
|
||||
if len(snippet) > 200 {
|
||||
snippet = snippet[:200] + "..."
|
||||
}
|
||||
desc = snippet
|
||||
}
|
||||
} else {
|
||||
printDebug("go-readability failed for %s: %v", pageURL, rdErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic: discard obviously incorrect HTML-y strings or placeholders
|
||||
if looksLikeRawHTML(title) {
|
||||
title = ""
|
||||
}
|
||||
if looksLikeRawHTML(desc) {
|
||||
desc = ""
|
||||
}
|
||||
|
||||
// If after all that we have no title or description, skip
|
||||
if title == "" || desc == "" {
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
return sanitize(title), sanitize(desc), sanitize(keywords)
|
||||
}
|
||||
|
||||
// looksLikeRawHTML reports whether text appears to be markup rather than
// human-readable content. The comparison is case-insensitive and flags text
// that contains the go-readability wrapper id fragment "readability-page",
// any "<div", or more than two occurrences of "<p".
func looksLikeRawHTML(text string) bool {
	lowered := strings.ToLower(text)

	switch {
	case strings.Contains(lowered, "readability-page"):
		return true
	case strings.Contains(lowered, "<div"):
		return true
	default:
		return strings.Count(lowered, "<p") > 2
	}
}
|
||||
|
||||
// sanitize makes input safe to store as one field of a pipe-delimited,
// one-record-per-line file: every '|', line feed and carriage return is
// replaced by a single space, then leading and trailing whitespace is trimmed.
// Carriage returns are included so CRLF content cannot leave a stray '\r'
// inside a record.
func sanitize(input string) string {
	cleaned := strings.NewReplacer(
		"|", " ",
		"\n", " ",
		"\r", " ",
	).Replace(input)
	return strings.TrimSpace(cleaned)
}
|
100
crawler.go
100
crawler.go
|
@ -3,14 +3,11 @@ package main
|
|||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// webCrawlerInit is called during init on program start
|
||||
|
@ -130,18 +127,18 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
|
|||
mu.Unlock()
|
||||
|
||||
title, desc, keywords := fetchPageMetadata(fullURL)
|
||||
if title == "" {
|
||||
title = "Unknown Title"
|
||||
}
|
||||
if desc == "" {
|
||||
desc = "No Description"
|
||||
|
||||
// Skip saving if title or description is missing
|
||||
if title == "" || desc == "" {
|
||||
printDebug("Skipping %s: missing title or description", fullURL)
|
||||
return
|
||||
}
|
||||
|
||||
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
|
||||
fullURL,
|
||||
sanitize(title),
|
||||
sanitize(keywords),
|
||||
sanitize(desc),
|
||||
title,
|
||||
keywords,
|
||||
desc,
|
||||
rank,
|
||||
)
|
||||
file.WriteString(line)
|
||||
|
@ -151,84 +148,3 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
|
|||
wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
// fetchPageMetadata does a simple GET and parses <title>, meta[name=description], meta[name=keywords]
|
||||
func fetchPageMetadata(pageURL string) (string, string, string) {
|
||||
// Generate a User-Agent using your GetUserAgent function
|
||||
userAgent, err := GetUserAgent("crawler")
|
||||
if err != nil {
|
||||
printWarn("Failed to generate User-Agent: %v", err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
client := &http.Client{Timeout: 15 * time.Second}
|
||||
req, err := http.NewRequest("GET", pageURL, nil)
|
||||
if err != nil {
|
||||
printWarn("Failed to create request for %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
// Set the dynamically generated User-Agent
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
printWarn("Failed to GET %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Handle non-200 responses
|
||||
if resp.StatusCode == 403 || resp.StatusCode == 401 {
|
||||
printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode)
|
||||
return "", "", ""
|
||||
} else if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
// Parse HTML
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
printWarn("HTML parse error for %s: %v", pageURL, err)
|
||||
return "", "", ""
|
||||
}
|
||||
|
||||
var title, desc, keywords string
|
||||
var f func(*html.Node)
|
||||
f = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil {
|
||||
title = n.FirstChild.Data
|
||||
}
|
||||
if n.Type == html.ElementNode && n.Data == "meta" {
|
||||
var nameVal, contentVal string
|
||||
for _, attr := range n.Attr {
|
||||
switch strings.ToLower(attr.Key) {
|
||||
case "name":
|
||||
nameVal = strings.ToLower(attr.Val)
|
||||
case "content":
|
||||
contentVal = attr.Val
|
||||
}
|
||||
}
|
||||
if nameVal == "description" {
|
||||
desc = contentVal
|
||||
} else if nameVal == "keywords" {
|
||||
keywords = contentVal
|
||||
}
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
f(c)
|
||||
}
|
||||
}
|
||||
f(doc)
|
||||
|
||||
return title, desc, keywords
|
||||
}
|
||||
|
||||
// sanitize swaps every pipe and newline in input for a single space and then
// trims surrounding whitespace, so the value fits in one pipe-delimited field.
func sanitize(input string) string {
	flattened := strings.NewReplacer("|", " ", "\n", " ").Replace(input)
	return strings.TrimSpace(flattened)
}
|
||||
|
|
5
go.mod
5
go.mod
|
@ -15,12 +15,14 @@ require (
|
|||
|
||||
require (
|
||||
github.com/blevesearch/bleve/v2 v2.4.4
|
||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f
|
||||
golang.org/x/net v0.33.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/RoaringBitmap/roaring v1.9.4 // indirect
|
||||
github.com/andybalholm/cascadia v1.3.3 // indirect
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect
|
||||
github.com/bits-and-blooms/bitset v1.20.0 // indirect
|
||||
github.com/blevesearch/bleve_index_api v1.2.0 // indirect
|
||||
github.com/blevesearch/geo v0.1.20 // indirect
|
||||
|
@ -40,6 +42,8 @@ require (
|
|||
github.com/blevesearch/zapx/v15 v15.3.17 // indirect
|
||||
github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
|
||||
github.com/go-ole/go-ole v1.3.0 // indirect
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
|
||||
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect
|
||||
github.com/golang/protobuf v1.5.4 // indirect
|
||||
github.com/golang/snappy v0.0.4 // indirect
|
||||
|
@ -51,5 +55,6 @@ require (
|
|||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
go.etcd.io/bbolt v1.3.11 // indirect
|
||||
golang.org/x/sys v0.28.0 // indirect
|
||||
golang.org/x/text v0.21.0 // indirect
|
||||
google.golang.org/protobuf v1.36.0 // indirect
|
||||
)
|
||||
|
|
14
go.sum
14
go.sum
|
@ -4,6 +4,8 @@ github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv
|
|||
github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
|
||||
github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM=
|
||||
github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA=
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA=
|
||||
github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw=
|
||||
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU=
|
||||
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
|
||||
|
@ -51,6 +53,12 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
|
|||
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
|
||||
github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
|
||||
github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w=
|
||||
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
|
||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI=
|
||||
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w=
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
|
||||
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
|
||||
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I=
|
||||
github.com/golang/geo v0.0.0-20230421003525-6adc56603217/go.mod h1:8wI0hitZ3a1IxZfeH3/5I97CI8i5cLGsYe7xNhQGs9U=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
|
@ -64,6 +72,7 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr
|
|||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
|
||||
github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
|
||||
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
|
@ -73,6 +82,10 @@ github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
|
|||
github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg=
|
||||
github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0=
|
||||
github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM=
|
||||
github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI=
|
||||
github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
|
@ -147,6 +160,7 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
|||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
|
||||
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
|
|
Loading…
Add table
Reference in a new issue