diff --git a/crawler-extraction.go b/crawler-extraction.go new file mode 100644 index 0000000..1594bef --- /dev/null +++ b/crawler-extraction.go @@ -0,0 +1,204 @@ +package main + +import ( + "net/http" + "net/url" + "strings" + "time" + + "github.com/go-shiori/go-readability" + "golang.org/x/net/html" +) + +// fetchPageMetadata tries extracting title/description/keywords from standard HTML, +// OG, Twitter, then falls back to go-readability if needed. If after all that we +// still have no title or no description, we return ("", "", "") so the caller +// can skip saving it. +// +// 1. , <meta name="description"/>, <meta name="keywords"/> +// 2. <meta property="og:title">, <meta property="og:description"> +// 3. <meta name="twitter:title">, <meta name="twitter:description"> +// 4. go-readability fallback (if title or description is still missing) +// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”). +func fetchPageMetadata(pageURL string) (string, string, string) { + userAgent, err := GetUserAgent("crawler") + if err != nil { + printDebug("Failed to generate User-Agent: %v", err) + return "", "", "" + } + + client := &http.Client{Timeout: 15 * time.Second} + req, err := http.NewRequest("GET", pageURL, nil) + if err != nil { + printDebug("Failed to create request for %s: %v", pageURL, err) + return "", "", "" + } + + // Force English content when possible + req.Header.Set("User-Agent", userAgent) + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + + resp, err := client.Do(req) + if err != nil { + printDebug("Failed to GET %s: %v", pageURL, err) + return "", "", "" + } + defer resp.Body.Close() + + // Skip non-2xx + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode) + return "", "", "" + } + + // First pass: standard HTML parse + doc, err := html.Parse(resp.Body) + if err != nil { + printDebug("HTML parse error for %s: %v", pageURL, 
err) + return "", "", "" + } + + var ( + title, desc, keywords string + ogTitle, ogDesc string + twTitle, twDesc string + foundTitle, foundDesc bool + ) + + var walk func(*html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode { + switch strings.ToLower(n.Data) { + case "title": + if n.FirstChild != nil { + title = n.FirstChild.Data + foundTitle = true + } + case "meta": + var metaName, metaProperty, contentVal string + for _, attr := range n.Attr { + switch strings.ToLower(attr.Key) { + case "name": + metaName = strings.ToLower(attr.Val) + case "property": + metaProperty = strings.ToLower(attr.Val) + case "content": + contentVal = attr.Val + } + } + + // Standard meta tags + switch metaName { + case "description": + desc = contentVal + foundDesc = true + case "keywords": + keywords = contentVal + case "twitter:title": + twTitle = contentVal + case "twitter:description": + twDesc = contentVal + } + + // Open Graph tags + switch metaProperty { + case "og:title": + ogTitle = contentVal + case "og:description": + ogDesc = contentVal + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + walk(doc) + + // Fallback to OG or Twitter if <title>/description are missing + if !foundTitle { + if ogTitle != "" { + title = ogTitle + } else if twTitle != "" { + title = twTitle + } + } + if !foundDesc { + if ogDesc != "" { + desc = ogDesc + } else if twDesc != "" { + desc = twDesc + } + } + + // If still missing title or desc, fallback to go-readability + if title == "" || desc == "" { + parsedURL, parseErr := url.Parse(pageURL) + if parseErr != nil { + printDebug("Failed to parse URL %s: %v", pageURL, parseErr) + // We must skip if we can't parse the URL for readability + return "", "", "" + } + + readResp, readErr := client.Get(pageURL) + if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 { + defer readResp.Body.Close() + + article, rdErr := readability.FromReader(readResp.Body, parsedURL) + if rdErr == 
nil { + // If we still have no title, try from readability + if title == "" && article.Title != "" { + title = article.Title + } + // If we still have no description, try article.Excerpt + if desc == "" && article.Excerpt != "" { + desc = article.Excerpt + } else if desc == "" && len(article.Content) > 0 { + // If excerpt is empty, use a snippet from article.Content + snippet := article.Content + if len(snippet) > 200 { + snippet = snippet[:200] + "..." + } + desc = snippet + } + } else { + printDebug("go-readability failed for %s: %v", pageURL, rdErr) + } + } + } + + // Heuristic: discard obviously incorrect HTML-y strings or placeholders + if looksLikeRawHTML(title) { + title = "" + } + if looksLikeRawHTML(desc) { + desc = "" + } + + // If after all that we have no title or description, skip + if title == "" || desc == "" { + return "", "", "" + } + + return sanitize(title), sanitize(desc), sanitize(keywords) +} + +// looksLikeRawHTML is a simple heuristic to check for leftover HTML or +// go-readability noise (e.g., "readability-page-1"). +func looksLikeRawHTML(text string) bool { + textLower := strings.ToLower(text) + if strings.Contains(textLower, "readability-page") { + return true + } + if strings.Count(textLower, "<div") > 0 || strings.Count(textLower, "<p") > 2 { + return true + } + return false +} + +// sanitize removes pipes and newlines so they don't break our output format. 
// fieldSanitizer maps every character that would corrupt one pipe-delimited
// record per line to a plain space.
var fieldSanitizer = strings.NewReplacer("|", " ", "\n", " ", "\r", " ")

// sanitize makes a field safe for the pipe-delimited, line-oriented output
// format: pipes, newlines and carriage returns are each replaced by a single
// space, then leading and trailing whitespace is trimmed. Carriage returns are
// included so that CRLF content cannot leave a stray "\r" inside a field.
func sanitize(input string) string {
	return strings.TrimSpace(fieldSanitizer.Replace(input))
}
resp.Body.Close() - - // Handle non-200 responses - if resp.StatusCode == 403 || resp.StatusCode == 401 { - printWarn("Skipping %s: HTTP %d", pageURL, resp.StatusCode) - return "", "", "" - } else if resp.StatusCode < 200 || resp.StatusCode >= 300 { - printWarn("Non-200 for %s: %d", pageURL, resp.StatusCode) - return "", "", "" - } - - // Parse HTML - doc, err := html.Parse(resp.Body) - if err != nil { - printWarn("HTML parse error for %s: %v", pageURL, err) - return "", "", "" - } - - var title, desc, keywords string - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "title" && n.FirstChild != nil { - title = n.FirstChild.Data - } - if n.Type == html.ElementNode && n.Data == "meta" { - var nameVal, contentVal string - for _, attr := range n.Attr { - switch strings.ToLower(attr.Key) { - case "name": - nameVal = strings.ToLower(attr.Val) - case "content": - contentVal = attr.Val - } - } - if nameVal == "description" { - desc = contentVal - } else if nameVal == "keywords" { - keywords = contentVal - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } - } - f(doc) - - return title, desc, keywords -} - -// sanitize is a quick helper to remove newlines/pipes from fields -func sanitize(input string) string { - input = strings.ReplaceAll(input, "|", " ") - input = strings.ReplaceAll(input, "\n", " ") - input = strings.TrimSpace(input) - return input -} diff --git a/go.mod b/go.mod index 6895586..a293a75 100644 --- a/go.mod +++ b/go.mod @@ -15,12 +15,14 @@ require ( require ( github.com/blevesearch/bleve/v2 v2.4.4 + github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f golang.org/x/net v0.33.0 ) require ( github.com/RoaringBitmap/roaring v1.9.4 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect + github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de // indirect github.com/bits-and-blooms/bitset v1.20.0 // indirect github.com/blevesearch/bleve_index_api v1.2.0 // 
indirect github.com/blevesearch/geo v0.1.20 // indirect @@ -40,6 +42,8 @@ require ( github.com/blevesearch/zapx/v15 v15.3.17 // indirect github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect github.com/go-ole/go-ole v1.3.0 // indirect + github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect + github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/golang/snappy v0.0.4 // indirect @@ -51,5 +55,6 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.etcd.io/bbolt v1.3.11 // indirect golang.org/x/sys v0.28.0 // indirect + golang.org/x/text v0.21.0 // indirect google.golang.org/protobuf v1.36.0 // indirect ) diff --git a/go.sum b/go.sum index f3f643b..59414b4 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de h1:FxWPpzIjnTlhPwqqXc4/vE0f7GvRjuAsbW+HOIe8KnA= +github.com/araddon/dateparse v0.0.0-20210429162001-6b43995a97de/go.mod h1:DCaWoUhZrYW9p1lxo/cm8EmUOOzAPSEZNGF2DK1dJgw= github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU= github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= @@ -51,6 +53,12 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole 
v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziHZixGO5ZBS73cKqVzZipfrLmO1w= +github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= +github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI= +github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= +github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I= github.com/golang/geo v0.0.0-20230421003525-6adc56603217/go.mod h1:8wI0hitZ3a1IxZfeH3/5I97CI8i5cLGsYe7xNhQGs9U= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -64,6 +72,7 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= +github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -73,6 +82,10 @@ 
github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM= github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/scylladb/termtables v0.0.0-20191203121021-c4c0b6d42ff4/go.mod h1:C1a7PQSMz9NShzorzCiG2fk9+xuCgLkPeCvMHYR2OWg= +github.com/sergi/go-diff v1.1.0 h1:we8PVUC3FE2uYfodKH/nBHMSetSfHDR6scGdBi+erh0= +github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -147,6 +160,7 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=