improved crawler data extraction (added chromedp)

partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions


@@ -1,69 +1,99 @@
package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
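
// NOTE: chromedp drives a headless Chrome/Chromium instance over the DevTools
// protocol, so the crawler host needs a Chrome binary available. The new
// dependency would typically be added with `go get github.com/chromedp/chromedp`
// (an assumption here; the go.mod/go.sum changes are presumably among the other
// changed files of this commit but are not shown).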
// fetchPageMetadataStandard tries extracting title/description/keywords from
// standard HTML tags, then OG and Twitter tags, and falls back to
// go-readability if needed. If we still have no title or no description after
// all that, it returns ("", "", "") so the caller can skip saving the page.
//
// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
// 2. <meta property="og:title">, <meta property="og:description">
// 3. <meta name="twitter:title">, <meta name="twitter:description">
// 4. go-readability fallback (if title or description is still missing)
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, return ("", "", "")
	if title == "" || desc == "" {
		return "", "", ""
	}
	return sanitize(title), sanitize(desc), sanitize(keywords)
}
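
// A minimal caller sketch (hypothetical, not part of the diff shown): try the
// cheap standard fetch first and only spin up Chrome for pages that come back
// empty, since a headless browser is far more expensive per page. GetUserAgent
// and printDebug are assumed from elsewhere in this codebase.
func fetchPageMetadataCombined(pageURL string) (string, string, string) {
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}
	// Fast path: plain HTTP fetch + HTML parse.
	title, desc, keywords := fetchPageMetadataStandard(pageURL, userAgent)
	if title == "" || desc == "" {
		// Slow path: let Chrome execute the page's JavaScript first.
		title, desc, keywords = fetchPageMetadataChrome(pageURL, userAgent)
	}
	return title, desc, keywords
}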
// fetchPageMetadataChrome uses chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a fresh browser context for this page
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}
	return extractParsedDOM(doc)
}
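
// chromedp.NewContext carries no deadline of its own, so a page that never
// settles could block a crawler worker indefinitely. A bounded variant
// (a sketch, not part of this commit; the budget value is an assumption):
// usage: ctx, cancel := newChromeContext(30 * time.Second); defer cancel()
func newChromeContext(budget time.Duration) (context.Context, context.CancelFunc) {
	ctx, cancelBrowser := chromedp.NewContext(context.Background())
	ctx, cancelTimeout := context.WithTimeout(ctx, budget)
	// Cancel the timeout first, then tear down the browser context.
	return ctx, func() {
		cancelTimeout()
		cancelBrowser()
	}
}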
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	// Force English content when possible
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	// Skip non-2xx responses
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	// Standard HTML parse
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}
	return extractParsedDOM(doc)
}
// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
	var ogTitle, ogDesc string
	var twTitle, twDesc string
	var foundTitle, foundDesc bool

	var walk func(*html.Node)
	walk = func(n *html.Node) {
@@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
			}
		}

		// Standard meta tags
		switch metaName {
		case "description":
			desc = contentVal
@@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
			twDesc = contentVal
		}

		// Open Graph tags
		switch metaProperty {
		case "og:title":
			ogTitle = contentVal
@@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
	}
	walk(doc)

	// Fall back to OG/Twitter if <title>/description are missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
@@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
		}
	}

	// Heuristic check: discard obviously incorrect HTML-y strings or placeholders
	if looksLikeRawHTML(title) {
		title = ""
	}
@@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
desc = ""
}
// If after all that we have no title or description, skip
if title == "" || desc == "" {
return "", "", ""
}
return sanitize(title), sanitize(desc), sanitize(keywords)
return title, desc, keywords
}
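
// A quick illustration of the OG fallback path in isolation (a sketch, not
// part of this commit): a document with og:* tags but no <title> or
// description meta should surface the Open Graph values.
func exampleExtractParsedDOM() {
	const page = `<html><head>
		<meta property="og:title" content="Hello">
		<meta property="og:description" content="World">
	</head><body></body></html>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return
	}
	title, desc, _ := extractParsedDOM(doc)
	printDebug("title=%q desc=%q", title, desc) // expect title="Hello" desc="World"
}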
// fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
	if title != "" && desc != "" {
		return title, desc, keywords
	}

	client := &http.Client{Timeout: 15 * time.Second}
	readReq, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create fallbackReadability request: %v", err)
		return title, desc, keywords
	}
	readReq.Header.Set("User-Agent", userAgent)
	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	readResp, err := client.Do(readReq)
	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
		if err != nil {
			printDebug("go-readability GET error for %s: %v", pageURL, err)
		}
		if readResp != nil {
			readResp.Body.Close()
		}
		return title, desc, keywords
	}
	defer readResp.Body.Close()

	parsedURL, parseErr := url.Parse(pageURL)
	if parseErr != nil {
		printDebug("Failed to parse URL: %v", parseErr)
		return title, desc, keywords
	}

	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
	if rdErr != nil {
		printDebug("go-readability error for %s: %v", pageURL, rdErr)
		return title, desc, keywords
	}

	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
		title = article.Title
	}
	if desc == "" {
		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
			desc = article.Excerpt
		} else if len(article.Content) > 0 {
			// If the excerpt is empty, use a snippet from article.Content
			snippet := article.Content
			if len(snippet) > 200 {
				snippet = snippet[:200] + "..."
			}
			if !looksLikeRawHTML(snippet) {
				desc = snippet
			}
		}
	}
	return title, desc, keywords
}
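
// One caveat in the snippet logic above: snippet[:200] slices bytes, so it can
// cut a multi-byte UTF-8 rune in half. A rune-safe truncation sketch (assumed
// helper, not part of this commit):
func truncateRunes(s string, n int) string {
	runes := []rune(s)
	if len(runes) <= n {
		return s
	}
	return string(runes[:n]) + "..."
}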
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
@@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool {
	return false
}
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")