improved crawler data extraction (added chromedp)

partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions


@@ -1,69 +1,99 @@
package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
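
// NOTE: chromedp drives a headless Chrome/Chromium instance over the DevTools
// protocol, so the crawler host needs a Chrome binary available. The new
// dependency would typically be added with `go get github.com/chromedp/chromedp`
// (an assumption here; the go.mod/go.sum changes are presumably among the other
// changed files of this commit but are not shown).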
// fetchPageMetadataStandard tries extracting title/description/keywords from
// standard HTML tags, then OG and Twitter tags, and falls back to
// go-readability if needed. If we still have no title or no description after
// all that, it returns ("", "", "") so the caller can skip saving the page.
//
// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
// 2. <meta property="og:title">, <meta property="og:description">
// 3. <meta name="twitter:title">, <meta name="twitter:description">
// 4. go-readability fallback (if title or description is still missing)
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, return ("", "", "")
	if title == "" || desc == "" {
		return "", "", ""
	}
	return sanitize(title), sanitize(desc), sanitize(keywords)
}
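
// A minimal caller sketch (hypothetical, not part of the diff shown): try the
// cheap standard fetch first and only spin up Chrome for pages that come back
// empty, since a headless browser is far more expensive per page. GetUserAgent
// and printDebug are assumed from elsewhere in this codebase.
func fetchPageMetadataCombined(pageURL string) (string, string, string) {
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}
	// Fast path: plain HTTP fetch + HTML parse.
	title, desc, keywords := fetchPageMetadataStandard(pageURL, userAgent)
	if title == "" || desc == "" {
		// Slow path: let Chrome execute the page's JavaScript first.
		title, desc, keywords = fetchPageMetadataChrome(pageURL, userAgent)
	}
	return title, desc, keywords
}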
// fetchPageMetadataChrome uses chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a fresh browser context for this page
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}
	return extractParsedDOM(doc)
}
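
// chromedp.NewContext carries no deadline of its own, so a page that never
// settles could block a crawler worker indefinitely. A bounded variant
// (a sketch, not part of this commit; the budget value is an assumption):
// usage: ctx, cancel := newChromeContext(30 * time.Second); defer cancel()
func newChromeContext(budget time.Duration) (context.Context, context.CancelFunc) {
	ctx, cancelBrowser := chromedp.NewContext(context.Background())
	ctx, cancelTimeout := context.WithTimeout(ctx, budget)
	// Cancel the timeout first, then tear down the browser context.
	return ctx, func() {
		cancelTimeout()
		cancelBrowser()
	}
}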
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	// Force English content when possible
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	// Skip non-2xx responses
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	// Standard HTML parse
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}
	return extractParsedDOM(doc)
}
// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
	var ogTitle, ogDesc string
	var twTitle, twDesc string
	var foundTitle, foundDesc bool

	var walk func(*html.Node)
	walk = func(n *html.Node) {
@@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
			}
		}

		// Standard meta tags
		switch metaName {
		case "description":
			desc = contentVal
@@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
			twDesc = contentVal
		}

		// Open Graph tags
		switch metaProperty {
		case "og:title":
			ogTitle = contentVal
@@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
	}
	walk(doc)

	// Fall back to OG/Twitter if <title>/description are missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
@@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
		}
	}

	// Heuristic check: discard obviously incorrect HTML-y strings or placeholders
	if looksLikeRawHTML(title) {
		title = ""
	}
@@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
desc = ""
}
// If after all that we have no title or description, skip
if title == "" || desc == "" {
return "", "", ""
}
return sanitize(title), sanitize(desc), sanitize(keywords)
return title, desc, keywords
}
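
// A quick illustration of the OG fallback path in isolation (a sketch, not
// part of this commit): a document with og:* tags but no <title> or
// description meta should surface the Open Graph values.
func exampleExtractParsedDOM() {
	const page = `<html><head>
		<meta property="og:title" content="Hello">
		<meta property="og:description" content="World">
	</head><body></body></html>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return
	}
	title, desc, _ := extractParsedDOM(doc)
	printDebug("title=%q desc=%q", title, desc) // expect title="Hello" desc="World"
}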
// fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
	if title != "" && desc != "" {
		return title, desc, keywords
	}

	client := &http.Client{Timeout: 15 * time.Second}
	readReq, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create fallbackReadability request: %v", err)
		return title, desc, keywords
	}
	readReq.Header.Set("User-Agent", userAgent)
	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	readResp, err := client.Do(readReq)
	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
		if err != nil {
			printDebug("go-readability GET error for %s: %v", pageURL, err)
		}
		if readResp != nil {
			readResp.Body.Close()
		}
		return title, desc, keywords
	}
	defer readResp.Body.Close()

	parsedURL, parseErr := url.Parse(pageURL)
	if parseErr != nil {
		printDebug("Failed to parse URL: %v", parseErr)
		return title, desc, keywords
	}

	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
	if rdErr != nil {
		printDebug("go-readability error for %s: %v", pageURL, rdErr)
		return title, desc, keywords
	}

	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
		title = article.Title
	}
	if desc == "" {
		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
			desc = article.Excerpt
		} else if len(article.Content) > 0 {
			// If the excerpt is empty, use a snippet from article.Content
			snippet := article.Content
			if len(snippet) > 200 {
				snippet = snippet[:200] + "..."
			}
			if !looksLikeRawHTML(snippet) {
				desc = snippet
			}
		}
	}
	return title, desc, keywords
}
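
// One caveat in the snippet logic above: snippet[:200] slices bytes, so it can
// cut a multi-byte UTF-8 rune in half. A rune-safe truncation sketch (assumed
// helper, not part of this commit):
func truncateRunes(s string, n int) string {
	runes := []rune(s)
	if len(runes) <= n {
		return s
	}
	return string(runes[:n]) + "..."
}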
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
@@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool {
	return false
}
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")