improved crawler data extraction (added chromedp)
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions
@@ -1,69 +1,99 @@
 package main
 
 import (
+	"context"
 	"net/http"
 	"net/url"
 	"strings"
 	"time"
 
+	"github.com/chromedp/cdproto/emulation"
+	"github.com/chromedp/chromedp"
 	"github.com/go-shiori/go-readability"
 	"golang.org/x/net/html"
 )
 
-// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
-// OG, Twitter, then falls back to go-readability if needed. If after all that we
-// still have no title or no description, we return ("", "", "") so the caller
-// can skip saving it.
-//
-// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
-// 2. <meta property="og:title">, <meta property="og:description">
-// 3. <meta name="twitter:title">, <meta name="twitter:description">
-// 4. go-readability fallback (if title or description is still missing)
-// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
-func fetchPageMetadata(pageURL string) (string, string, string) {
-	userAgent, err := GetUserAgent("crawler")
+// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
+func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
+	// 1. Standard HTML parse
+	title, desc, keywords := extractStandard(pageURL, userAgent)
+
+	// 2. Fallback: go-readability
+	if title == "" || desc == "" {
+		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
+	}
+
+	// If still empty, return ("", "", "")
+	if title == "" || desc == "" {
+		return "", "", ""
+	}
+	return sanitize(title), sanitize(desc), sanitize(keywords)
+}
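The call site is not part of this diff, but the split into fetchPageMetadataStandard and fetchPageMetadataChrome suggests a two-tier strategy: try the cheap HTTP fetch first and only pay for a headless browser when it yields nothing. A minimal sketch of such a wrapper, reusing GetUserAgent and printDebug from the removed code (the wrapper itself is an assumption, not part of this commit):

// Hypothetical dispatcher, not in this commit: standard fetch first,
// chromedp only when the static HTML gave us no usable metadata.
func fetchPageMetadata(pageURL string) (string, string, string) {
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	title, desc, keywords := fetchPageMetadataStandard(pageURL, userAgent)
	if title == "" || desc == "" {
		// Likely a JavaScript-rendered page; retry with a real browser.
		title, desc, keywords = fetchPageMetadataChrome(pageURL, userAgent)
	}
	return title, desc, keywords
}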
+
+// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
+func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
+	// Create context
+	ctx, cancel := chromedp.NewContext(context.Background())
+	defer cancel()
+
+	var renderedHTML string
+	err := chromedp.Run(ctx,
+		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
+		chromedp.Navigate(pageURL),
+		chromedp.Sleep(2*time.Second), // Let JS run a bit
+		chromedp.OuterHTML("html", &renderedHTML),
+	)
 	if err != nil {
-		printDebug("Failed to generate User-Agent: %v", err)
+		printDebug("chromedp error for %s: %v", pageURL, err)
 		return "", "", ""
 	}
+
+	doc, err := html.Parse(strings.NewReader(renderedHTML))
+	if err != nil {
+		printDebug("chromedp parse error for %s: %v", pageURL, err)
+		return "", "", ""
+	}
+
+	return extractParsedDOM(doc)
+}
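chromedp.Sleep(2*time.Second) costs two seconds on every page and still does not guarantee the scripts have finished. A common chromedp pattern is to cap the whole session with context.WithTimeout and wait on a selector instead; a sketch under those assumptions (the 15-second budget and the body selector are illustrative, not from this commit):

// Sketch: bound the browser session instead of sleeping a fixed interval.
func fetchRenderedHTML(pageURL, userAgent string) (string, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	ctx, cancel = context.WithTimeout(ctx, 15*time.Second) // assumed per-page budget
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.WaitReady("body"), // returns as soon as the DOM is ready
		chromedp.OuterHTML("html", &renderedHTML),
	)
	return renderedHTML, err
}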
+
+// extractStandard does the normal HTML parse with OG, Twitter, etc.
+func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
 	client := &http.Client{Timeout: 15 * time.Second}
 	req, err := http.NewRequest("GET", pageURL, nil)
 	if err != nil {
 		printDebug("Failed to create request for %s: %v", pageURL, err)
-		return "", "", ""
+		return
 	}
 
 	// Force English content when possible
 	req.Header.Set("User-Agent", userAgent)
 	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
 	resp, err := client.Do(req)
 	if err != nil {
 		printDebug("Failed to GET %s: %v", pageURL, err)
-		return "", "", ""
+		return
 	}
 	defer resp.Body.Close()
 
 	// Skip non-2xx
 	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
 		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
-		return "", "", ""
+		return
 	}
 
 	// First pass: standard HTML parse
 	doc, err := html.Parse(resp.Body)
 	if err != nil {
 		printDebug("HTML parse error for %s: %v", pageURL, err)
-		return "", "", ""
+		return
 	}
 
-	var (
-		title, desc, keywords string
-		ogTitle, ogDesc       string
-		twTitle, twDesc       string
-		foundTitle, foundDesc bool
-	)
-
+	return extractParsedDOM(doc)
+}
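Note the signature change that explains the bare return statements above: extractStandard declares named results, so return with no operands yields the zero values ("", "", ""), where the old unnamed-result function had to spell them out:

func unnamed() (string, string, string) { return "", "", "" } // values required
func named() (title, desc, keywords string) { return }        // implicitly returns "", "", ""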
+
+// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
+func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
+	var ogTitle, ogDesc string
+	var twTitle, twDesc string
+	var foundTitle, foundDesc bool
+
 	var walk func(*html.Node)
 	walk = func(n *html.Node) {
@@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
 			}
 		}
 
 		// Standard meta tags
 		switch metaName {
 		case "description":
 			desc = contentVal
@@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
 			twDesc = contentVal
 		}
 
 		// Open Graph tags
 		switch metaProperty {
 		case "og:title":
 			ogTitle = contentVal
@@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
 	}
 	walk(doc)
 
-	// Fallback to OG or Twitter if <title>/description are missing
+	// fallback to OG/Twitter if missing
 	if !foundTitle {
 		if ogTitle != "" {
 			title = ogTitle
@@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
 		}
 	}
 
-	// If still missing title or desc, fallback to go-readability
-	if title == "" || desc == "" {
-		parsedURL, parseErr := url.Parse(pageURL)
-		if parseErr != nil {
-			printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
-			// We must skip if we can't parse the URL for readability
-			return "", "", ""
-		}
-
-		readResp, readErr := client.Get(pageURL)
-		if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
-			defer readResp.Body.Close()
-
-			article, rdErr := readability.FromReader(readResp.Body, parsedURL)
-			if rdErr == nil {
-				// If we still have no title, try from readability
-				if title == "" && article.Title != "" {
-					title = article.Title
-				}
-				// If we still have no description, try article.Excerpt
-				if desc == "" && article.Excerpt != "" {
-					desc = article.Excerpt
-				} else if desc == "" && len(article.Content) > 0 {
-					// If excerpt is empty, use a snippet from article.Content
-					snippet := article.Content
-					if len(snippet) > 200 {
-						snippet = snippet[:200] + "..."
-					}
-					desc = snippet
-				}
-			} else {
-				printDebug("go-readability failed for %s: %v", pageURL, rdErr)
-			}
-		}
-	}
-
-	// Heuristic: discard obviously incorrect HTML-y strings or placeholders
+	// Heuristic check
 	if looksLikeRawHTML(title) {
 		title = ""
 	}
@@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
 		desc = ""
 	}
 
-	// If after all that we have no title or description, skip
-	if title == "" || desc == "" {
-		return "", "", ""
-	}
-
-	return sanitize(title), sanitize(desc), sanitize(keywords)
+	return title, desc, keywords
 }
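The body of walk is largely unchanged and falls between the hunks, so the diff only shows fragments of it. For orientation, this kind of traversal over golang.org/x/net/html nodes usually has roughly the shape below; the metaName/metaProperty/contentVal names match the fragments above, but this is a reconstruction, not the commit's text:

// collectMeta is a sketch of the walk pattern: depth-first over the parsed DOM,
// grabbing <title> text and the name/property + content attributes of <meta> tags.
func collectMeta(doc *html.Node) (title string, metas map[string]string) {
	metas = map[string]string{}
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch n.Data {
			case "title":
				if title == "" && n.FirstChild != nil {
					title = strings.TrimSpace(n.FirstChild.Data)
				}
			case "meta":
				var metaName, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name", "property": // covers description, og:*, twitter:*
						metaName = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}
				if metaName != "" && contentVal != "" {
					metas[metaName] = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)
	return title, metas
}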
 
-// looksLikeRawHTML is a simple heuristic to check for leftover HTML or
-// go-readability noise (e.g., "readability-page-1").
+// fallbackReadability tries go-readability if title/desc is missing.
+func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
+	if title != "" && desc != "" {
+		return title, desc, keywords
+	}
+
+	client := &http.Client{Timeout: 15 * time.Second}
+	readReq, err := http.NewRequest("GET", pageURL, nil)
+	if err != nil {
+		printDebug("Failed to create fallbackReadability request: %v", err)
+		return title, desc, keywords
+	}
+	readReq.Header.Set("User-Agent", userAgent)
+	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
+
+	readResp, err := client.Do(readReq)
+	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
+		if err != nil {
+			printDebug("go-readability GET error for %s: %v", pageURL, err)
+		}
+		if readResp != nil {
+			readResp.Body.Close()
+		}
+		return title, desc, keywords
+	}
+	defer readResp.Body.Close()
+
+	parsedURL, parseErr := url.Parse(pageURL)
+	if parseErr != nil {
+		printDebug("Failed to parse URL: %v", parseErr)
+		return title, desc, keywords
+	}
+
+	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
+	if rdErr != nil {
+		printDebug("go-readability error for %s: %v", pageURL, rdErr)
+		return title, desc, keywords
+	}
+
+	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
+		title = article.Title
+	}
+	if desc == "" {
+		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
+			desc = article.Excerpt
+		} else if len(article.Content) > 0 {
+			snippet := article.Content
+			if len(snippet) > 200 {
+				snippet = snippet[:200] + "..."
+			}
+			if !looksLikeRawHTML(snippet) {
+				desc = snippet
+			}
+		}
+	}
+	return title, desc, keywords
+}
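Two caveats around the snippet fallback, since article.Content is sanitized HTML rather than plain text: the 200-byte slice can cut through markup, and it can also split a multi-byte UTF-8 character. go-readability also exposes article.TextContent (the plain-text rendering), and a rune-based cut avoids the mid-character split; a sketch:

// truncateRunes shortens s to at most max runes, never splitting a UTF-8 sequence.
func truncateRunes(s string, max int) string {
	r := []rune(s)
	if len(r) <= max {
		return s
	}
	return string(r[:max]) + "..."
}

For example, desc = truncateRunes(article.TextContent, 200) in place of byte-slicing article.Content.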
+
+// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text
 func looksLikeRawHTML(text string) bool {
 	textLower := strings.ToLower(text)
 	if strings.Contains(textLower, "readability-page") {
@@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool {
 	return false
 }
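Only the readability-page check and the final return false of this function appear in the diff; the middle is elided. For a sense of what sits between them, a plausible stand-in (the markup checks are an assumption, not the commit's code):

// Sketch of a complete heuristic; the commit's actual middle section is elided.
func looksLikeRawHTMLSketch(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
		return true // go-readability placeholder id leaked into the text
	}
	if strings.Contains(textLower, "<html") || strings.Contains(textLower, "<div") {
		return true // leftover markup instead of prose (assumed check)
	}
	return false
}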
 
-// sanitize removes pipes and newlines so they don't break our output format.
+// sanitize removes pipes/newlines so they don't break our output format.
 func sanitize(input string) string {
 	input = strings.ReplaceAll(input, "|", " ")
 	input = strings.ReplaceAll(input, "\n", " ")