package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
// fetchPageMetadataStandard tries a standard HTML parse plus a go-readability
// fallback only; it never launches a browser.
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, give up and return ("", "", "").
	if title == "" || desc == "" {
		return "", "", ""
	}
	return sanitize(title), sanitize(desc), sanitize(keywords)
}
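
// fetchPageMetadata is an illustrative sketch, not part of the original file:
// one way a caller could chain the two fetchers, trying the cheap standard
// path first and paying for a headless browser only when it comes back empty.
func fetchPageMetadata(pageURL, userAgent string) (string, string, string) {
	if title, desc, keywords := fetchPageMetadataStandard(pageURL, userAgent); title != "" {
		return title, desc, keywords
	}
	return fetchPageMetadataChrome(pageURL, userAgent)
}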
// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a custom allocator context for Chromedp, with proxy support if enabled.
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
	defer cancelAlloc()

	// Create a browser context.
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	return extractParsedDOM(doc)
}
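
// runWithTimeout is a sketch of a guard worth considering above: chromedp.Run
// blocks until every action finishes, so a page that never settles could
// stall the crawler indefinitely. Deriving a deadline-bounded context caps
// the whole render. The helper name and the configurable budget are
// assumptions, not part of the original code.
func runWithTimeout(parent context.Context, timeout time.Duration, actions ...chromedp.Action) error {
	ctx, cancel := context.WithTimeout(parent, timeout)
	defer cancel()
	return chromedp.Run(ctx, actions...)
}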
// configureChromeOptions sets up Chrome options and adds a proxy when
// CrawlerProxy is enabled.
func configureChromeOptions() []chromedp.ExecAllocatorOption {
	options := chromedp.DefaultExecAllocatorOptions[:]

	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		// Retrieve proxy settings from CrawlerProxy.
		proxy := crawlerProxyClient.GetProxy() // The proxy client must implement a GetProxy method.
		if proxy != "" {
			options = append(options, chromedp.ProxyServer(proxy))
			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
		} else {
			printWarn("CrawlerProxy is enabled but no valid proxy is available")
		}
	}

	// Additional Chrome flags can be enabled here if needed:
	// options = append(options,
	// 	chromedp.Flag("headless", true),
	// 	chromedp.Flag("disable-gpu", true),
	// 	chromedp.Flag("no-sandbox", true),
	// 	chromedp.Flag("disable-setuid-sandbox", true),
	// )

	return options
}
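
// withExtraChromeFlags is a hypothetical wrapper showing how the optional
// flags commented out above could be gated behind a parameter instead of
// being toggled by editing the source. chromedp.Flag is the real API; the
// function itself is only a sketch.
func withExtraChromeFlags(headless bool) []chromedp.ExecAllocatorOption {
	options := configureChromeOptions()
	return append(options,
		chromedp.Flag("headless", headless),
		chromedp.Flag("disable-gpu", true),
	)
}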
// extractStandard does a plain HTTP GET and parses <title>, meta
// description/keywords, and OG/Twitter tags.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var resp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		resp, err = crawlerProxyClient.Do(req)
	} else {
		resp, err = client.Do(req)
	}
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}

	return extractParsedDOM(doc)
}
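
// doWithOptionalProxy is a sketch of a helper that would deduplicate the
// proxy-or-direct request logic repeated in extractStandard above and
// fallbackReadability below; config and crawlerProxyClient are the same
// package-level values those functions already use.
func doWithOptionalProxy(client *http.Client, req *http.Request) (*http.Response, error) {
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		return crawlerProxyClient.Do(req)
	}
	return client.Do(req)
}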
// extractParsedDOM walks a parsed DOM and pulls out <title>, the meta
// description/keywords, and OG/Twitter tags as fallbacks.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
	var ogTitle, ogDesc string
	var twTitle, twDesc string
	var foundTitle, foundDesc bool

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				if n.FirstChild != nil {
					title = n.FirstChild.Data
					foundTitle = true
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}

				switch metaName {
				case "description":
					desc = contentVal
					foundDesc = true
				case "keywords":
					keywords = contentVal
				case "twitter:title":
					twTitle = contentVal
				case "twitter:description":
					twDesc = contentVal
				}

				switch metaProperty {
				case "og:title":
					ogTitle = contentVal
				case "og:description":
					ogDesc = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Fall back to OG/Twitter values if <title> or the description is missing.
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
		} else if twTitle != "" {
			title = twTitle
		}
	}
	if !foundDesc {
		if ogDesc != "" {
			desc = ogDesc
		} else if twDesc != "" {
			desc = twDesc
		}
	}

	// Heuristic check: discard values that still look like raw markup.
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	return title, desc, keywords
}
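
// exampleExtractParsedDOM is a purely illustrative usage of extractParsedDOM,
// not part of the original file: the document has no <title> and no meta
// description, so the OpenGraph fallbacks supply both values and keywords
// stay empty.
func exampleExtractParsedDOM() (string, string, string) {
	const page = `<html><head>
		<meta property="og:title" content="OG Title">
		<meta property="og:description" content="OG description.">
	</head><body></body></html>`
	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		return "", "", ""
	}
	return extractParsedDOM(doc) // -> "OG Title", "OG description.", ""
}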
// fallbackReadability tries go-readability if the title or description is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
	if title != "" && desc != "" {
		return title, desc, keywords
	}

	client := &http.Client{Timeout: 15 * time.Second}
	readReq, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create fallbackReadability request: %v", err)
		return title, desc, keywords
	}
	readReq.Header.Set("User-Agent", userAgent)
	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var readResp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		readResp, err = crawlerProxyClient.Do(readReq)
	} else {
		readResp, err = client.Do(readReq)
	}
	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
		if err != nil {
			printDebug("go-readability GET error for %s: %v", pageURL, err)
		}
		if readResp != nil {
			readResp.Body.Close()
		}
		return title, desc, keywords
	}
	defer readResp.Body.Close()

	parsedURL, parseErr := url.Parse(pageURL)
	if parseErr != nil {
		printDebug("Failed to parse URL: %v", parseErr)
		return title, desc, keywords
	}

	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
	if rdErr != nil {
		printDebug("go-readability error for %s: %v", pageURL, rdErr)
		return title, desc, keywords
	}

	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
		title = article.Title
	}
	if desc == "" {
		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
			desc = article.Excerpt
		} else if len(article.Content) > 0 {
			snippet := article.Content
			if len(snippet) > 200 {
				snippet = snippet[:200] + "..."
			}
			if !looksLikeRawHTML(snippet) {
				desc = snippet
			}
		}
	}
	return title, desc, keywords
}
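
// truncateRunes is a sketch of a rune-safe alternative to the byte slice
// above: snippet[:200] can cut a multi-byte UTF-8 character in half, while
// truncating on the rune count cannot. The name and limit are assumptions.
func truncateRunes(s string, max int) string {
	runes := []rune(s)
	if len(runes) <= max {
		return s
	}
	return string(runes[:max]) + "..."
}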
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
		return true
	}
	if strings.Contains(textLower, "<div") || strings.Count(textLower, "<p") > 2 {
		return true
	}
	return false
}
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	return strings.TrimSpace(input)
}
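
// exampleSanitize illustrates why sanitize exists: the comment above implies
// results are serialized in a pipe-delimited, line-oriented format, so a pipe
// or newline inside a title or description would corrupt a record. This
// function is illustrative only, not part of the original file.
func exampleSanitize() string {
	// The pipe and the newline are both replaced with spaces, and the
	// result is trimmed, yielding "A   B C".
	return sanitize("A | B\nC")
}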