Search/crawler-extraction.go
package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, return ("", "", "")
	if title == "" || desc == "" {
		return "", "", ""
	}

	return sanitize(title), sanitize(desc), sanitize(keywords)
}
// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a custom allocator context for Chromedp with proxy support if enabled
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
	defer cancelAlloc()

	// Create a browser context
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	return extractParsedDOM(doc)
}
// configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled.
func configureChromeOptions() []chromedp.ExecAllocatorOption {
	options := chromedp.DefaultExecAllocatorOptions[:]

	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		// Retrieve proxy settings from CrawlerProxy
		proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client
		if proxy != "" {
			options = append(options, chromedp.ProxyServer(proxy))
			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
		} else {
			printWarn("CrawlerProxy is enabled but no valid proxy is available")
		}
	}

	// Optionally add additional Chrome flags:
	// options = append(options,
	// 	chromedp.Flag("headless", true),
	// 	chromedp.Flag("disable-gpu", true),
	// 	chromedp.Flag("no-sandbox", true),
	// 	chromedp.Flag("disable-setuid-sandbox", true),
	// )

	return options
}
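
// Note: GetProxy is not defined in this file. A minimal sketch of what it
// could look like, assuming the crawler proxy client simply stores the
// configured SOCKS5 address (the type and field names below are illustrative
// assumptions, not the actual implementation):
//
//	func (c *CrawlerProxyClient) GetProxy() string {
//		if c == nil || c.proxyAddr == "" {
//			return ""
//		}
//		return c.proxyAddr // e.g. "socks5://127.0.0.1:1080"
//	}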
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var resp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		resp, err = crawlerProxyClient.Do(req)
	} else {
		resp, err = client.Do(req)
	}
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}

	return extractParsedDOM(doc)
}
// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
	var ogTitle, ogDesc string
	var twTitle, twDesc string
	var foundTitle, foundDesc bool

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				if n.FirstChild != nil {
					title = n.FirstChild.Data
					foundTitle = true
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}

				switch metaName {
				case "description":
					desc = contentVal
					foundDesc = true
				case "keywords":
					keywords = contentVal
				case "twitter:title":
					twTitle = contentVal
				case "twitter:description":
					twDesc = contentVal
				}

				switch metaProperty {
				case "og:title":
					ogTitle = contentVal
				case "og:description":
					ogDesc = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// fallback to OG/Twitter if missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
		} else if twTitle != "" {
			title = twTitle
		}
	}
	if !foundDesc {
		if ogDesc != "" {
			desc = ogDesc
		} else if twDesc != "" {
			desc = twDesc
		}
	}

	// Heuristic check
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	return title, desc, keywords
}
// fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
	if title != "" && desc != "" {
		return title, desc, keywords
	}

	client := &http.Client{Timeout: 15 * time.Second}
	readReq, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create fallbackReadability request: %v", err)
		return title, desc, keywords
	}
	readReq.Header.Set("User-Agent", userAgent)
	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var readResp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		readResp, err = crawlerProxyClient.Do(readReq)
	} else {
		readResp, err = client.Do(readReq)
	}
	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
		if err != nil {
			printDebug("go-readability GET error for %s: %v", pageURL, err)
		}
		if readResp != nil {
			readResp.Body.Close()
		}
		return title, desc, keywords
	}
	defer readResp.Body.Close()

	parsedURL, parseErr := url.Parse(pageURL)
	if parseErr != nil {
		printDebug("Failed to parse URL: %v", parseErr)
		return title, desc, keywords
	}

	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
	if rdErr != nil {
		printDebug("go-readability error for %s: %v", pageURL, rdErr)
		return title, desc, keywords
	}

	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
		title = article.Title
	}
	if desc == "" {
		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
			desc = article.Excerpt
		} else if len(article.Content) > 0 {
			snippet := article.Content
			if len(snippet) > 200 {
				snippet = snippet[:200] + "..."
			}
			if !looksLikeRawHTML(snippet) {
				desc = snippet
			}
		}
	}

	return title, desc, keywords
}
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
		return true
	}
	if strings.Count(textLower, "<div") > 0 || strings.Count(textLower, "<p") > 2 {
		return true
	}
	return false
}
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	return strings.TrimSpace(input)
}