package main import ( "context" "net/http" "net/url" "strings" "time" "github.com/chromedp/cdproto/emulation" "github.com/chromedp/chromedp" "github.com/go-shiori/go-readability" "golang.org/x/net/html" ) // fetchPageMetadataStandard tries standard HTML parse + go-readability only. func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) { // 1. Standard HTML parse title, desc, keywords := extractStandard(pageURL, userAgent) // 2. Fallback: go-readability if title == "" || desc == "" { title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords) } // If still empty, return ("", "", "") if title == "" || desc == "" { return "", "", "" } return sanitize(title), sanitize(desc), sanitize(keywords) } // fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages. func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) { // Create a custom allocator context for Chromedp with proxy support if enabled allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...) defer cancelAlloc() // Create a browser context ctx, cancel := chromedp.NewContext(allocCtx) defer cancel() var renderedHTML string err := chromedp.Run(ctx, emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"), chromedp.Navigate(pageURL), chromedp.Sleep(2*time.Second), // Let JS run a bit chromedp.OuterHTML("html", &renderedHTML), ) if err != nil { printDebug("chromedp error for %s: %v", pageURL, err) return "", "", "" } doc, err := html.Parse(strings.NewReader(renderedHTML)) if err != nil { printDebug("chromedp parse error for %s: %v", pageURL, err) return "", "", "" } return extractParsedDOM(doc) } // configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled. func configureChromeOptions() []chromedp.ExecAllocatorOption { options := chromedp.DefaultExecAllocatorOptions[:] if config.CrawlerProxyEnabled && crawlerProxyClient != nil { // Retrieve proxy settings from CrawlerProxy proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client if proxy != "" { options = append(options, chromedp.ProxyServer(proxy)) printDebug("Using CrawlerProxy for Chromedp: %s", proxy) } else { printWarn("CrawlerProxy is enabled but no valid proxy is available") } } // // Add additional Chrome // options = append(options, // chromedp.Flag("headless", true), // chromedp.Flag("disable-gpu", true), // chromedp.Flag("no-sandbox", true), // chromedp.Flag("disable-setuid-sandbox", true), // ) return options } // extractStandard does the normal HTML parse with OG, Twitter, etc. func extractStandard(pageURL, userAgent string) (title, desc, keywords string) { client := &http.Client{Timeout: 15 * time.Second} req, err := http.NewRequest("GET", pageURL, nil) if err != nil { printDebug("Failed to create request for %s: %v", pageURL, err) return } req.Header.Set("User-Agent", userAgent) req.Header.Set("Accept-Language", "en-US,en;q=0.9") // Use CrawlerProxy if enabled var resp *http.Response if config.CrawlerProxyEnabled && crawlerProxyClient != nil { resp, err = crawlerProxyClient.Do(req) } else { resp, err = client.Do(req) } if err != nil { printDebug("Failed to GET %s: %v", pageURL, err) return } defer resp.Body.Close() if resp.StatusCode < 200 || resp.StatusCode >= 300 { printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode) return } doc, err := html.Parse(resp.Body) if err != nil { printDebug("HTML parse error for %s: %v", pageURL, err) return } return extractParsedDOM(doc) } // extractParsedDOM uses the same logic to parse , meta, OG, Twitter. func extractParsedDOM(doc *html.Node) (title, desc, keywords string) { var ogTitle, ogDesc string var twTitle, twDesc string var foundTitle, foundDesc bool var walk func(*html.Node) walk = func(n *html.Node) { if n.Type == html.ElementNode { switch strings.ToLower(n.Data) { case "title": if n.FirstChild != nil { title = n.FirstChild.Data foundTitle = true } case "meta": var metaName, metaProperty, contentVal string for _, attr := range n.Attr { switch strings.ToLower(attr.Key) { case "name": metaName = strings.ToLower(attr.Val) case "property": metaProperty = strings.ToLower(attr.Val) case "content": contentVal = attr.Val } } switch metaName { case "description": desc = contentVal foundDesc = true case "keywords": keywords = contentVal case "twitter:title": twTitle = contentVal case "twitter:description": twDesc = contentVal } switch metaProperty { case "og:title": ogTitle = contentVal case "og:description": ogDesc = contentVal } } } for c := n.FirstChild; c != nil; c = c.NextSibling { walk(c) } } walk(doc) // fallback to OG/Twitter if missing if !foundTitle { if ogTitle != "" { title = ogTitle } else if twTitle != "" { title = twTitle } } if !foundDesc { if ogDesc != "" { desc = ogDesc } else if twDesc != "" { desc = twDesc } } // Heuristic check if looksLikeRawHTML(title) { title = "" } if looksLikeRawHTML(desc) { desc = "" } return title, desc, keywords } // fallbackReadability tries go-readability if title/desc is missing. func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) { if title != "" && desc != "" { return title, desc, keywords } client := &http.Client{Timeout: 15 * time.Second} readReq, err := http.NewRequest("GET", pageURL, nil) if err != nil { printDebug("Failed to create fallbackReadability request: %v", err) return title, desc, keywords } readReq.Header.Set("User-Agent", userAgent) readReq.Header.Set("Accept-Language", "en-US,en;q=0.9") // Use CrawlerProxy if enabled var readResp *http.Response if config.CrawlerProxyEnabled && crawlerProxyClient != nil { readResp, err = crawlerProxyClient.Do(readReq) } else { readResp, err = client.Do(readReq) } if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 { if err != nil { printDebug("go-readability GET error for %s: %v", pageURL, err) } if readResp != nil { readResp.Body.Close() } return title, desc, keywords } defer readResp.Body.Close() parsedURL, parseErr := url.Parse(pageURL) if parseErr != nil { printDebug("Failed to parse URL: %v", parseErr) return title, desc, keywords } article, rdErr := readability.FromReader(readResp.Body, parsedURL) if rdErr != nil { printDebug("go-readability error for %s: %v", pageURL, rdErr) return title, desc, keywords } if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) { title = article.Title } if desc == "" { if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) { desc = article.Excerpt } else if len(article.Content) > 0 { snippet := article.Content if len(snippet) > 200 { snippet = snippet[:200] + "..." } if !looksLikeRawHTML(snippet) { desc = snippet } } } return title, desc, keywords } // looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text func looksLikeRawHTML(text string) bool { textLower := strings.ToLower(text) if strings.Contains(textLower, "readability-page") { return true } if strings.Count(textLower, "<div") > 0 || strings.Count(textLower, "<p") > 2 { return true } return false } // sanitize removes pipes/newlines so they don't break our output format. func sanitize(input string) string { input = strings.ReplaceAll(input, "|", " ") input = strings.ReplaceAll(input, "\n", " ") return strings.TrimSpace(input) }