package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)

// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, return ("", "", "")
	if title == "" || desc == "" {
		return "", "", ""
	}

	return sanitize(title), sanitize(desc), sanitize(keywords)
}

// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a custom allocator context for Chromedp with proxy support if enabled
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
	defer cancelAlloc()

	// Create a browser context
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	return extractParsedDOM(doc)
}

// configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled.
func configureChromeOptions() []chromedp.ExecAllocatorOption {
	options := chromedp.DefaultExecAllocatorOptions[:]

	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		// Retrieve proxy settings from CrawlerProxy
		proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client
		if proxy != "" {
			options = append(options, chromedp.ProxyServer(proxy))
			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
		} else {
			printWarn("CrawlerProxy is enabled but no valid proxy is available")
		}
	}

	// Add additional Chrome options if needed:
	// options = append(options,
	// 	chromedp.Flag("headless", true),
	// 	chromedp.Flag("disable-gpu", true),
	// 	chromedp.Flag("no-sandbox", true),
	// 	chromedp.Flag("disable-setuid-sandbox", true),
	// )

	return options
}
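// The `GetProxy` call above assumes the proxy client exposes a method that
// returns a single proxy address (or "" when none is usable). That method is
// defined elsewhere in this codebase; the snippet below is only an illustrative
// sketch of what such a method might look like, using a hypothetical
// exampleProxyClient type with simple round-robin selection. It is not the
// project's actual implementation.
type exampleProxyClient struct {
	proxies []string // e.g. "socks5://127.0.0.1:9050"
	next    int
}

// GetProxy returns the next configured proxy address in round-robin order,
// or an empty string when no proxies are configured.
func (c *exampleProxyClient) GetProxy() string {
	if len(c.proxies) == 0 {
		return ""
	}
	p := c.proxies[c.next%len(c.proxies)]
	c.next++
	return p
}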
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var resp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		resp, err = crawlerProxyClient.Do(req)
	} else {
		resp, err = client.Do(req)
	}
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}

	return extractParsedDOM(doc)
}

// extractParsedDOM uses the same logic to parse
2 {
		return true
	}
	return false
}

// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	return strings.TrimSpace(input)
}
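// The two fetchers above are typically combined by a caller that tries the
// cheap standard fetch first and only falls back to headless Chrome when the
// page appears to need JavaScript to expose its metadata. The wrapper below is
// an illustrative sketch of that pattern; the function name is hypothetical and
// not part of the original code.
func fetchPageMetadataWithFallbackExample(pageURL, userAgent string) (string, string, string) {
	// Cheap path: plain HTTP GET, HTML parse, go-readability fallback.
	if title, desc, keywords := fetchPageMetadataStandard(pageURL, userAgent); title != "" {
		return title, desc, keywords
	}
	// Expensive path: render the page with Chromedp so JS-injected titles
	// and meta tags become visible before extraction.
	return fetchPageMetadataChrome(pageURL, userAgent)
}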