added SOCKS5 proxy support
All checks were successful
Run Integration Tests / test (push) Successful in 33s

partisan 2025-01-12 16:46:52 +01:00
parent 234f1dd3be
commit 614ce8903e
22 changed files with 501 additions and 106 deletions

@@ -32,8 +32,12 @@ func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, strin
 // fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
 func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
-	// Create context
-	ctx, cancel := chromedp.NewContext(context.Background())
+	// Create a custom allocator context for Chromedp with proxy support if enabled
+	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
+	defer cancelAlloc()
+
+	// Create a browser context
+	ctx, cancel := chromedp.NewContext(allocCtx)
 	defer cancel()
 
 	var renderedHTML string
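For context: this hunk swaps chromedp.NewContext's default allocator for chromedp.NewExecAllocator so that the proxy flags built by configureChromeOptions() actually reach the Chrome process. A minimal standalone sketch of the same pattern, assuming a placeholder SOCKS5 address that is not taken from this commit:

```go
package main

import (
	"context"
	"log"
	"time"

	"github.com/chromedp/chromedp"
)

func main() {
	// Start from chromedp's defaults and add a proxy flag; Chrome accepts
	// socks5:// schemes in --proxy-server, which is what ProxyServer sets.
	opts := append(
		chromedp.DefaultExecAllocatorOptions[:],
		chromedp.ProxyServer("socks5://127.0.0.1:1080"), // placeholder address
	)

	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancelAlloc()

	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
	defer cancelTimeout()

	var html string
	if err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"),
		chromedp.OuterHTML("html", &html),
	); err != nil {
		log.Fatal(err)
	}
	log.Printf("fetched %d bytes through the proxy", len(html))
}
```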
@@ -57,6 +61,32 @@ func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string)
 	return extractParsedDOM(doc)
 }
 
+// configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled.
+func configureChromeOptions() []chromedp.ExecAllocatorOption {
+	options := chromedp.DefaultExecAllocatorOptions[:]
+
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		// Retrieve proxy settings from CrawlerProxy
+		proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client
+		if proxy != "" {
+			options = append(options, chromedp.ProxyServer(proxy))
+			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
+		} else {
+			printWarn("CrawlerProxy is enabled but no valid proxy is available")
+		}
+	}
+
+	// Add additional Chrome flags if needed:
+	// options = append(options,
+	// 	chromedp.Flag("headless", true),
+	// 	chromedp.Flag("disable-gpu", true),
+	// 	chromedp.Flag("no-sandbox", true),
+	// 	chromedp.Flag("disable-setuid-sandbox", true),
+	// )
+
+	return options
+}
+
 // extractStandard does the normal HTML parse with OG, Twitter, etc.
 func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
 	client := &http.Client{Timeout: 15 * time.Second}
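The GetProxy call above is flagged by the author's own comment as something the proxy client must provide, but its shape is not part of this diff. One plausible implementation, with illustrative type and field names, might look like:

```go
package main

import "fmt"

// CrawlerProxy is a hypothetical stand-in for the commit's proxy client;
// the real type and its fields are not shown in this diff.
type CrawlerProxy struct {
	host string // e.g. "127.0.0.1"
	port int    // e.g. 1080
}

// GetProxy returns the address in the scheme://host:port form that
// Chrome's --proxy-server flag (set via chromedp.ProxyServer) expects,
// or "" when no proxy is configured.
func (c *CrawlerProxy) GetProxy() string {
	if c == nil || c.host == "" {
		return ""
	}
	return fmt.Sprintf("socks5://%s:%d", c.host, c.port)
}
```

Returning "" for the unconfigured case is what lets the `if proxy != ""` guard in configureChromeOptions fall through to the warning branch.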
@@ -68,7 +98,13 @@ func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
 	req.Header.Set("User-Agent", userAgent)
 	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
-	resp, err := client.Do(req)
+	// Use CrawlerProxy if enabled
+	var resp *http.Response
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		resp, err = crawlerProxyClient.Do(req)
+	} else {
+		resp, err = client.Do(req)
+	}
 	if err != nil {
 		printDebug("Failed to GET %s: %v", pageURL, err)
 		return
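This hunk (and the identical one in fallbackReadability below) routes plain HTTP fetches through crawlerProxyClient.Do when the proxy is enabled. The commit does not show how that client is constructed, but given the SOCKS5 support named in the title, a sketch of one way to build such a client with golang.org/x/net/proxy, using assumed names throughout:

```go
package main

import (
	"net/http"
	"time"

	"golang.org/x/net/proxy"
)

// newSOCKS5Client is a hypothetical constructor: it returns an *http.Client
// whose transport dials every connection through a SOCKS5 proxy at addr
// (host:port, e.g. "127.0.0.1:1080" -- a placeholder, not from the commit).
func newSOCKS5Client(addr string) (*http.Client, error) {
	dialer, err := proxy.SOCKS5("tcp", addr, nil, proxy.Direct)
	if err != nil {
		return nil, err
	}
	return &http.Client{
		Transport: &http.Transport{Dial: dialer.Dial},
		Timeout:   15 * time.Second, // mirrors the client in extractStandard
	}, nil
}
```

A client built this way satisfies the same Do(req) call used in both branches, so the if/else above remains the only place that decides whether traffic goes through the proxy.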
@@ -185,7 +221,13 @@ func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (stri
 	readReq.Header.Set("User-Agent", userAgent)
 	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
-	readResp, err := client.Do(readReq)
+	// Use CrawlerProxy if enabled
+	var readResp *http.Response
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		readResp, err = crawlerProxyClient.Do(readReq)
+	} else {
+		readResp, err = client.Do(readReq)
+	}
 	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
 		if err != nil {
 			printDebug("go-readability GET error for %s: %v", pageURL, err)