added SOCKS5 proxy support
All checks were successful
Run Integration Tests / test (push) Successful in 33s
This commit is contained in:
parent 234f1dd3be
commit 614ce8903e

22 changed files with 501 additions and 106 deletions
@@ -32,8 +32,12 @@ func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string)
 // fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
 func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
-	// Create context
-	ctx, cancel := chromedp.NewContext(context.Background())
+	// Create a custom allocator context for Chromedp with proxy support if enabled
+	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
+	defer cancelAlloc()
+
+	// Create a browser context
+	ctx, cancel := chromedp.NewContext(allocCtx)
 	defer cancel()
 
 	var renderedHTML string
 
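The hunk above swaps the default `chromedp.NewContext` for an explicit `ExecAllocator`, so browser-level options, including the proxy, can be injected before the browser starts. Below is a minimal, self-contained sketch of the same pattern; the SOCKS5 address and target URL are placeholders, not values from this commit:

```go
package main

import (
	"context"
	"fmt"

	"github.com/chromedp/chromedp"
)

func main() {
	// Placeholder SOCKS5 endpoint. chromedp.ProxyServer sets Chrome's
	// --proxy-server flag, which accepts "socks5://host:port" URLs.
	opts := append(chromedp.DefaultExecAllocatorOptions[:],
		chromedp.ProxyServer("socks5://127.0.0.1:9050"),
	)
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), opts...)
	defer cancelAlloc()

	// Browser context derived from the proxied allocator, mirroring the hunk above.
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var renderedHTML string
	if err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"),
		chromedp.OuterHTML("html", &renderedHTML),
	); err != nil {
		fmt.Println("chromedp error:", err)
		return
	}
	fmt.Println(len(renderedHTML), "bytes rendered through the proxy")
}
```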
@@ -57,6 +61,32 @@ func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string)
 	return extractParsedDOM(doc)
 }
 
+// configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled.
+func configureChromeOptions() []chromedp.ExecAllocatorOption {
+	options := chromedp.DefaultExecAllocatorOptions[:]
+
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		// Retrieve proxy settings from CrawlerProxy
+		proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client
+		if proxy != "" {
+			options = append(options, chromedp.ProxyServer(proxy))
+			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
+		} else {
+			printWarn("CrawlerProxy is enabled but no valid proxy is available")
+		}
+	}
+
+	// // Add additional Chrome options
+	// options = append(options,
+	// 	chromedp.Flag("headless", true),
+	// 	chromedp.Flag("disable-gpu", true),
+	// 	chromedp.Flag("no-sandbox", true),
+	// 	chromedp.Flag("disable-setuid-sandbox", true),
+	// )
+
+	return options
+}
+
 // extractStandard does the normal HTML parse with OG, Twitter, etc.
 func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
 	client := &http.Client{Timeout: 15 * time.Second}
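`configureChromeOptions` leans on a `GetProxy` method that this diff only references in a comment. The proxy client's type is not part of the visible hunks, so the following is only a hypothetical sketch of what such a method could look like, assuming the client holds a rotating list of SOCKS5 endpoints; `CrawlerProxy`, `proxies`, and `next` are invented names:

```go
package main

import "sync"

// CrawlerProxy is a hypothetical stand-in for the proxy client referenced
// in the diff; the real type is not shown in this commit.
type CrawlerProxy struct {
	mu      sync.Mutex
	proxies []string // e.g. "socks5://127.0.0.1:9050"
	next    int
}

// GetProxy returns the next configured proxy URL, or "" when none is
// configured — the empty-string case configureChromeOptions warns about.
func (p *CrawlerProxy) GetProxy() string {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.proxies) == 0 {
		return ""
	}
	addr := p.proxies[p.next%len(p.proxies)]
	p.next++
	return addr
}

func main() {
	p := &CrawlerProxy{proxies: []string{"socks5://127.0.0.1:9050"}}
	println(p.GetProxy())
}
```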
@@ -68,7 +98,13 @@ func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
 	req.Header.Set("User-Agent", userAgent)
 	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
-	resp, err := client.Do(req)
+	// Use CrawlerProxy if enabled
+	var resp *http.Response
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		resp, err = crawlerProxyClient.Do(req)
+	} else {
+		resp, err = client.Do(req)
+	}
 	if err != nil {
 		printDebug("Failed to GET %s: %v", pageURL, err)
 		return
@@ -185,7 +221,13 @@ func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string)
 	readReq.Header.Set("User-Agent", userAgent)
 	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
-	readResp, err := client.Do(readReq)
+	// Use CrawlerProxy if enabled
+	var readResp *http.Response
+	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
+		readResp, err = crawlerProxyClient.Do(readReq)
+	} else {
+		readResp, err = client.Do(readReq)
+	}
 	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
 		if err != nil {
 			printDebug("go-readability GET error for %s: %v", pageURL, err)
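Both `extractStandard` and `fallbackReadability` apply the same change: the response is pre-declared with `var ... *http.Response` so the proxied and direct branches can assign to one variable (this compiles because `err` was already declared when the request was built earlier in each function). How `crawlerProxyClient.Do` actually tunnels over SOCKS5 is not shown in these hunks; one plausible implementation, sketched with `golang.org/x/net/proxy` — the helper name and address are assumptions, not code from this commit:

```go
package main

import (
	"net/http"
	"time"

	"golang.org/x/net/proxy"
)

// newSocks5Client builds an http.Client whose connections are dialed
// through a SOCKS5 proxy. The helper name and the bare host:port address
// format are assumptions for illustration only.
func newSocks5Client(addr string) (*http.Client, error) {
	dialer, err := proxy.SOCKS5("tcp", addr, nil, proxy.Direct)
	if err != nil {
		return nil, err
	}
	return &http.Client{
		Transport: &http.Transport{Dial: dialer.Dial},
		Timeout:   15 * time.Second,
	}, nil
}

func main() {
	client, err := newSocks5Client("127.0.0.1:9050")
	if err != nil {
		panic(err)
	}
	req, _ := http.NewRequest("GET", "https://example.com", nil)
	resp, err := client.Do(req) // plays the role of crawlerProxyClient.Do
	if err == nil {
		resp.Body.Close()
	}
}
```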