Search/crawler-extraction.go
package main

import (
	"context"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
	// 1. Standard HTML parse
	title, desc, keywords := extractStandard(pageURL, userAgent)

	// 2. Fallback: go-readability
	if title == "" || desc == "" {
		title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
	}

	// If still empty, return ("", "", "")
	if title == "" || desc == "" {
		return "", "", ""
	}

	return sanitize(title), sanitize(desc), sanitize(keywords)
}
// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
	// Create a custom allocator context for Chromedp with proxy support if enabled
	allocCtx, cancelAlloc := chromedp.NewExecAllocator(context.Background(), configureChromeOptions()...)
	defer cancelAlloc()

	// Create a browser context
	ctx, cancel := chromedp.NewContext(allocCtx)
	defer cancel()

	var renderedHTML string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // Let JS run a bit
		chromedp.OuterHTML("html", &renderedHTML),
	)
	if err != nil {
		printDebug("chromedp error for %s: %v", pageURL, err)
		return "", "", ""
	}

	doc, err := html.Parse(strings.NewReader(renderedHTML))
	if err != nil {
		printDebug("chromedp parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	return extractParsedDOM(doc)
}
// configureChromeOptions sets up Chrome options and proxy if CrawlerProxy is enabled.
func configureChromeOptions() []chromedp.ExecAllocatorOption {
	options := chromedp.DefaultExecAllocatorOptions[:]

	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		// Retrieve proxy settings from CrawlerProxy
		proxy := crawlerProxyClient.GetProxy() // Ensure a `GetProxy` method is implemented for your proxy client
		if proxy != "" {
			options = append(options, chromedp.ProxyServer(proxy))
			printDebug("Using CrawlerProxy for Chromedp: %s", proxy)
		} else {
			printWarn("CrawlerProxy is enabled but no valid proxy is available")
		}
	}

	// Optionally add additional Chrome flags:
	// options = append(options,
	// 	chromedp.Flag("headless", true),
	// 	chromedp.Flag("disable-gpu", true),
	// 	chromedp.Flag("no-sandbox", true),
	// 	chromedp.Flag("disable-setuid-sandbox", true),
	// )

	return options
}
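
// Note: GetProxy is not defined in this file. A minimal sketch of what it
// could look like, assuming the crawler proxy client simply stores the
// configured SOCKS5 address (the type and field names below are illustrative
// assumptions, not the actual implementation):
//
//	func (c *CrawlerProxyClient) GetProxy() string {
//		if c == nil || c.proxyAddr == "" {
//			return ""
//		}
//		return c.proxyAddr // e.g. "socks5://127.0.0.1:1080"
//	}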
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
	client := &http.Client{Timeout: 15 * time.Second}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return
	}
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var resp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		resp, err = crawlerProxyClient.Do(req)
	} else {
		resp, err = client.Do(req)
	}
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return
	}

	return extractParsedDOM(doc)
}
// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
	var ogTitle, ogDesc string
	var twTitle, twDesc string
	var foundTitle, foundDesc bool

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				if n.FirstChild != nil {
					title = n.FirstChild.Data
					foundTitle = true
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}

				switch metaName {
				case "description":
					desc = contentVal
					foundDesc = true
				case "keywords":
					keywords = contentVal
				case "twitter:title":
					twTitle = contentVal
				case "twitter:description":
					twDesc = contentVal
				}

				switch metaProperty {
				case "og:title":
					ogTitle = contentVal
				case "og:description":
					ogDesc = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// fallback to OG/Twitter if missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
		} else if twTitle != "" {
			title = twTitle
		}
	}
	if !foundDesc {
		if ogDesc != "" {
			desc = ogDesc
		} else if twDesc != "" {
			desc = twDesc
		}
	}

	// Heuristic check
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	return title, desc, keywords
}
// fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
	if title != "" && desc != "" {
		return title, desc, keywords
	}

	client := &http.Client{Timeout: 15 * time.Second}
	readReq, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create fallbackReadability request: %v", err)
		return title, desc, keywords
	}
	readReq.Header.Set("User-Agent", userAgent)
	readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")

	// Use CrawlerProxy if enabled
	var readResp *http.Response
	if config.CrawlerProxyEnabled && crawlerProxyClient != nil {
		readResp, err = crawlerProxyClient.Do(readReq)
	} else {
		readResp, err = client.Do(readReq)
	}
	if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
		if err != nil {
			printDebug("go-readability GET error for %s: %v", pageURL, err)
		}
		if readResp != nil {
			readResp.Body.Close()
		}
		return title, desc, keywords
	}
	defer readResp.Body.Close()

	parsedURL, parseErr := url.Parse(pageURL)
	if parseErr != nil {
		printDebug("Failed to parse URL: %v", parseErr)
		return title, desc, keywords
	}

	article, rdErr := readability.FromReader(readResp.Body, parsedURL)
	if rdErr != nil {
		printDebug("go-readability error for %s: %v", pageURL, rdErr)
		return title, desc, keywords
	}

	if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
		title = article.Title
	}
	if desc == "" {
		if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
			desc = article.Excerpt
		} else if len(article.Content) > 0 {
			snippet := article.Content
			if len(snippet) > 200 {
				snippet = snippet[:200] + "..."
			}
			if !looksLikeRawHTML(snippet) {
				desc = snippet
			}
		}
	}

	return title, desc, keywords
}
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	if strings.Contains(textLower, "readability-page") {
		return true
	}
	if strings.Count(textLower, "<div") > 0 || strings.Count(textLower, "<p") > 2 {
		return true
	}
	return false
}
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
	input = strings.ReplaceAll(input, "|", " ")
	input = strings.ReplaceAll(input, "\n", " ")
	return strings.TrimSpace(input)
}