// crawler-extraction.go — page metadata extraction for the search crawler.
package main
import (
"net/http"
"net/url"
"strings"
"time"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
)
// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
// OG, Twitter, then falls back to go-readability if needed. If after all that we
// still have no title or no description, we return ("", "", "") so the caller
// can skip saving it.
//
// Order of precedence:
//  1. <title>, <meta name="description"/>, <meta name="keywords"/>
//  2. <meta property="og:title">, <meta property="og:description">
//  3. <meta name="twitter:title">, <metata name="twitter:description"> tags
//  4. go-readability fallback (if title or description is still missing)
//  5. Basic heuristic to detect “wrong” content from readability
//     (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadata(pageURL string) (string, string, string) {
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}
	client := &http.Client{Timeout: 15 * time.Second}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}
	// Force English content when possible
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Skip non-2xx
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// First pass: standard HTML parse
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	var (
		title, desc, keywords string
		ogTitle, ogDesc       string
		twTitle, twDesc       string
		foundTitle, foundDesc bool
	)

	// Walk the DOM once, collecting <title> plus every meta variant we care about.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				// Only the first text child is used; nested markup is ignored.
				if n.FirstChild != nil {
					title = n.FirstChild.Data
					foundTitle = true
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}
				// Standard and Twitter meta tags (keyed by name="...")
				switch metaName {
				case "description":
					desc = contentVal
					foundDesc = true
				case "keywords":
					keywords = contentVal
				case "twitter:title":
					twTitle = contentVal
				case "twitter:description":
					twDesc = contentVal
				}
				// Open Graph tags (keyed by property="...")
				switch metaProperty {
				case "og:title":
					ogTitle = contentVal
				case "og:description":
					ogDesc = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Fallback to OG or Twitter if <title>/description are missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
		} else if twTitle != "" {
			title = twTitle
		}
	}
	if !foundDesc {
		if ogDesc != "" {
			desc = ogDesc
		} else if twDesc != "" {
			desc = twDesc
		}
	}

	// If still missing title or desc, fallback to go-readability
	if title == "" || desc == "" {
		parsedURL, parseErr := url.Parse(pageURL)
		if parseErr != nil {
			printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
			// We must skip if we can't parse the URL for readability
			return "", "", ""
		}
		// Re-fetch with the same headers as the first request (the first body
		// was already consumed by html.Parse above). The previous version used
		// a bare client.Get here, silently dropping User-Agent/Accept-Language.
		readReq, readReqErr := http.NewRequest("GET", pageURL, nil)
		if readReqErr == nil {
			readReq.Header.Set("User-Agent", userAgent)
			readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
			readResp, readErr := client.Do(readReq)
			if readErr == nil {
				// Close the body unconditionally: the previous version only
				// closed it on 2xx, leaking the connection on other statuses.
				defer readResp.Body.Close()
				if readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
					article, rdErr := readability.FromReader(readResp.Body, parsedURL)
					if rdErr == nil {
						// If we still have no title, try from readability
						if title == "" && article.Title != "" {
							title = article.Title
						}
						// If we still have no description, try article.Excerpt
						if desc == "" && article.Excerpt != "" {
							desc = article.Excerpt
						} else if desc == "" && len(article.Content) > 0 {
							// If excerpt is empty, use a snippet from article.Content
							snippet := article.Content
							if len(snippet) > 200 {
								snippet = snippet[:200] + "..."
							}
							desc = snippet
						}
					} else {
						printDebug("go-readability failed for %s: %v", pageURL, rdErr)
					}
				}
			}
		}
	}

	// Heuristic: discard obviously incorrect HTML-y strings or placeholders
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	// If after all that we have no title or description, skip
	if title == "" || desc == "" {
		return "", "", ""
	}
	return sanitize(title), sanitize(desc), sanitize(keywords)
}
// looksLikeRawHTML reports whether text appears to contain leftover HTML
// markup or go-readability noise (e.g., "readability-page-1"), so the
// caller can discard it as a title/description candidate.
func looksLikeRawHTML(text string) bool {
	lower := strings.ToLower(text)
	switch {
	case strings.Contains(lower, "readability-page"):
		// go-readability placeholder id leaked into the extracted text.
		return true
	case strings.Contains(lower, "<div"):
		// Any div tag at all means raw markup slipped through.
		return true
	case strings.Count(lower, "<p") > 2:
		// More than two paragraph-ish tags: treat as raw HTML.
		return true
	default:
		return false
	}
}
// sanitize maps pipe and newline characters to spaces (they would break the
// crawler's pipe-delimited output format) and trims surrounding whitespace.
func sanitize(input string) string {
	cleaned := strings.NewReplacer("|", " ", "\n", " ").Replace(input)
	return strings.TrimSpace(cleaned)
}