// crawler-extraction.go — page metadata extraction for the search crawler.
package main
import (
"net/http"
"net/url"
"strings"
"time"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
)
// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
// OG, Twitter, then falls back to go-readability if needed. If after all that we
// still have no title or no description, we return ("", "", "") so the caller
// can skip saving it.
//
// Order of precedence:
//  1. <title>, <meta name="description"/>, <meta name="keywords"/>
//  2. <meta property="og:title">, <meta property="og:description">
//  3. <meta name="twitter:title">, <metata name="twitter:description"> tags
//  4. go-readability fallback (if title or description is still missing)
//  5. Basic heuristic to detect “wrong” content from readability
//     (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadata(pageURL string) (string, string, string) {
	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}
	client := &http.Client{Timeout: 15 * time.Second}

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}
	// Force English content when possible
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Skip non-2xx
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	// First pass: standard HTML parse
	doc, err := html.Parse(resp.Body)
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	var (
		title, desc, keywords string
		ogTitle, ogDesc       string
		twTitle, twDesc       string
		foundTitle, foundDesc bool
	)

	// Walk the DOM once, collecting <title> plus every meta variant we care about.
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				// Only the first text child is used; nested markup is ignored.
				if n.FirstChild != nil {
					title = n.FirstChild.Data
					foundTitle = true
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = attr.Val
					}
				}
				// Standard and Twitter meta tags (keyed by name="...")
				switch metaName {
				case "description":
					desc = contentVal
					foundDesc = true
				case "keywords":
					keywords = contentVal
				case "twitter:title":
					twTitle = contentVal
				case "twitter:description":
					twDesc = contentVal
				}
				// Open Graph tags (keyed by property="...")
				switch metaProperty {
				case "og:title":
					ogTitle = contentVal
				case "og:description":
					ogDesc = contentVal
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Fallback to OG or Twitter if <title>/description are missing
	if !foundTitle {
		if ogTitle != "" {
			title = ogTitle
		} else if twTitle != "" {
			title = twTitle
		}
	}
	if !foundDesc {
		if ogDesc != "" {
			desc = ogDesc
		} else if twDesc != "" {
			desc = twDesc
		}
	}

	// If still missing title or desc, fallback to go-readability
	if title == "" || desc == "" {
		parsedURL, parseErr := url.Parse(pageURL)
		if parseErr != nil {
			printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
			// We must skip if we can't parse the URL for readability
			return "", "", ""
		}
		// Re-fetch with the same headers as the first request (the first body
		// was already consumed by html.Parse above). The previous version used
		// a bare client.Get here, silently dropping User-Agent/Accept-Language.
		readReq, readReqErr := http.NewRequest("GET", pageURL, nil)
		if readReqErr == nil {
			readReq.Header.Set("User-Agent", userAgent)
			readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
			readResp, readErr := client.Do(readReq)
			if readErr == nil {
				// Close the body unconditionally: the previous version only
				// closed it on 2xx, leaking the connection on other statuses.
				defer readResp.Body.Close()
				if readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
					article, rdErr := readability.FromReader(readResp.Body, parsedURL)
					if rdErr == nil {
						// If we still have no title, try from readability
						if title == "" && article.Title != "" {
							title = article.Title
						}
						// If we still have no description, try article.Excerpt
						if desc == "" && article.Excerpt != "" {
							desc = article.Excerpt
						} else if desc == "" && len(article.Content) > 0 {
							// If excerpt is empty, use a snippet from article.Content
							snippet := article.Content
							if len(snippet) > 200 {
								snippet = snippet[:200] + "..."
							}
							desc = snippet
						}
					} else {
						printDebug("go-readability failed for %s: %v", pageURL, rdErr)
					}
				}
			}
		}
	}

	// Heuristic: discard obviously incorrect HTML-y strings or placeholders
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	// If after all that we have no title or description, skip
	if title == "" || desc == "" {
		return "", "", ""
	}
	return sanitize(title), sanitize(desc), sanitize(keywords)
}
// looksLikeRawHTML reports whether text appears to contain leftover HTML
// markup or go-readability noise (e.g., "readability-page-1"), so the
// caller can discard it as a title/description candidate.
func looksLikeRawHTML(text string) bool {
	lower := strings.ToLower(text)
	switch {
	case strings.Contains(lower, "readability-page"):
		// go-readability placeholder id leaked into the extracted text.
		return true
	case strings.Contains(lower, "<div"):
		// Any div tag at all means raw markup slipped through.
		return true
	case strings.Count(lower, "<p") > 2:
		// More than two paragraph-ish tags: treat as raw HTML.
		return true
	default:
		return false
	}
}
// sanitize maps pipe and newline characters to spaces (they would break the
// crawler's pipe-delimited output format) and trims surrounding whitespace.
func sanitize(input string) string {
	cleaned := strings.NewReplacer("|", " ", "\n", " ").Replace(input)
	return strings.TrimSpace(cleaned)
}