205 lines
5.4 KiB
Go
205 lines
5.4 KiB
Go
|
package main
|
||
|
|
||
|
import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)
|
||
|
|
||
|
// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
|
||
|
// OG, Twitter, then falls back to go-readability if needed. If after all that we
|
||
|
// still have no title or no description, we return ("", "", "") so the caller
|
||
|
// can skip saving it.
|
||
|
//
|
||
|
// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
|
||
|
// 2. <meta property="og:title">, <meta property="og:description">
|
||
|
// 3. <meta name="twitter:title">, <meta name="twitter:description">
|
||
|
// 4. go-readability fallback (if title or description is still missing)
|
||
|
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
|
||
|
func fetchPageMetadata(pageURL string) (string, string, string) {
|
||
|
userAgent, err := GetUserAgent("crawler")
|
||
|
if err != nil {
|
||
|
printDebug("Failed to generate User-Agent: %v", err)
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
client := &http.Client{Timeout: 15 * time.Second}
|
||
|
req, err := http.NewRequest("GET", pageURL, nil)
|
||
|
if err != nil {
|
||
|
printDebug("Failed to create request for %s: %v", pageURL, err)
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
// Force English content when possible
|
||
|
req.Header.Set("User-Agent", userAgent)
|
||
|
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||
|
|
||
|
resp, err := client.Do(req)
|
||
|
if err != nil {
|
||
|
printDebug("Failed to GET %s: %v", pageURL, err)
|
||
|
return "", "", ""
|
||
|
}
|
||
|
defer resp.Body.Close()
|
||
|
|
||
|
// Skip non-2xx
|
||
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||
|
printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
// First pass: standard HTML parse
|
||
|
doc, err := html.Parse(resp.Body)
|
||
|
if err != nil {
|
||
|
printDebug("HTML parse error for %s: %v", pageURL, err)
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
var (
|
||
|
title, desc, keywords string
|
||
|
ogTitle, ogDesc string
|
||
|
twTitle, twDesc string
|
||
|
foundTitle, foundDesc bool
|
||
|
)
|
||
|
|
||
|
var walk func(*html.Node)
|
||
|
walk = func(n *html.Node) {
|
||
|
if n.Type == html.ElementNode {
|
||
|
switch strings.ToLower(n.Data) {
|
||
|
case "title":
|
||
|
if n.FirstChild != nil {
|
||
|
title = n.FirstChild.Data
|
||
|
foundTitle = true
|
||
|
}
|
||
|
case "meta":
|
||
|
var metaName, metaProperty, contentVal string
|
||
|
for _, attr := range n.Attr {
|
||
|
switch strings.ToLower(attr.Key) {
|
||
|
case "name":
|
||
|
metaName = strings.ToLower(attr.Val)
|
||
|
case "property":
|
||
|
metaProperty = strings.ToLower(attr.Val)
|
||
|
case "content":
|
||
|
contentVal = attr.Val
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Standard meta tags
|
||
|
switch metaName {
|
||
|
case "description":
|
||
|
desc = contentVal
|
||
|
foundDesc = true
|
||
|
case "keywords":
|
||
|
keywords = contentVal
|
||
|
case "twitter:title":
|
||
|
twTitle = contentVal
|
||
|
case "twitter:description":
|
||
|
twDesc = contentVal
|
||
|
}
|
||
|
|
||
|
// Open Graph tags
|
||
|
switch metaProperty {
|
||
|
case "og:title":
|
||
|
ogTitle = contentVal
|
||
|
case "og:description":
|
||
|
ogDesc = contentVal
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
|
walk(c)
|
||
|
}
|
||
|
}
|
||
|
walk(doc)
|
||
|
|
||
|
// Fallback to OG or Twitter if <title>/description are missing
|
||
|
if !foundTitle {
|
||
|
if ogTitle != "" {
|
||
|
title = ogTitle
|
||
|
} else if twTitle != "" {
|
||
|
title = twTitle
|
||
|
}
|
||
|
}
|
||
|
if !foundDesc {
|
||
|
if ogDesc != "" {
|
||
|
desc = ogDesc
|
||
|
} else if twDesc != "" {
|
||
|
desc = twDesc
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If still missing title or desc, fallback to go-readability
|
||
|
if title == "" || desc == "" {
|
||
|
parsedURL, parseErr := url.Parse(pageURL)
|
||
|
if parseErr != nil {
|
||
|
printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
|
||
|
// We must skip if we can't parse the URL for readability
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
readResp, readErr := client.Get(pageURL)
|
||
|
if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
|
||
|
defer readResp.Body.Close()
|
||
|
|
||
|
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
|
||
|
if rdErr == nil {
|
||
|
// If we still have no title, try from readability
|
||
|
if title == "" && article.Title != "" {
|
||
|
title = article.Title
|
||
|
}
|
||
|
// If we still have no description, try article.Excerpt
|
||
|
if desc == "" && article.Excerpt != "" {
|
||
|
desc = article.Excerpt
|
||
|
} else if desc == "" && len(article.Content) > 0 {
|
||
|
// If excerpt is empty, use a snippet from article.Content
|
||
|
snippet := article.Content
|
||
|
if len(snippet) > 200 {
|
||
|
snippet = snippet[:200] + "..."
|
||
|
}
|
||
|
desc = snippet
|
||
|
}
|
||
|
} else {
|
||
|
printDebug("go-readability failed for %s: %v", pageURL, rdErr)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Heuristic: discard obviously incorrect HTML-y strings or placeholders
|
||
|
if looksLikeRawHTML(title) {
|
||
|
title = ""
|
||
|
}
|
||
|
if looksLikeRawHTML(desc) {
|
||
|
desc = ""
|
||
|
}
|
||
|
|
||
|
// If after all that we have no title or description, skip
|
||
|
if title == "" || desc == "" {
|
||
|
return "", "", ""
|
||
|
}
|
||
|
|
||
|
return sanitize(title), sanitize(desc), sanitize(keywords)
|
||
|
}
|
||
|
|
||
|
// looksLikeRawHTML is a simple heuristic to check for leftover HTML or
// go-readability noise (e.g., "readability-page-1"). It reports true when
// the text contains a readability placeholder, any "<div" tag, or more
// than two "<p" tags.
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	// go-readability wraps extracted content in ids like "readability-page-1";
	// seeing that string means we got markup, not prose.
	if strings.Contains(textLower, "readability-page") {
		return true
	}
	// strings.Contains short-circuits at the first hit, unlike Count > 0,
	// which always scans the whole string. Count is kept only where the
	// threshold (> 2) actually needs a count.
	if strings.Contains(textLower, "<div") || strings.Count(textLower, "<p") > 2 {
		return true
	}
	return false
}
|
||
|
|
||
|
// sanitize removes pipes and newlines so they don't break our output format.
// Both characters are mapped to spaces in one pass, then surrounding
// whitespace is trimmed.
func sanitize(input string) string {
	replacer := strings.NewReplacer("|", " ", "\n", " ")
	return strings.TrimSpace(replacer.Replace(input))
}
|