package main

import (
	"bytes"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"
)

// pageMetaTags holds the raw metadata candidates found while walking a parsed
// HTML document. All values are whitespace-trimmed; a field is empty when the
// corresponding tag was absent or had no content.
type pageMetaTags struct {
	title, desc, keywords string
	ogTitle, ogDesc       string
	twTitle, twDesc       string
}

// fetchPageMetadata downloads pageURL and extracts its title, description and
// keywords. Sources are tried in this order:
//
//  1. <title>, <meta name="description"/>, <meta name="keywords"/>
//  2. <meta property="og:title">, <meta property="og:description">
//  3. <meta name="twitter:title">, <meta name="twitter:description">
//  4. go-readability fallback (if title or description is still missing)
//  5. A basic heuristic that discards "wrong" content (e.g. raw HTML or
//     "readability-page-1").
//
// The returned values are passed through sanitize. If the title or the
// description is still empty after all of that, or if the request, the read or
// the parse fails, it returns ("", "", "") so the caller can skip saving the
// page. Failures are reported through printDebug only.
func fetchPageMetadata(pageURL string) (string, string, string) {
	// Upper bound on how much of the response is buffered. The body is kept in
	// memory so that both the HTML parser and go-readability can read it
	// without a second HTTP request; anything beyond the cap is ignored.
	const maxBodyBytes = 10 << 20

	// Maximum number of runes used when the description is derived from the
	// article text.
	const snippetRunes = 200

	userAgent, err := GetUserAgent("crawler")
	if err != nil {
		printDebug("Failed to generate User-Agent: %v", err)
		return "", "", ""
	}

	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		printDebug("Failed to create request for %s: %v", pageURL, err)
		return "", "", ""
	}

	// Force English content when possible.
	req.Header.Set("User-Agent", userAgent)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := client.Do(req)
	if err != nil {
		printDebug("Failed to GET %s: %v", pageURL, err)
		return "", "", ""
	}
	defer resp.Body.Close()

	// Skip non-2xx.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
		return "", "", ""
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		printDebug("Failed to read body of %s: %v", pageURL, err)
		return "", "", ""
	}

	// First pass: standard HTML parse.
	doc, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		printDebug("HTML parse error for %s: %v", pageURL, err)
		return "", "", ""
	}

	tags := collectPageMetaTags(doc)
	title, desc, keywords := tags.title, tags.desc, tags.keywords

	// Fall back to Open Graph, then Twitter, when the standard tag is missing
	// or empty.
	if title == "" {
		switch {
		case tags.ogTitle != "":
			title = tags.ogTitle
		case tags.twTitle != "":
			title = tags.twTitle
		}
	}
	if desc == "" {
		switch {
		case tags.ogDesc != "":
			desc = tags.ogDesc
		case tags.twDesc != "":
			desc = tags.twDesc
		}
	}

	// If still missing title or description, fall back to go-readability,
	// re-reading the already buffered body.
	if title == "" || desc == "" {
		parsedURL, parseErr := url.Parse(pageURL)
		if parseErr != nil {
			printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
			// We must skip if we can't parse the URL for readability.
			return "", "", ""
		}

		article, rdErr := readability.FromReader(bytes.NewReader(body), parsedURL)
		if rdErr != nil {
			printDebug("go-readability failed for %s: %v", pageURL, rdErr)
		} else {
			if title == "" {
				title = strings.TrimSpace(article.Title)
			}
			if desc == "" {
				desc = strings.TrimSpace(article.Excerpt)
			}
			if desc == "" {
				// No excerpt: use the start of the article's plain text.
				// Whitespace runs are collapsed so the snippet is a single line.
				text := strings.Join(strings.Fields(article.TextContent), " ")
				desc = truncateRunes(text, snippetRunes)
			}
		}
	}

	// Heuristic: discard obviously incorrect HTML-y strings or placeholders.
	if looksLikeRawHTML(title) {
		title = ""
	}
	if looksLikeRawHTML(desc) {
		desc = ""
	}

	// Sanitize before the final check: a value made only of pipes or newlines
	// becomes empty and must be treated as missing.
	title, desc = sanitize(title), sanitize(desc)
	if title == "" || desc == "" {
		return "", "", ""
	}
	return title, desc, sanitize(keywords)
}

// collectPageMetaTags walks the parsed document and gathers the title and the
// description/keywords/Open Graph/Twitter meta values it finds.
func collectPageMetaTags(doc *html.Node) pageMetaTags {
	var tags pageMetaTags

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode {
			switch strings.ToLower(n.Data) {
			case "title":
				// Keep only the first non-empty HTML <title>. The namespace
				// check skips <title> elements of embedded SVG, which would
				// otherwise overwrite the page title.
				if n.Namespace == "" && tags.title == "" && n.FirstChild != nil {
					tags.title = strings.TrimSpace(n.FirstChild.Data)
				}
			case "meta":
				var metaName, metaProperty, contentVal string
				for _, attr := range n.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						metaName = strings.ToLower(attr.Val)
					case "property":
						metaProperty = strings.ToLower(attr.Val)
					case "content":
						contentVal = strings.TrimSpace(attr.Val)
					}
				}

				// An empty content attribute carries no information and must
				// not clobber a value found earlier.
				if contentVal != "" {
					// Standard and Twitter meta tags.
					switch metaName {
					case "description":
						tags.desc = contentVal
					case "keywords":
						tags.keywords = contentVal
					case "twitter:title":
						tags.twTitle = contentVal
					case "twitter:description":
						tags.twDesc = contentVal
					}

					// Open Graph tags.
					switch metaProperty {
					case "og:title":
						tags.ogTitle = contentVal
					case "og:description":
						tags.ogDesc = contentVal
					}
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	return tags
}

// truncateRunes returns s unchanged when it holds at most limit runes;
// otherwise it returns the first limit runes followed by "...". Cutting on
// rune boundaries keeps the result valid UTF-8.
func truncateRunes(s string, limit int) string {
	count := 0
	for i := range s {
		if count == limit {
			return s[:i] + "..."
		}
		count++
	}
	return s
}

// looksLikeRawHTML is a simple heuristic to check for leftover HTML or
// go-readability noise (e.g., "readability-page-1"). It reports true when the
// text contains "readability-page", any "<div", or more than two "<p"
// occurrences (case-insensitive).
func looksLikeRawHTML(text string) bool {
	textLower := strings.ToLower(text)
	switch {
	case strings.Contains(textLower, "readability-page"):
		return true
	case strings.Contains(textLower, "<div"):
		return true
	case strings.Count(textLower, "<p") > 2:
		return true
	}
	return false
}

// sanitize replaces pipes, newlines and carriage returns with spaces so they
// don't break our output format, then trims surrounding whitespace.
func sanitize(input string) string {
	cleaned := strings.NewReplacer("|", " ", "\n", " ", "\r", " ").Replace(input)
	return strings.TrimSpace(cleaned)
}