Search/favicon.go
partisan 5032173609
Some checks failed
Run Integration Tests / test (push) Failing after 41s
Added default globe.svg for invalid favicons
2025-05-30 23:14:49 +02:00

574 lines
14 KiB
Go

package main
import (
"bytes"
"crypto/md5"
"crypto/tls"
"encoding/base64"
"encoding/hex"
"fmt"
"image"
"image/gif"
"image/jpeg"
"image/png"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
"github.com/chai2010/webp"
"github.com/fyne-io/image/ico"
"golang.org/x/image/bmp"
"golang.org/x/image/draw"
"golang.org/x/image/tiff"
"golang.org/x/net/html"
)
// faviconCache remembers which cache IDs have already had a download
// requested. The embedded RWMutex guards the map.
var faviconCache = struct {
	sync.RWMutex
	m map[string]bool // cache ID -> download already requested
}{m: map[string]bool{}}

// commonFaviconPaths lists well-known favicon locations, in the order in
// which they are probed on a host.
var commonFaviconPaths = []string{
	"/favicon.ico",
	"/favicon.png",
	"/favicon.jpg",
	"/favicon.jpeg",
	"/favicon.webp",
	"/apple-touch-icon.png",
	"/apple-touch-icon-precomposed.png",
}

// iconLinkRegex captures the href of a <link> tag whose rel is "icon",
// "shortcut icon" or "apple-touch-icon". The rel attribute must appear
// before href for the pattern to match.
var iconLinkRegex = regexp.MustCompile(`<link[^>]+rel=["'](?:icon|shortcut icon|apple-touch-icon)["'][^>]+href=["']([^"']+)["']`)
// faviconDownloadQueue is the buffered channel (capacity 1000) through which
// favicon download requests are handed to the worker goroutines.
var faviconDownloadQueue = make(chan faviconDownloadRequest, 1000)
// faviconDownloadRequest is one unit of work sent through
// faviconDownloadQueue to the favicon download workers.
type faviconDownloadRequest struct {
// faviconURL is the icon location handed to cacheFavicon for download.
faviconURL string
// pageURL is the page the favicon was requested for.
pageURL string
// cacheID is the image ID handed to cacheFavicon; the cached file is
// named "<cacheID>_icon.webp".
cacheID string
}
// init launches the fixed pool of favicon download workers that consume
// faviconDownloadQueue.
func init() {
	const workerCount = 5
	for started := 0; started != workerCount; started++ {
		go faviconDownloadWorker()
	}
}
// faviconDownloadWorker receives requests from faviconDownloadQueue and
// caches each favicon in turn. It returns once the queue has been closed
// and drained.
func faviconDownloadWorker() {
	for {
		job, open := <-faviconDownloadQueue
		if !open {
			return
		}
		// The result is intentionally discarded; there is no caller to
		// report it to from the worker.
		_, _, _ = cacheFavicon(job.faviconURL, job.cacheID)
	}
}
// faviconIDFromURL derives a cache ID from rawURL: the lowercase hex
// encoding of the MD5 digest of the string's bytes.
func faviconIDFromURL(rawURL string) string {
	digest := md5.Sum([]byte(rawURL))
	return hex.EncodeToString(digest[:])
}
// resolveFaviconURL works out which favicon to fetch for pageURL and the
// cache ID that identifies it.
//
// Resolution order:
//  1. rawFavicon is a "data:image" URL: accepted only when it carries a
//     valid ";base64," payload; the ID is the MD5 of the decoded bytes.
//  2. rawFavicon starts with "http": used as-is.
//  3. A <link> icon found in the page's HTML.
//  4. The first entry of commonFaviconPaths that exists on the host
//     (always probed over https).
//  5. An icon advertised in the page's Link response headers.
//  6. Fallback: https://<host>/favicon.ico.
//
// It returns two empty strings when the data URL is invalid, or when
// pageURL cannot be parsed or has no host (nothing could be fetched from
// such a URL, so no network probing is attempted).
func resolveFaviconURL(rawFavicon, pageURL string) (faviconURL, cacheID string) {
	// Inline data URLs are identified by their content, not by the page.
	if strings.HasPrefix(rawFavicon, "data:image") {
		parts := strings.SplitN(rawFavicon, ";base64,", 2)
		if len(parts) == 2 {
			if data, err := base64.StdEncoding.DecodeString(parts[1]); err == nil {
				sum := md5.Sum(data)
				return rawFavicon, hex.EncodeToString(sum[:])
			}
		}
		return "", "" // Invalid data URL
	}

	// An explicit absolute favicon URL wins over any discovery.
	if strings.HasPrefix(rawFavicon, "http") {
		return rawFavicon, faviconIDFromURL(rawFavicon)
	}

	parsedPage, err := url.Parse(pageURL)
	if err != nil || parsedPage.Host == "" {
		// Without a host every probe below would target "https:///...".
		return "", ""
	}

	// Method 1: <link> tags in the page HTML.
	if favicon := findFaviconInHTML(pageURL); favicon != "" {
		if strings.HasPrefix(favicon, "http") {
			return favicon, faviconIDFromURL(favicon)
		}
		resolved := resolveRelativeURL(parsedPage, favicon)
		return resolved, faviconIDFromURL(resolved)
	}

	// Method 2: well-known locations on the host.
	for _, path := range commonFaviconPaths {
		testURL := "https://" + parsedPage.Host + path
		if checkURLExists(testURL) {
			return testURL, faviconIDFromURL(testURL)
		}
	}

	// Method 3: Link response headers.
	if headerIcon := findFaviconInHeaders(pageURL); headerIcon != "" {
		if strings.HasPrefix(headerIcon, "http") {
			return headerIcon, faviconIDFromURL(headerIcon)
		}
		resolved := resolveRelativeURL(parsedPage, headerIcon)
		return resolved, faviconIDFromURL(resolved)
	}

	// Fallback: let the downloader try the conventional location.
	fallbackURL := "https://" + parsedPage.Host + "/favicon.ico"
	return fallbackURL, faviconIDFromURL(fallbackURL)
}
// findFaviconInHeaders sends a HEAD request to pageURL and looks through the
// Link response headers for an entry whose rel parameter is "icon" or
// "shortcut icon" (quoted or unquoted, compared case-insensitively). It
// returns the target between the angle brackets of the first such entry, or
// "" when the request fails or nothing matches.
func findFaviconInHeaders(pageURL string) string {
	req, err := http.NewRequest(http.MethodHead, pageURL, nil)
	if err != nil {
		return ""
	}

	ua, uaErr := GetUserAgent("findFaviconInHeaders")
	if uaErr != nil {
		printWarn("Error getting User-Agent: %v", uaErr)
	}
	req.Header.Set("User-Agent", ua)

	httpClient := &http.Client{
		Timeout: 3 * time.Second, // a favicon lookup should not need longer
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	acceptedRels := [...]string{
		`rel="icon"`,
		`rel=icon`,
		`rel="shortcut icon"`,
		`rel=shortcut icon`,
	}

	// Each header value is treated as "<target>; param; param...".
	for _, linkValue := range resp.Header["Link"] {
		segments := strings.SplitN(linkValue, ";", 2)
		if len(segments) != 2 {
			continue // no parameters at all
		}

		target := strings.TrimSpace(segments[0])
		if !(strings.HasPrefix(target, "<") && strings.HasSuffix(target, ">")) {
			continue
		}
		target = target[1 : len(target)-1] // strip the angle brackets

		for _, param := range strings.Split(segments[1], ";") {
			param = strings.TrimSpace(param)
			for _, want := range acceptedRels {
				if strings.EqualFold(param, want) {
					return target
				}
			}
		}
	}
	return ""
}
// resolveRelativeURL turns a favicon reference found on a page into an
// absolute URL, using base as the page URL.
//
// Surrounding whitespace in relative is ignored. A reference that already
// has a scheme is returned unchanged. Protocol-relative ("//host/x"),
// root-relative ("/x") and document-relative ("x", "../x") references are
// resolved against base following RFC 3986, so "icon.png" on
// "https://example.com/a/b.html" becomes "https://example.com/a/icon.png".
// A reference that cannot be parsed as a URL yields "".
func resolveRelativeURL(base *url.URL, relative string) string {
	relative = strings.TrimSpace(relative)

	ref, err := url.Parse(relative)
	if err != nil {
		return ""
	}
	if ref.IsAbs() {
		// Keep absolute references byte-for-byte instead of re-encoding them.
		return relative
	}
	return base.ResolveReference(ref).String()
}
// checkURLExists reports whether a HEAD request to url succeeds with status
// 200 OK. Any failure to build or perform the request counts as "does not
// exist". TLS certificates are not verified.
func checkURLExists(url string) bool {
	client := &http.Client{
		Timeout: 5 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}

	req, err := http.NewRequest(http.MethodHead, url, nil)
	if err != nil {
		return false
	}

	userAgent, err := GetUserAgent("Text-Search-Brave")
	if err != nil {
		printWarn("Error getting User-Agent: %v", err)
	}
	// The value must go into the User-Agent header; it was previously stored
	// under a header named after this function, so probes were sent without
	// the intended User-Agent.
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return false
	}
	resp.Body.Close()

	return resp.StatusCode == http.StatusOK
}
// findFaviconInHTML downloads pageURL, parses it as HTML and returns the
// href of a favicon <link> element, or "" when the page cannot be fetched
// or parsed or declares no icon. The href is returned as written in the
// page (it may be relative).
//
// Recognised rel values are "icon", "shortcut icon", "apple-touch-icon" and
// "apple-touch-icon-precomposed". The rel value is lower-cased and its
// whitespace collapsed before comparison, so forms such as
// rel="Shortcut Icon" or rel=" icon " are recognised as well.
//
// Selection: the first recognised link is taken, and any later link whose
// rel is exactly "icon" or "apple-touch-icon" replaces it.
func findFaviconInHTML(pageURL string) string {
	client := &http.Client{
		Timeout: 10 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}

	req, err := http.NewRequest(http.MethodGet, pageURL, nil)
	if err != nil {
		return ""
	}

	userAgent, err := GetUserAgent("findFaviconInHTML")
	if err != nil {
		printWarn("Error getting User-Agent: %v", err)
	}
	req.Header.Set("User-Agent", userAgent)

	resp, err := client.Do(req)
	if err != nil {
		return ""
	}
	defer resp.Body.Close()

	// isAMP is set when any Link response header mentions rel="amphtml".
	isAMP := false
	for _, attr := range resp.Header["Link"] {
		if strings.Contains(attr, "rel=\"amphtml\"") {
			isAMP = true
			break
		}
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return ""
	}

	var faviconURL string
	var findLinks func(*html.Node)
	findLinks = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "link" {
			var rel, href string
			for _, attr := range n.Attr {
				switch attr.Key {
				case "rel":
					// Link types are case-insensitive; normalise case and
					// whitespace so the comparisons below are reliable.
					rel = strings.Join(strings.Fields(strings.ToLower(attr.Val)), " ")
				case "href":
					href = attr.Val
				}
			}

			if href != "" {
				switch rel {
				case "icon", "shortcut icon", "apple-touch-icon", "apple-touch-icon-precomposed":
					// When the AMP marker was seen, drop the query string
					// to get the non-versioned URL.
					if isAMP {
						if u, err := url.Parse(href); err == nil {
							u.RawQuery = ""
							href = u.String()
						}
					}

					if faviconURL == "" || // first recognised link
						rel == "apple-touch-icon" ||
						rel == "icon" {
						faviconURL = href
					}
				}
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			findLinks(c)
		}
	}
	findLinks(doc)

	return faviconURL
}
// getFaviconProxyURL returns the URL path under which the favicon for
// pageURL should be requested by the client.
//
// It returns the placeholder "/static/images/globe.svg" when pageURL is
// empty or no favicon URL can be resolved (in the latter case the cache ID
// is also recorded as invalid). Otherwise it returns
// "/image/<cacheID>_icon.webp", where cacheID is derived from pageURL. If
// the icon is not cached on disk yet, a download is queued at most once per
// cache ID; the send blocks while faviconDownloadQueue is full.
func getFaviconProxyURL(rawFavicon, pageURL string) string {
	const placeholder = "/static/images/globe.svg"

	if pageURL == "" {
		return placeholder
	}

	cacheID := faviconIDFromURL(pageURL)
	filename := fmt.Sprintf("%s_icon.webp", cacheID)
	proxyPath := "/image/" + filename

	cachedPath := filepath.Join(config.DriveCache.Path, "images", filename)
	if _, err := os.Stat(cachedPath); err == nil {
		return proxyPath
	}

	// A download was already requested for this ID: skip the (network
	// bound) favicon resolution entirely.
	faviconCache.RLock()
	queued := faviconCache.m[cacheID]
	faviconCache.RUnlock()
	if queued {
		return proxyPath
	}

	faviconURL, _ := resolveFaviconURL(rawFavicon, pageURL)
	if faviconURL == "" {
		recordInvalidImageID(cacheID)
		return placeholder
	}

	// Check and mark under a single write lock so that concurrent callers
	// cannot both decide to enqueue the same ID.
	faviconCache.Lock()
	alreadyQueued := faviconCache.m[cacheID]
	faviconCache.m[cacheID] = true
	faviconCache.Unlock()

	if !alreadyQueued {
		faviconDownloadQueue <- faviconDownloadRequest{
			faviconURL: faviconURL,
			pageURL:    pageURL,
			cacheID:    cacheID,
		}
	}

	return proxyPath
}
// cacheFavicon obtains the favicon at imageURL (an HTTP(S) URL or a data
// URL) and stores it in the image cache directory as "<imageID>_icon.webp".
//
// Raster images larger than 16x16 are scaled to exactly 16x16 and encoded
// as lossy WebP (quality 80). SVG content is written verbatim under the
// same ".webp" file name.
//
// It returns the path of the cached file, true and a nil error on success
// (including when the file was already cached). On failure it returns
// "", false and the error; except when the cache directory cannot be
// created, imageID is also reported through recordInvalidImageID.
func cacheFavicon(imageURL, imageID string) (string, bool, error) {
	// Debug
	fmt.Printf("Downloading favicon [%s] for ID [%s]\n", imageURL, imageID)

	filename := fmt.Sprintf("%s_icon.webp", imageID)
	imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
	if err := os.MkdirAll(imageCacheDir, 0755); err != nil {
		return "", false, fmt.Errorf("couldn't create images folder: %w", err)
	}

	cachedImagePath := filepath.Join(imageCacheDir, filename)
	tempImagePath := cachedImagePath + ".tmp"

	// Already cached?
	if _, err := os.Stat(cachedImagePath); err == nil {
		return cachedImagePath, true, nil
	}

	// Serialise work per source URL.
	cachingImagesMu.Lock()
	if _, exists := cachingImages[imageURL]; !exists {
		cachingImages[imageURL] = &sync.Mutex{}
	}
	mu := cachingImages[imageURL]
	cachingImagesMu.Unlock()

	mu.Lock()
	defer mu.Unlock()

	// Another goroutine may have finished the job while we waited.
	if _, err := os.Stat(cachedImagePath); err == nil {
		return cachedImagePath, true, nil
	}

	// Bound the number of concurrent downloads/conversions.
	cachingSemaphore <- struct{}{}
	defer func() { <-cachingSemaphore }()

	var data []byte
	var contentType string

	if strings.HasPrefix(imageURL, "data:") {
		// data:[<mediatype>][;param...][;base64],<payload>
		commaIndex := strings.Index(imageURL, ",")
		if commaIndex == -1 {
			recordInvalidImageID(imageID)
			return "", false, fmt.Errorf("invalid data URL: no comma")
		}
		headerPart := imageURL[:commaIndex]
		dataPart := imageURL[commaIndex+1:]

		mediaTypeParts := strings.SplitN(strings.TrimPrefix(headerPart, "data:"), ";", 2)
		mediaType := mediaTypeParts[0]
		base64Encoded := false
		if len(mediaTypeParts) > 1 {
			for _, param := range strings.Split(mediaTypeParts[1], ";") {
				if strings.TrimSpace(param) == "base64" {
					base64Encoded = true
				}
			}
		}

		if base64Encoded {
			decoded, err := base64.StdEncoding.DecodeString(dataPart)
			if err != nil {
				// A corrupt payload must not be cached: for SVG it would
				// otherwise be written to disk as-is and reported as success.
				recordInvalidImageID(imageID)
				return "", false, fmt.Errorf("invalid base64 data URL: %w", err)
			}
			data = decoded
		} else if decodedStr, err := url.QueryUnescape(dataPart); err == nil {
			data = []byte(decodedStr)
		} else {
			// Not valid percent-encoding: use the payload as written.
			data = []byte(dataPart)
		}

		contentType = mediaType
	} else {
		// Download from HTTP URL
		client := &http.Client{
			Timeout: 15 * time.Second,
			Transport: &http.Transport{
				TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
			},
		}

		req, err := http.NewRequest(http.MethodGet, imageURL, nil)
		if err != nil {
			recordInvalidImageID(imageID)
			return "", false, err
		}

		userAgent, err := GetUserAgent("Text-Search-Brave")
		if err != nil {
			printWarn("Error getting User-Agent: %v", err)
		}
		req.Header.Set("User-Agent", userAgent)

		resp, err := client.Do(req)
		if err != nil {
			recordInvalidImageID(imageID)
			return "", false, err
		}
		defer resp.Body.Close()

		data, err = io.ReadAll(resp.Body)
		if err != nil {
			recordInvalidImageID(imageID)
			return "", false, err
		}

		// The type is sniffed from the bytes; the response's Content-Type
		// header is not consulted.
		contentType = http.DetectContentType(data)
	}

	if !strings.HasPrefix(contentType, "image/") {
		recordInvalidImageID(imageID)
		return "", false, fmt.Errorf("URL did not return an image: %s", imageURL)
	}

	// SVG special case: stored as-is, no decoding or resizing.
	// NOTE(review): content sniffing does not appear to report
	// image/svg+xml, so this branch is probably reached only for data
	// URLs - confirm.
	if contentType == "image/svg+xml" {
		if err := os.WriteFile(tempImagePath, data, 0644); err != nil {
			recordInvalidImageID(imageID)
			return "", false, err
		}
		if err := os.Rename(tempImagePath, cachedImagePath); err != nil {
			os.Remove(tempImagePath) // best effort: do not leave the temp file behind
			recordInvalidImageID(imageID)
			return "", false, err
		}

		cachingImagesMu.Lock()
		delete(cachingImages, imageURL)
		cachingImagesMu.Unlock()
		return cachedImagePath, true, nil
	}

	// Decode image
	var img image.Image
	var err error
	switch contentType {
	case "image/x-icon", "image/vnd.microsoft.icon":
		img, err = ico.Decode(bytes.NewReader(data))
	case "image/jpeg":
		img, err = jpeg.Decode(bytes.NewReader(data))
	case "image/png":
		img, err = png.Decode(bytes.NewReader(data))
	case "image/gif":
		img, err = gif.Decode(bytes.NewReader(data))
	case "image/webp":
		img, err = webp.Decode(bytes.NewReader(data))
	case "image/bmp":
		img, err = bmp.Decode(bytes.NewReader(data))
	case "image/tiff":
		img, err = tiff.Decode(bytes.NewReader(data))
	default:
		recordInvalidImageID(imageID)
		return "", false, fmt.Errorf("unsupported image type: %s", contentType)
	}
	if err != nil {
		recordInvalidImageID(imageID)
		return "", false, err
	}

	// Resize: anything larger than 16 pixels in either dimension is scaled
	// to a 16x16 square (aspect ratio is not preserved).
	maxSize := 16
	width := img.Bounds().Dx()
	height := img.Bounds().Dy()
	if width > maxSize || height > maxSize {
		dst := image.NewRGBA(image.Rect(0, 0, maxSize, maxSize))
		draw.ApproxBiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Over, nil)
		img = dst
	}

	// Save as WebP via a temp file so the final path never holds a
	// partially written image.
	outFile, err := os.Create(tempImagePath)
	if err != nil {
		recordInvalidImageID(imageID)
		return "", false, err
	}

	options := &webp.Options{Lossless: false, Quality: 80}
	writeErr := webp.Encode(outFile, img, options)
	// Close before renaming: renaming an open file fails on some platforms,
	// and a close error means the data may not have been written.
	if closeErr := outFile.Close(); writeErr == nil {
		writeErr = closeErr
	}
	if writeErr != nil {
		os.Remove(tempImagePath) // best effort cleanup
		recordInvalidImageID(imageID)
		return "", false, writeErr
	}

	if err := os.Rename(tempImagePath, cachedImagePath); err != nil {
		os.Remove(tempImagePath) // best effort cleanup
		recordInvalidImageID(imageID)
		return "", false, err
	}

	cachingImagesMu.Lock()
	delete(cachingImages, imageURL)
	cachingImagesMu.Unlock()

	return cachedImagePath, true, nil
}