574 lines
14 KiB
Go
574 lines
14 KiB
Go
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"crypto/md5"
|
|
"crypto/tls"
|
|
"encoding/base64"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"image"
|
|
"image/gif"
|
|
"image/jpeg"
|
|
"image/png"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/chai2010/webp"
|
|
"github.com/fyne-io/image/ico"
|
|
"golang.org/x/image/bmp"
|
|
"golang.org/x/image/draw"
|
|
"golang.org/x/image/tiff"
|
|
"golang.org/x/net/html"
|
|
)
|
|
|
|
// faviconCache remembers, keyed by cache ID, which favicons have already been
// handed to the download queue. The embedded RWMutex guards the map.
var faviconCache = struct {
	sync.RWMutex
	m map[string]bool // cache ID -> download in progress
}{
	m: map[string]bool{},
}

// commonFaviconPaths lists the well-known locations probed, in order, when a
// page does not declare its favicon explicitly.
var commonFaviconPaths = []string{
	"/favicon.ico",
	"/favicon.png",
	"/favicon.jpg",
	"/favicon.jpeg",
	"/favicon.webp",
	"/apple-touch-icon.png",
	"/apple-touch-icon-precomposed.png",
}

// iconLinkRegex captures the href of a <link> tag whose quoted rel value is
// icon, shortcut icon or apple-touch-icon. It only matches when rel appears
// before href inside the tag.
var iconLinkRegex = regexp.MustCompile(`<link[^>]+rel=["'](?:icon|shortcut icon|apple-touch-icon)["'][^>]+href=["']([^"']+)["']`)
|
|
|
|
// faviconDownloadRequest is one unit of work for the background favicon
// downloaders.
type faviconDownloadRequest struct {
	faviconURL string // location (or data URL) of the icon to fetch
	pageURL    string // page the icon belongs to
	cacheID    string // ID the cached file is stored under
}

// faviconDownloadQueue feeds pending requests to the download workers. Its
// buffer holds up to 1000 requests; a send blocks once the buffer is full.
var faviconDownloadQueue = make(chan faviconDownloadRequest, 1000)
|
|
|
|
func init() {
|
|
// Start 5 worker goroutines to process favicon downloads
|
|
for i := 0; i < 5; i++ {
|
|
go faviconDownloadWorker()
|
|
}
|
|
}
|
|
|
|
func faviconDownloadWorker() {
|
|
for req := range faviconDownloadQueue {
|
|
cacheFavicon(req.faviconURL, req.cacheID)
|
|
}
|
|
}
|
|
|
|
// faviconIDFromURL returns the cache ID for rawURL: the lowercase hexadecimal
// MD5 digest of the string's bytes. MD5 serves only as a compact, stable key
// here, not as a security measure.
func faviconIDFromURL(rawURL string) string {
	digest := md5.Sum([]byte(rawURL))
	return hex.EncodeToString(digest[:])
}
|
|
|
|
// Resolves favicon URL using multiple methods
|
|
func resolveFaviconURL(rawFavicon, pageURL string) (faviconURL, cacheID string) {
|
|
cacheID = faviconIDFromURL(pageURL)
|
|
|
|
// Handle data URLs first
|
|
if strings.HasPrefix(rawFavicon, "data:image") {
|
|
parts := strings.SplitN(rawFavicon, ";base64,", 2)
|
|
if len(parts) == 2 {
|
|
data, err := base64.StdEncoding.DecodeString(parts[1])
|
|
if err == nil {
|
|
hasher := md5.New()
|
|
hasher.Write(data)
|
|
return rawFavicon, hex.EncodeToString(hasher.Sum(nil))
|
|
}
|
|
}
|
|
return "", "" // Invalid data URL
|
|
}
|
|
|
|
// Existing URL handling logic
|
|
if rawFavicon != "" && strings.HasPrefix(rawFavicon, "http") {
|
|
cacheID = faviconIDFromURL(rawFavicon)
|
|
return rawFavicon, cacheID
|
|
}
|
|
|
|
parsedPage, err := url.Parse(pageURL)
|
|
if err != nil {
|
|
return "", ""
|
|
}
|
|
|
|
// Method 1: Parse HTML
|
|
if favicon := findFaviconInHTML(pageURL); favicon != "" {
|
|
if strings.HasPrefix(favicon, "http") {
|
|
return favicon, faviconIDFromURL(favicon)
|
|
}
|
|
resolved := resolveRelativeURL(parsedPage, favicon)
|
|
return resolved, faviconIDFromURL(resolved)
|
|
}
|
|
|
|
// Method 2: Common paths
|
|
for _, path := range commonFaviconPaths {
|
|
testURL := "https://" + parsedPage.Host + path
|
|
if checkURLExists(testURL) {
|
|
return testURL, faviconIDFromURL(testURL)
|
|
}
|
|
}
|
|
|
|
// Method 3: HTTP headers
|
|
if headerIcon := findFaviconInHeaders(pageURL); headerIcon != "" {
|
|
if strings.HasPrefix(headerIcon, "http") {
|
|
return headerIcon, faviconIDFromURL(headerIcon)
|
|
}
|
|
resolved := resolveRelativeURL(parsedPage, headerIcon)
|
|
return resolved, faviconIDFromURL(resolved)
|
|
}
|
|
|
|
// Fallback
|
|
fallbackURL := "https://" + parsedPage.Host + "/favicon.ico"
|
|
return fallbackURL, faviconIDFromURL(fallbackURL)
|
|
}
|
|
|
|
// Checks HTTP headers for favicon links
|
|
func findFaviconInHeaders(pageURL string) string {
|
|
client := &http.Client{
|
|
Timeout: 3 * time.Second, // like 3 seconds for favicon should be enough
|
|
Transport: &http.Transport{
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
},
|
|
}
|
|
|
|
req, err := http.NewRequest("HEAD", pageURL, nil)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
// Add User-Agent
|
|
userAgent, err := GetUserAgent("findFaviconInHeaders")
|
|
if err != nil {
|
|
printWarn("Error getting User-Agent: %v", err)
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Check Link headers (common for favicons)
|
|
if links, ok := resp.Header["Link"]; ok {
|
|
for _, link := range links {
|
|
parts := strings.Split(link, ";")
|
|
if len(parts) < 2 {
|
|
continue
|
|
}
|
|
|
|
urlPart := strings.TrimSpace(parts[0])
|
|
if !strings.HasPrefix(urlPart, "<") || !strings.HasSuffix(urlPart, ">") {
|
|
continue
|
|
}
|
|
|
|
urlPart = urlPart[1 : len(urlPart)-1] // Remove < and >
|
|
for _, part := range parts[1:] {
|
|
part = strings.TrimSpace(part)
|
|
if strings.EqualFold(part, `rel="icon"`) ||
|
|
strings.EqualFold(part, `rel=icon`) ||
|
|
strings.EqualFold(part, `rel="shortcut icon"`) ||
|
|
strings.EqualFold(part, `rel=shortcut icon`) {
|
|
return urlPart
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
// resolveRelativeURL turns relative, a URL reference found on the page at
// base, into an absolute URL string.
//
// References that already start with "http://" or "https://" are returned
// unchanged. Everything else (protocol-relative "//host/x", root-relative
// "/x", document-relative "x" or "../x", and other absolute schemes such as
// "data:") is resolved against base with the standard RFC 3986 rules, so a
// document-relative reference replaces the last path segment of base rather
// than being appended to it.
//
// It returns "" when relative cannot be parsed as a URL reference.
func resolveRelativeURL(base *url.URL, relative string) string {
	if strings.HasPrefix(relative, "http://") || strings.HasPrefix(relative, "https://") {
		return relative
	}

	ref, err := url.Parse(relative)
	if err != nil {
		return ""
	}
	return base.ResolveReference(ref).String()
}
|
|
|
|
// Checks if a URL exists (returns 200 OK)
|
|
func checkURLExists(url string) bool {
|
|
client := &http.Client{
|
|
Timeout: 5 * time.Second,
|
|
Transport: &http.Transport{
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
},
|
|
}
|
|
req, err := http.NewRequest("HEAD", url, nil)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
// Add User-Agent
|
|
userAgent, err := GetUserAgent("Text-Search-Brave")
|
|
if err != nil {
|
|
printWarn("Error getting User-Agent: %v", err)
|
|
}
|
|
req.Header.Set("checkURLExists", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
resp.Body.Close()
|
|
return resp.StatusCode == http.StatusOK
|
|
}
|
|
|
|
// Fetches HTML and looks for favicon links
|
|
func findFaviconInHTML(pageURL string) string {
|
|
client := &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
Transport: &http.Transport{
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
},
|
|
}
|
|
|
|
req, err := http.NewRequest("GET", pageURL, nil)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
// Add User-Agent
|
|
userAgent, err := GetUserAgent("findFaviconInHTML")
|
|
if err != nil {
|
|
printWarn("Error getting User-Agent: %v", err)
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Check if this is an AMP page
|
|
isAMP := false
|
|
for _, attr := range resp.Header["Link"] {
|
|
if strings.Contains(attr, "rel=\"amphtml\"") {
|
|
isAMP = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// Parse HTML
|
|
doc, err := html.Parse(resp.Body)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
|
|
var faviconURL string
|
|
var findLinks func(*html.Node)
|
|
findLinks = func(n *html.Node) {
|
|
if n.Type == html.ElementNode && n.Data == "link" {
|
|
var rel, href string
|
|
for _, attr := range n.Attr {
|
|
switch attr.Key {
|
|
case "rel":
|
|
rel = attr.Val
|
|
case "href":
|
|
href = attr.Val
|
|
}
|
|
}
|
|
|
|
// Prioritize different favicon types
|
|
if href != "" {
|
|
switch rel {
|
|
case "icon", "shortcut icon", "apple-touch-icon", "apple-touch-icon-precomposed":
|
|
// For AMP pages, prefer the non-versioned URL if possible
|
|
if isAMP {
|
|
if u, err := url.Parse(href); err == nil {
|
|
u.RawQuery = "" // Remove query parameters
|
|
href = u.String()
|
|
}
|
|
}
|
|
if faviconURL == "" || // First found
|
|
rel == "apple-touch-icon" || // Prefer apple-touch-icon
|
|
rel == "icon" { // Then regular icon
|
|
faviconURL = href
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
findLinks(c)
|
|
}
|
|
}
|
|
findLinks(doc)
|
|
|
|
return faviconURL
|
|
}
|
|
|
|
func getFaviconProxyURL(rawFavicon, pageURL string) string {
|
|
if pageURL == "" {
|
|
return "/static/images/globe.svg"
|
|
}
|
|
|
|
cacheID := faviconIDFromURL(pageURL)
|
|
filename := fmt.Sprintf("%s_icon.webp", cacheID)
|
|
cachedPath := filepath.Join(config.DriveCache.Path, "images", filename)
|
|
|
|
if _, err := os.Stat(cachedPath); err == nil {
|
|
return fmt.Sprintf("/image/%s_icon.webp", cacheID)
|
|
}
|
|
|
|
// Resolve URL
|
|
faviconURL, _ := resolveFaviconURL(rawFavicon, pageURL)
|
|
if faviconURL == "" {
|
|
recordInvalidImageID(cacheID)
|
|
return "/static/images/globe.svg"
|
|
}
|
|
|
|
// Check if already downloading
|
|
faviconCache.RLock()
|
|
downloading := faviconCache.m[cacheID]
|
|
faviconCache.RUnlock()
|
|
|
|
if !downloading {
|
|
faviconCache.Lock()
|
|
faviconCache.m[cacheID] = true
|
|
faviconCache.Unlock()
|
|
|
|
// Send to download queue instead of starting goroutine
|
|
faviconDownloadQueue <- faviconDownloadRequest{
|
|
faviconURL: faviconURL,
|
|
pageURL: pageURL,
|
|
cacheID: cacheID,
|
|
}
|
|
}
|
|
|
|
return fmt.Sprintf("/image/%s_icon.webp", cacheID)
|
|
}
|
|
|
|
// Caches favicon, always saving *_icon.webp
|
|
func cacheFavicon(imageURL, imageID string) (string, bool, error) {
|
|
// if imageURL == "" {
|
|
// recordInvalidImageID(imageID)
|
|
// return "", false, fmt.Errorf("empty image URL for image ID %s", imageID)
|
|
// }
|
|
|
|
// Debug
|
|
fmt.Printf("Downloading favicon [%s] for ID [%s]\n", imageURL, imageID)
|
|
|
|
filename := fmt.Sprintf("%s_icon.webp", imageID)
|
|
imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
|
|
if err := os.MkdirAll(imageCacheDir, 0755); err != nil {
|
|
return "", false, fmt.Errorf("couldn't create images folder: %v", err)
|
|
}
|
|
cachedImagePath := filepath.Join(imageCacheDir, filename)
|
|
tempImagePath := cachedImagePath + ".tmp"
|
|
|
|
// Already cached?
|
|
if _, err := os.Stat(cachedImagePath); err == nil {
|
|
return cachedImagePath, true, nil
|
|
}
|
|
|
|
cachingImagesMu.Lock()
|
|
if _, exists := cachingImages[imageURL]; !exists {
|
|
cachingImages[imageURL] = &sync.Mutex{}
|
|
}
|
|
mu := cachingImages[imageURL]
|
|
cachingImagesMu.Unlock()
|
|
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
|
|
// Recheck after lock
|
|
if _, err := os.Stat(cachedImagePath); err == nil {
|
|
return cachedImagePath, true, nil
|
|
}
|
|
|
|
cachingSemaphore <- struct{}{}
|
|
defer func() { <-cachingSemaphore }()
|
|
|
|
var data []byte
|
|
var contentType string
|
|
|
|
// Handle data URLs
|
|
if strings.HasPrefix(imageURL, "data:") {
|
|
commaIndex := strings.Index(imageURL, ",")
|
|
if commaIndex == -1 {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, fmt.Errorf("invalid data URL: no comma")
|
|
}
|
|
headerPart := imageURL[:commaIndex]
|
|
dataPart := imageURL[commaIndex+1:]
|
|
|
|
mediaType := "text/plain"
|
|
base64Encoded := false
|
|
if strings.HasPrefix(headerPart, "data:") {
|
|
mediaTypePart := headerPart[5:]
|
|
mediaTypeParts := strings.SplitN(mediaTypePart, ";", 2)
|
|
mediaType = mediaTypeParts[0]
|
|
if len(mediaTypeParts) > 1 {
|
|
for _, param := range strings.Split(mediaTypeParts[1], ";") {
|
|
param = strings.TrimSpace(param)
|
|
if param == "base64" {
|
|
base64Encoded = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if base64Encoded {
|
|
data, _ = base64.StdEncoding.DecodeString(dataPart)
|
|
} else {
|
|
decodedStr, err := url.QueryUnescape(dataPart)
|
|
if err != nil {
|
|
data = []byte(dataPart)
|
|
} else {
|
|
data = []byte(decodedStr)
|
|
}
|
|
}
|
|
|
|
contentType = mediaType
|
|
} else {
|
|
// Download from HTTP URL
|
|
client := &http.Client{
|
|
Timeout: 15 * time.Second,
|
|
Transport: &http.Transport{
|
|
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
},
|
|
}
|
|
|
|
req, err := http.NewRequest("GET", imageURL, nil)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
|
|
// Add User-Agent
|
|
userAgent, err := GetUserAgent("Text-Search-Brave")
|
|
if err != nil {
|
|
printWarn("Error getting User-Agent: %v", err)
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
data, err = io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
|
|
contentType = http.DetectContentType(data)
|
|
}
|
|
|
|
if !strings.HasPrefix(contentType, "image/") {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, fmt.Errorf("URL did not return an image: %s", imageURL)
|
|
}
|
|
|
|
// SVG special case
|
|
if contentType == "image/svg+xml" {
|
|
err := os.WriteFile(tempImagePath, data, 0644)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
err = os.Rename(tempImagePath, cachedImagePath)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
cachingImagesMu.Lock()
|
|
delete(cachingImages, imageURL)
|
|
cachingImagesMu.Unlock()
|
|
return cachedImagePath, true, nil
|
|
}
|
|
|
|
// Decode image
|
|
var img image.Image
|
|
var err error
|
|
switch contentType {
|
|
case "image/x-icon", "image/vnd.microsoft.icon":
|
|
img, err = ico.Decode(bytes.NewReader(data))
|
|
case "image/jpeg":
|
|
img, err = jpeg.Decode(bytes.NewReader(data))
|
|
case "image/png":
|
|
img, err = png.Decode(bytes.NewReader(data))
|
|
case "image/gif":
|
|
img, err = gif.Decode(bytes.NewReader(data))
|
|
case "image/webp":
|
|
img, err = webp.Decode(bytes.NewReader(data))
|
|
case "image/bmp":
|
|
img, err = bmp.Decode(bytes.NewReader(data))
|
|
case "image/tiff":
|
|
img, err = tiff.Decode(bytes.NewReader(data))
|
|
default:
|
|
recordInvalidImageID(imageID)
|
|
return "", false, fmt.Errorf("unsupported image type: %s", contentType)
|
|
}
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
|
|
// Resize
|
|
maxSize := 16
|
|
width := img.Bounds().Dx()
|
|
height := img.Bounds().Dy()
|
|
|
|
if width > maxSize || height > maxSize {
|
|
dst := image.NewRGBA(image.Rect(0, 0, maxSize, maxSize))
|
|
draw.ApproxBiLinear.Scale(dst, dst.Bounds(), img, img.Bounds(), draw.Over, nil)
|
|
img = dst
|
|
}
|
|
|
|
// Save as WebP
|
|
outFile, err := os.Create(tempImagePath)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
defer outFile.Close()
|
|
|
|
options := &webp.Options{Lossless: false, Quality: 80}
|
|
err = webp.Encode(outFile, img, options)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
|
|
err = os.Rename(tempImagePath, cachedImagePath)
|
|
if err != nil {
|
|
recordInvalidImageID(imageID)
|
|
return "", false, err
|
|
}
|
|
|
|
cachingImagesMu.Lock()
|
|
delete(cachingImages, imageURL)
|
|
cachingImagesMu.Unlock()
|
|
|
|
return cachedImagePath, true, nil
|
|
}
|