From 0019202571226c888b0abeae2572facb19007eab Mon Sep 17 00:00:00 2001
From: partisan
Date: Thu, 29 Aug 2024 17:20:47 +0200
Subject: [PATCH] images from deviantart

---
Review notes (ignored by git am, not part of the commit message):
- NextPageCache now enforces its expiration; entries carry a timestamp.
- The next page link is stored under the key of page+1, which is the key the
  following request looks up. Before, every request with page > 1 failed with
  "next page link not found in cache".
- The premium check passed an XPath expression to goquery's CSS-only Find and
  never matched; it now uses explicit traversal.
- Requests share one http.Client with a timeout, errors are wrapped with %w,
  and results keep the order of the page.
- Line breaks and indentation were restored from a whitespace-collapsed copy
  (gofmt tabs assumed). The blob id on the "index" line of
  images-deviantart.go predates these changes.

 images-deviantart.go | 277 +++++++++++++++++++++++++++++++++++++++++++
 images-quant.go      |   2 +-
 images.go            |   1 +
 3 files changed, 279 insertions(+), 1 deletion(-)
 create mode 100644 images-deviantart.go

diff --git a/images-deviantart.go b/images-deviantart.go
new file mode 100644
index 0000000..cbeeea4
--- /dev/null
+++ b/images-deviantart.go
@@ -0,0 +1,277 @@
+package main
+
+import (
+	"fmt"
+	"net/http"
+	"net/url"
+	"strings"
+	"sync"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// deviantArtHTTPClient is shared by the search request and the image checks.
+// The timeout keeps a stalled connection from blocking a search forever.
+var deviantArtHTTPClient = &http.Client{Timeout: 10 * time.Second}
+
+// nextPageEntry is a cached next page link and the time it was stored.
+type nextPageEntry struct {
+	link     string
+	storedAt time.Time
+}
+
+// NextPageCache is a specialized cache for storing next page links.
+// Entries older than the expiration are treated as missing.
+type NextPageCache struct {
+	mu         sync.Mutex
+	links      map[string]nextPageEntry
+	expiration time.Duration
+}
+
+// NewNextPageCache creates a new NextPageCache with a specified expiration
+// duration. An expiration of zero or less disables expiry.
+func NewNextPageCache(expiration time.Duration) *NextPageCache {
+	return &NextPageCache{
+		links:      make(map[string]nextPageEntry),
+		expiration: expiration,
+	}
+}
+
+// Get retrieves the next page link for a given key from the cache. It
+// reports false when the key is unknown or its entry has expired; expired
+// entries are deleted on access.
+func (npc *NextPageCache) Get(key CacheKey) (string, bool) {
+	npc.mu.Lock()
+	defer npc.mu.Unlock()
+
+	k := npc.keyToString(key)
+	entry, exists := npc.links[k]
+	if !exists {
+		return "", false
+	}
+	if npc.expiration > 0 && time.Since(entry.storedAt) > npc.expiration {
+		delete(npc.links, k)
+		return "", false
+	}
+
+	return entry.link, true
+}
+
+// Set stores the next page link for a given key in the cache, stamped with
+// the current time. It may be worth reusing "cache.go" for this.
+func (npc *NextPageCache) Set(key CacheKey, link string) {
+	npc.mu.Lock()
+	defer npc.mu.Unlock()
+
+	npc.links[npc.keyToString(key)] = nextPageEntry{link: link, storedAt: time.Now()}
+}
+
+// keyToString converts a CacheKey to a string representation.
+func (npc *NextPageCache) keyToString(key CacheKey) string {
+	return fmt.Sprintf("%s|%d|%t|%s|%s", key.Query, key.Page, key.Safe, key.Lang, key.Type)
+}
+
+var (
+	nextPageCache = NewNextPageCache(6 * time.Hour) // Cache with 6-hour expiration
+)
+
+// deviantArtCandidate is a search hit waiting for its image URL to be checked.
+type deviantArtCandidate struct {
+	title     string
+	imgSrc    string
+	resultURL string
+}
+
+// PerformDeviantArtImageSearch performs a search on DeviantArt and returns a
+// list of image results in page order together with the elapsed time.
+//
+// Page 1 is requested by query. Later pages are reached through the next page
+// link scraped from the preceding page, so they can only be served while that
+// link is still cached; otherwise an error is returned. An error is also
+// returned when the request fails, the response is not 200 OK, or no
+// accessible image is found.
+func PerformDeviantArtImageSearch(query, safe, lang string, page int) ([]ImageSearchResult, time.Duration, error) {
+	startTime := time.Now()
+
+	cacheKey := CacheKey{
+		Query: query,
+		Page:  page,
+		Safe:  safe == "active",
+		Lang:  lang,
+		Type:  "deviantart",
+	}
+
+	// Pages after the first are only reachable through a cached next page link
+	searchURL := buildDeviantArtSearchURL(query, page)
+	if page > 1 {
+		nextPageLink, found := nextPageCache.Get(cacheKey)
+		if !found {
+			return nil, 0, fmt.Errorf("next page link not found in cache")
+		}
+		searchURL = nextPageLink
+	}
+
+	// Get the User-Agent string
+	userAgent, err := GetUserAgent("Image-Search-DeviantArt")
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Make the HTTP request with User-Agent header
+	req, err := http.NewRequest("GET", searchURL, nil)
+	if err != nil {
+		return nil, 0, fmt.Errorf("creating request: %w", err)
+	}
+	req.Header.Set("User-Agent", userAgent)
+
+	resp, err := deviantArtHTTPClient.Do(req)
+	if err != nil {
+		return nil, 0, fmt.Errorf("making request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, 0, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+	}
+
+	// Parse the HTML document
+	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	if err != nil {
+		return nil, 0, fmt.Errorf("loading HTML document: %w", err)
+	}
+
+	// Extract data using goquery
+	var candidates []deviantArtCandidate
+	doc.Find("div._2pZkk div div a").Each(func(_ int, s *goquery.Selection) {
+		// Skip images that are blurred (premium content). goquery selectors are
+		// CSS, not XPath, so "../div/div/div" is walked with explicit traversal.
+		premium := s.Parent().ChildrenFiltered("div").ChildrenFiltered("div").ChildrenFiltered("div")
+		if strings.Contains(premium.Text(), "Watch the artist to view this deviation") {
+			return
+		}
+
+		// Extract image source, fallback on data-src if necessary
+		img := s.Find("div img")
+		imgSrc, exists := img.Attr("srcset")
+		if !exists {
+			imgSrc, exists = img.Attr("data-src")
+		}
+		if !exists || imgSrc == "" {
+			return
+		}
+		// Keep the first space-separated token and cut the path at "/v1"
+		imgSrc = strings.Split(imgSrc, " ")[0]
+		if parsedURL, err := url.Parse(imgSrc); err == nil {
+			parsedURL.Path = strings.Split(parsedURL.Path, "/v1")[0]
+			imgSrc = parsedURL.String()
+		}
+
+		// Extract URL and title
+		resultURL := s.AttrOr("href", "")
+		title := s.AttrOr("aria-label", "")
+
+		// Only proceed if title, URL, and img_src are not empty
+		if title == "" || resultURL == "" || imgSrc == "" {
+			return
+		}
+		candidates = append(candidates, deviantArtCandidate{title: title, imgSrc: imgSrc, resultURL: resultURL})
+	})
+
+	// Verify the image URLs concurrently. Every goroutine writes only its own
+	// slot, so no locking is needed and the page order is preserved.
+	accessible := make([]bool, len(candidates))
+	var wg sync.WaitGroup
+	for i, c := range candidates {
+		wg.Add(1)
+		go func(i int, c deviantArtCandidate) {
+			defer wg.Done()
+			accessible[i] = isValidImageURL(c.imgSrc, userAgent, c.resultURL)
+		}(i, c)
+	}
+	wg.Wait()
+
+	// Collect the results whose image could be fetched
+	var results []ImageSearchResult
+	for i, c := range candidates {
+		if !accessible[i] {
+			continue
+		}
+		results = append(results, ImageSearchResult{
+			Title:      strings.TrimSpace(c.title),
+			Media:      c.imgSrc,
+			Width:      0,
+			Height:     0,
+			Source:     c.resultURL,
+			ThumbProxy: c.imgSrc,
+		})
+	}
+
+	// Cache the next page link, if any, under the key of the page it leads to;
+	// that is the key the request for the following page looks up. A relative
+	// href is resolved against the URL that served this page.
+	if href := doc.Find("a._1OGeq").Last().AttrOr("href", ""); href != "" {
+		if ref, err := url.Parse(href); err == nil {
+			href = resp.Request.URL.ResolveReference(ref).String()
+		}
+		nextKey := cacheKey
+		nextKey.Page = page + 1
+		nextPageCache.Set(nextKey, href)
+	}
+
+	duration := time.Since(startTime)
+
+	// An empty result set is reported as an error
+	if len(results) == 0 {
+		return nil, duration, fmt.Errorf("no images found")
+	}
+
+	return results, duration, nil
+}
+
+// buildDeviantArtSearchURL builds the search URL for DeviantArt. The page
+// argument is not used: only the query is encoded into the URL.
+func buildDeviantArtSearchURL(query string, page int) string {
+	baseURL := "https://www.deviantart.com/search"
+	params := url.Values{}
+	params.Add("q", query)
+	return baseURL + "?" + params.Encode()
+}
+
+// isValidImageURL checks if the image URL is accessible with the provided
+// User-Agent. It sends a HEAD request and reports whether the status is
+// 200 OK; any error counts as not accessible.
+func isValidImageURL(imgSrc, userAgent, referer string) bool {
+	req, err := http.NewRequest("HEAD", imgSrc, nil)
+	if err != nil {
+		return false
+	}
+
+	// Set headers to mimic a regular browser request
+	req.Header.Set("User-Agent", userAgent)
+	req.Header.Set("Referer", referer)
+
+	resp, err := deviantArtHTTPClient.Do(req)
+	if err != nil {
+		return false
+	}
+	defer resp.Body.Close()
+
+	return resp.StatusCode == http.StatusOK
+}
+
+// // Example usage:
+// func main() {
+// 	results, duration, err := PerformDeviantArtImageSearch("kittens", "false", "en", 1)
+// 	if err != nil {
+// 		fmt.Println("Error:", err)
+// 		return
+// 	}
+
+// 	fmt.Printf("Search took: %v\n", duration)
+// 	fmt.Printf("Total results: %d\n", len(results))
+// 	for _, result := range results {
+// 		fmt.Printf("Title: %s\nThumbnail: %s\nMedia: %s\nSource (Original Image URL): %s\n\n",
+// 			result.Title, result.Thumbnail, result.Media, result.Source)
+// 	}
+// }
diff --git a/images-quant.go b/images-quant.go
index d9a9770..ce88500 100644
--- a/images-quant.go
+++ b/images-quant.go
@@ -58,7 +58,7 @@ func PerformQwantImageSearch(query, safe, lang string, page int) ([]ImageSearchR
 		return nil, 0, fmt.Errorf("creating request: %v", err)
 	}
 
-	ImageUserAgent, err := GetUserAgent("Image-Search")
+	ImageUserAgent, err := GetUserAgent("Image-Search-Quant")
 	if err != nil {
 		return nil, 0, err
 	}
diff --git a/images.go b/images.go
index dc2c9b0..4c6b957 100755
--- a/images.go
+++ b/images.go
@@ -13,6 +13,7 @@ var imageSearchEngines []SearchEngine
 func init() {
 	imageSearchEngines = []SearchEngine{
 		{Name: "Qwant", Func: wrapImageSearchFunc(PerformQwantImageSearch), Weight: 1},
+		{Name: "DeviantArt", Func: wrapImageSearchFunc(PerformDeviantArtImageSearch), Weight: 2},
 		{Name: "Bing", Func: wrapImageSearchFunc(PerformBingImageSearch), Weight: 2}, // Bing sometimes returns with low amount of images, this leads to danamica page loading not working
 		{Name: "Imgur", Func: wrapImageSearchFunc(PerformImgurImageSearch), Weight: 3},
 	}