improved crawler data extraction (added chromedp)

This commit is contained in:
partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@ cache/
*.min.js *.min.js
*.min.css *.min.css
qgato qgato
test.py

View file

@ -35,7 +35,8 @@ type Config struct {
RamCacheEnabled bool RamCacheEnabled bool
DriveCacheEnabled bool // Added DriveCacheEnabled bool // Added
LogLevel int // Added LogLevel int // Added
ConcurrentCrawlers int // Number of concurrent crawlers ConcurrentStandardCrawlers int
ConcurrentChromeCrawlers int
CrawlingInterval time.Duration // Refres crawled results in... CrawlingInterval time.Duration // Refres crawled results in...
MaxPagesPerDomain int // Max pages to crawl per domain MaxPagesPerDomain int // Max pages to crawl per domain
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
@ -55,7 +56,8 @@ var defaultConfig = Config{
WebsiteEnabled: true, WebsiteEnabled: true,
RamCacheEnabled: true, RamCacheEnabled: true,
DriveCacheEnabled: false, DriveCacheEnabled: false,
ConcurrentCrawlers: 5, ConcurrentStandardCrawlers: 12,
ConcurrentChromeCrawlers: 4,
CrawlingInterval: 24 * time.Hour, CrawlingInterval: 24 * time.Hour,
MaxPagesPerDomain: 10, MaxPagesPerDomain: 10,
IndexRefreshInterval: 2 * time.Minute, IndexRefreshInterval: 2 * time.Minute,
@ -249,7 +251,8 @@ func saveConfig(config Config) {
// Indexer section // Indexer section
indexerSec := cfg.Section("Indexer") indexerSec := cfg.Section("Indexer")
indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers)) indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
// Persist the Chrome crawler count from its own field rather than the standard crawler count.
indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers))
indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String()) indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain)) indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String()) indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
@ -296,7 +299,8 @@ func loadConfig() Config {
driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled) driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
// Indexing // Indexing
concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi) concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration) crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi) maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration) indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
@ -326,7 +330,8 @@ func loadConfig() Config {
WebsiteEnabled: websiteEnabled, WebsiteEnabled: websiteEnabled,
RamCacheEnabled: ramCacheEnabled, RamCacheEnabled: ramCacheEnabled,
DriveCacheEnabled: driveCacheEnabled, DriveCacheEnabled: driveCacheEnabled,
ConcurrentCrawlers: concurrentCrawlers, ConcurrentStandardCrawlers: concurrentStandardCrawlers,
ConcurrentChromeCrawlers: concurrentChromeCrawlers,
CrawlingInterval: crawlingInterval, CrawlingInterval: crawlingInterval,
MaxPagesPerDomain: maxPagesPerDomain, MaxPagesPerDomain: maxPagesPerDomain,
IndexRefreshInterval: indexRefreshInterval, IndexRefreshInterval: indexRefreshInterval,

View file

@ -1,69 +1,99 @@
package main package main
import ( import (
"context"
"net/http" "net/http"
"net/url" "net/url"
"strings" "strings"
"time" "time"
"github.com/chromedp/cdproto/emulation"
"github.com/chromedp/chromedp"
"github.com/go-shiori/go-readability" "github.com/go-shiori/go-readability"
"golang.org/x/net/html" "golang.org/x/net/html"
) )
// fetchPageMetadata tries extracting title/description/keywords from standard HTML, // fetchPageMetadataStandard tries standard HTML parse + go-readability only.
// OG, Twitter, then falls back to go-readability if needed. If after all that we func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
// still have no title or no description, we return ("", "", "") so the caller // 1. Standard HTML parse
// can skip saving it. title, desc, keywords := extractStandard(pageURL, userAgent)
//
// 1. <title>, <meta name="description"/>, <meta name="keywords"/> // 2. Fallback: go-readability
// 2. <meta property="og:title">, <meta property="og:description"> if title == "" || desc == "" {
// 3. <meta name="twitter:title">, <meta name="twitter:description"> title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
// 4. go-readability fallback (if title or description is still missing) }
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadata(pageURL string) (string, string, string) { // If still empty, return ("", "", "")
userAgent, err := GetUserAgent("crawler") if title == "" || desc == "" {
return "", "", ""
}
return sanitize(title), sanitize(desc), sanitize(keywords)
}
// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
// Create the browser context, then bound the whole run with a deadline so a
// page that never finishes loading cannot stall this worker forever.
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
ctx, cancelTimeout := context.WithTimeout(ctx, 30*time.Second)
defer cancelTimeout()
var renderedHTML string
err := chromedp.Run(ctx,
emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
chromedp.Navigate(pageURL),
chromedp.Sleep(2*time.Second), // Let JS run a bit
chromedp.OuterHTML("html", &renderedHTML),
)
if err != nil { if err != nil {
printDebug("Failed to generate User-Agent: %v", err) printDebug("chromedp error for %s: %v", pageURL, err)
return "", "", "" return "", "", ""
} }
doc, err := html.Parse(strings.NewReader(renderedHTML))
if err != nil {
printDebug("chromedp parse error for %s: %v", pageURL, err)
return "", "", ""
}
return extractParsedDOM(doc)
}
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
client := &http.Client{Timeout: 15 * time.Second} client := &http.Client{Timeout: 15 * time.Second}
req, err := http.NewRequest("GET", pageURL, nil) req, err := http.NewRequest("GET", pageURL, nil)
if err != nil { if err != nil {
printDebug("Failed to create request for %s: %v", pageURL, err) printDebug("Failed to create request for %s: %v", pageURL, err)
return "", "", "" return
} }
// Force English content when possible
req.Header.Set("User-Agent", userAgent) req.Header.Set("User-Agent", userAgent)
req.Header.Set("Accept-Language", "en-US,en;q=0.9") req.Header.Set("Accept-Language", "en-US,en;q=0.9")
resp, err := client.Do(req) resp, err := client.Do(req)
if err != nil { if err != nil {
printDebug("Failed to GET %s: %v", pageURL, err) printDebug("Failed to GET %s: %v", pageURL, err)
return "", "", "" return
} }
defer resp.Body.Close() defer resp.Body.Close()
// Skip non-2xx
if resp.StatusCode < 200 || resp.StatusCode >= 300 { if resp.StatusCode < 200 || resp.StatusCode >= 300 {
printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode) printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
return "", "", "" return
} }
// First pass: standard HTML parse
doc, err := html.Parse(resp.Body) doc, err := html.Parse(resp.Body)
if err != nil { if err != nil {
printDebug("HTML parse error for %s: %v", pageURL, err) printDebug("HTML parse error for %s: %v", pageURL, err)
return "", "", "" return
} }
var ( return extractParsedDOM(doc)
title, desc, keywords string }
ogTitle, ogDesc string
twTitle, twDesc string // extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
foundTitle, foundDesc bool func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
) var ogTitle, ogDesc string
var twTitle, twDesc string
var foundTitle, foundDesc bool
var walk func(*html.Node) var walk func(*html.Node)
walk = func(n *html.Node) { walk = func(n *html.Node) {
@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
} }
} }
// Standard meta tags
switch metaName { switch metaName {
case "description": case "description":
desc = contentVal desc = contentVal
@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
twDesc = contentVal twDesc = contentVal
} }
// Open Graph tags
switch metaProperty { switch metaProperty {
case "og:title": case "og:title":
ogTitle = contentVal ogTitle = contentVal
@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
} }
walk(doc) walk(doc)
// Fallback to OG or Twitter if <title>/description are missing // fallback to OG/Twitter if missing
if !foundTitle { if !foundTitle {
if ogTitle != "" { if ogTitle != "" {
title = ogTitle title = ogTitle
@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
} }
} }
// If still missing title or desc, fallback to go-readability // Heuristic check
if title == "" || desc == "" {
parsedURL, parseErr := url.Parse(pageURL)
if parseErr != nil {
printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
// We must skip if we can't parse the URL for readability
return "", "", ""
}
readResp, readErr := client.Get(pageURL)
if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
defer readResp.Body.Close()
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
if rdErr == nil {
// If we still have no title, try from readability
if title == "" && article.Title != "" {
title = article.Title
}
// If we still have no description, try article.Excerpt
if desc == "" && article.Excerpt != "" {
desc = article.Excerpt
} else if desc == "" && len(article.Content) > 0 {
// If excerpt is empty, use a snippet from article.Content
snippet := article.Content
if len(snippet) > 200 {
snippet = snippet[:200] + "..."
}
desc = snippet
}
} else {
printDebug("go-readability failed for %s: %v", pageURL, rdErr)
}
}
}
// Heuristic: discard obviously incorrect HTML-y strings or placeholders
if looksLikeRawHTML(title) { if looksLikeRawHTML(title) {
title = "" title = ""
} }
@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
desc = "" desc = ""
} }
// If after all that we have no title or description, skip return title, desc, keywords
if title == "" || desc == "" {
return "", "", ""
} }
return sanitize(title), sanitize(desc), sanitize(keywords) // fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
if title != "" && desc != "" {
return title, desc, keywords
} }
// looksLikeRawHTML is a simple heuristic to check for leftover HTML or client := &http.Client{Timeout: 15 * time.Second}
// go-readability noise (e.g., "readability-page-1"). readReq, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
printDebug("Failed to create fallbackReadability request: %v", err)
return title, desc, keywords
}
readReq.Header.Set("User-Agent", userAgent)
readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
readResp, err := client.Do(readReq)
if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
if err != nil {
printDebug("go-readability GET error for %s: %v", pageURL, err)
}
if readResp != nil {
readResp.Body.Close()
}
return title, desc, keywords
}
defer readResp.Body.Close()
parsedURL, parseErr := url.Parse(pageURL)
if parseErr != nil {
printDebug("Failed to parse URL: %v", parseErr)
return title, desc, keywords
}
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
if rdErr != nil {
printDebug("go-readability error for %s: %v", pageURL, rdErr)
return title, desc, keywords
}
if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
title = article.Title
}
if desc == "" {
	if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
		desc = article.Excerpt
	} else if len(article.Content) > 0 {
		// Cap the snippet at 200 characters, cutting on a rune boundary so a
		// multi-byte UTF-8 sequence is never split in half (a plain byte
		// slice could emit invalid UTF-8 into the output file).
		snippet := article.Content
		seen := 0
		for i := range snippet {
			if seen == 200 {
				snippet = snippet[:i] + "..."
				break
			}
			seen++
		}
		if !looksLikeRawHTML(snippet) {
			desc = snippet
		}
	}
}
return title, desc, keywords
}
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text
func looksLikeRawHTML(text string) bool { func looksLikeRawHTML(text string) bool {
textLower := strings.ToLower(text) textLower := strings.ToLower(text)
if strings.Contains(textLower, "readability-page") { if strings.Contains(textLower, "readability-page") {
@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool {
return false return false
} }
// sanitize removes pipes and newlines so they don't break our output format. // sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string { func sanitize(input string) string {
input = strings.ReplaceAll(input, "|", " ") input = strings.ReplaceAll(input, "|", " ")
input = strings.ReplaceAll(input, "\n", " ") input = strings.ReplaceAll(input, "\n", " ")

View file

@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
// 2. Crawl each domain and write results to data_to_index.txt // 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil { if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
printErr("Error crawling domains: %v", err) printErr("Error crawling domains: %v", err)
return return
} }
@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
return result, scanner.Err() return result, scanner.Err()
} }
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile // crawlDomainsToFile does an async pipeline:
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error { // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
existingEntries := make(map[string]bool) existingEntries := make(map[string]bool)
var mu sync.Mutex // Mutex to protect access to the map var mu sync.Mutex // For existingEntries + file writes
// read existing entries from outFile if it exists
if _, err := os.Stat(outFile); err == nil { if _, err := os.Stat(outFile); err == nil {
file, err := os.Open(outFile) file, err := os.Open(outFile)
if err != nil { if err != nil {
return fmt.Errorf("unable to open %s: %v", outFile, err) return fmt.Errorf("unable to open %s: %v", outFile, err)
} }
defer file.Close() defer file.Close()
scanner := bufio.NewScanner(file) scanner := bufio.NewScanner(file)
for scanner.Scan() { for scanner.Scan() {
line := scanner.Text() line := scanner.Text()
@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
} }
defer file.Close() defer file.Close()
semaphore := make(chan struct{}, concurrentCrawlers) // Prepare channels
var wg sync.WaitGroup standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
chromeCh := make(chan [2]string, 1000)
for _, d := range domains { // 1) Spawn standard workers
wg.Add(1) var wgStandard sync.WaitGroup
semaphore <- struct{}{} for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
go func(domain [2]string) { wgStandard.Add(1)
defer wg.Done() go func() {
defer func() { <-semaphore }() defer wgStandard.Done()
for dom := range standardCh {
rank := domain[0] rank := dom[0]
domainName := domain[1] domainName := dom[1]
fullURL := "https://" + domainName fullURL := "https://" + domainName
// Mark domain existing so we don't re-crawl duplicates
mu.Lock() mu.Lock()
if domainName == "" || existingEntries[fullURL] { if domainName == "" || existingEntries[fullURL] {
mu.Unlock() mu.Unlock()
return continue
} }
existingEntries[fullURL] = true existingEntries[fullURL] = true
mu.Unlock() mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL) // get a standard user agent
userAgent, _ := GetUserAgent("crawler-std")
title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
// Skip saving if title or description is missing
if title == "" || desc == "" { if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL) // push to chromeCh
return chromeCh <- dom
continue
} }
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n", line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, fullURL, title, keywords, desc, rank)
title,
keywords, mu.Lock()
desc,
rank,
)
file.WriteString(line) file.WriteString(line)
}(d) mu.Unlock()
}
}()
} }
wg.Wait() // 2) Spawn chrome workers
var wgChrome sync.WaitGroup
for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
wgChrome.Add(1)
go func() {
defer wgChrome.Done()
for dom := range chromeCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
// Mark domain existing if not already
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a chrome user agent
userAgent, _ := GetUserAgent("crawler-chrome")
title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
if title == "" || desc == "" {
printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
}
}()
}
// Feed domains into standardCh
go func() {
for _, dom := range domains {
// optionally, if maxPages is relevant, you can track how many have been processed
standardCh <- dom
}
// close the standardCh once all are queued
close(standardCh)
}()
// Wait for standard workers to finish, then close chromeCh
go func() {
wgStandard.Wait()
close(chromeCh)
}()
// Wait for chrome workers to finish
wgChrome.Wait()
return nil return nil
} }

8
go.mod
View file

@ -41,13 +41,21 @@ require (
github.com/blevesearch/zapx/v14 v14.3.10 // indirect github.com/blevesearch/zapx/v14 v14.3.10 // indirect
github.com/blevesearch/zapx/v15 v15.3.17 // indirect github.com/blevesearch/zapx/v15 v15.3.17 // indirect
github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect
github.com/chromedp/chromedp v0.11.2 // indirect
github.com/chromedp/sysutil v1.1.0 // indirect
github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-ole/go-ole v1.3.0 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect
github.com/golang/protobuf v1.5.4 // indirect github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/snappy v0.0.4 // indirect github.com/golang/snappy v0.0.4 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mschoch/smat v0.2.0 // indirect github.com/mschoch/smat v0.2.0 // indirect

17
go.sum
View file

@ -47,6 +47,12 @@ github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5Y
github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI= github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI=
github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk= github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk=
github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU= github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU=
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb h1:noKVm2SsG4v0Yd0lHNtFYc9EUxIVvrr4kJ6hM8wvIYU=
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb/go.mod h1:4XqMl3iIW08jtieURWL6Tt5924w21pxirC6th662XUM=
github.com/chromedp/chromedp v0.11.2 h1:ZRHTh7DjbNTlfIv3NFTbB7eVeu5XCNkgrpcGSpn2oX0=
github.com/chromedp/chromedp v0.11.2/go.mod h1:lr8dFRLKsdTTWb75C/Ttol2vnBKOSnt0BW8R9Xaupi8=
github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM=
github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -57,6 +63,12 @@ github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziH
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM= github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI= github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI=
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w= github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14= github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I= github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I=
@ -68,10 +80,14 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8= github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw= github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
@ -137,6 +153,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=