From c71808aa1e116c8c9238e620f255ee5d8ba3f4bb Mon Sep 17 00:00:00 2001
From: partisan
Date: Wed, 1 Jan 2025 14:50:12 +0100
Subject: [PATCH] improved crawler data extraction (added chromedp)

---
 .gitignore            |   3 +-
 config.go             | 101 +++++++++++-----------
 crawler-extraction.go | 196 ++++++++++++++++++++++++++----------------
 crawler.go            | 146 ++++++++++++++++++++++---------
 go.mod                |   8 ++
 go.sum                |  17 ++++
 6 files changed, 305 insertions(+), 166 deletions(-)

diff --git a/.gitignore b/.gitignore
index 118b838..5f5aeab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,5 @@ image_cache/
 cache/
 *.min.js
 *.min.css
-qgato
\ No newline at end of file
+qgato
+test.py
\ No newline at end of file

diff --git a/config.go b/config.go
index 4ea4eb2..bdd9ccc 100644
--- a/config.go
+++ b/config.go
@@ -23,43 +23,45 @@ type CacheConfig struct {
 }
 
 type Config struct {
-    Port                 int    // Added
-    AuthCode             string // Added
-    PeerID               string // Added
-    Peers                []string
-    Domain               string // Added
-    NodesEnabled         bool   // Added
-    CrawlerEnabled       bool   // Added
-    IndexerEnabled       bool   // Added
-    WebsiteEnabled       bool   // Added
-    RamCacheEnabled      bool
-    DriveCacheEnabled    bool // Added
-    LogLevel             int  // Added
-    ConcurrentCrawlers   int           // Number of concurrent crawlers
-    CrawlingInterval     time.Duration // Refres crawled results in...
-    MaxPagesPerDomain    int           // Max pages to crawl per domain
-    IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
+    Port                       int    // Added
+    AuthCode                   string // Added
+    PeerID                     string // Added
+    Peers                      []string
+    Domain                     string // Added
+    NodesEnabled               bool   // Added
+    CrawlerEnabled             bool   // Added
+    IndexerEnabled             bool   // Added
+    WebsiteEnabled             bool   // Added
+    RamCacheEnabled            bool
+    DriveCacheEnabled          bool // Added
+    LogLevel                   int  // Added
+    ConcurrentStandardCrawlers int
+    ConcurrentChromeCrawlers   int
+    CrawlingInterval           time.Duration // Refresh crawled results in...
+    MaxPagesPerDomain          int           // Max pages to crawl per domain
+    IndexRefreshInterval       time.Duration // Interval for periodic index refresh (e.g., "10m")
 
     DriveCache CacheConfig
     RamCache   CacheConfig
 }
 
 var defaultConfig = Config{
-    Port:                 5000,
-    Domain:               "localhost",
-    Peers:                []string{},
-    AuthCode:             generateStrongRandomString(64),
-    NodesEnabled:         false,
-    CrawlerEnabled:       true,
-    IndexerEnabled:       false,
-    WebsiteEnabled:       true,
-    RamCacheEnabled:      true,
-    DriveCacheEnabled:    false,
-    ConcurrentCrawlers:   5,
-    CrawlingInterval:     24 * time.Hour,
-    MaxPagesPerDomain:    10,
-    IndexRefreshInterval: 2 * time.Minute,
-    LogLevel:             1,
+    Port:                       5000,
+    Domain:                     "localhost",
+    Peers:                      []string{},
+    AuthCode:                   generateStrongRandomString(64),
+    NodesEnabled:               false,
+    CrawlerEnabled:             true,
+    IndexerEnabled:             false,
+    WebsiteEnabled:             true,
+    RamCacheEnabled:            true,
+    DriveCacheEnabled:          false,
+    ConcurrentStandardCrawlers: 12,
+    ConcurrentChromeCrawlers:   4,
+    CrawlingInterval:           24 * time.Hour,
+    MaxPagesPerDomain:          10,
+    IndexRefreshInterval:       2 * time.Minute,
+    LogLevel:                   1,
     DriveCache: CacheConfig{
         Duration: 48 * time.Hour, // Added
         Path:     "./cache",      // Added
@@ -249,7 +251,8 @@ func saveConfig(config Config) {
 
     // Indexer section
     indexerSec := cfg.Section("Indexer")
-    indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
+    indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
+    indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers))
     indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
     indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
     indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
@@ -296,7 +299,8 @@ func loadConfig() Config {
     driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
 
     // Indexing
-    concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
+    concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
+    concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
     crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
     maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
     indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
@@ -315,21 +319,22 @@ func loadConfig() Config {
     ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
 
     return Config{
-        Port:                 port,
-        Domain:               domain,
-        LogLevel:             logLevel,
-        AuthCode:             authCode,
-        Peers:                peers,
-        NodesEnabled:         nodesEnabled,
-        CrawlerEnabled:       crawlerEnabled,
-        IndexerEnabled:       indexerEnabled,
-        WebsiteEnabled:       websiteEnabled,
-        RamCacheEnabled:      ramCacheEnabled,
-        DriveCacheEnabled:    driveCacheEnabled,
-        ConcurrentCrawlers:   concurrentCrawlers,
-        CrawlingInterval:     crawlingInterval,
-        MaxPagesPerDomain:    maxPagesPerDomain,
-        IndexRefreshInterval: indexRefreshInterval,
+        Port:                       port,
+        Domain:                     domain,
+        LogLevel:                   logLevel,
+        AuthCode:                   authCode,
+        Peers:                      peers,
+        NodesEnabled:               nodesEnabled,
+        CrawlerEnabled:             crawlerEnabled,
+        IndexerEnabled:             indexerEnabled,
+        WebsiteEnabled:             websiteEnabled,
+        RamCacheEnabled:            ramCacheEnabled,
+        DriveCacheEnabled:          driveCacheEnabled,
+        ConcurrentStandardCrawlers: concurrentStandardCrawlers,
+        ConcurrentChromeCrawlers:   concurrentChromeCrawlers,
+        CrawlingInterval:           crawlingInterval,
+        MaxPagesPerDomain:          maxPagesPerDomain,
+        IndexRefreshInterval:       indexRefreshInterval,
         DriveCache: CacheConfig{
             Duration:      driveDuration,
             MaxUsageBytes: driveMaxUsage,

diff --git a/crawler-extraction.go b/crawler-extraction.go
index 1594bef..4ce8b9d 100644
--- a/crawler-extraction.go
+++ b/crawler-extraction.go
@@ -1,69 +1,99 @@
 package main
 
 import (
+    "context"
     "net/http"
     "net/url"
     "strings"
     "time"
 
+    "github.com/chromedp/cdproto/emulation"
+    "github.com/chromedp/chromedp"
     "github.com/go-shiori/go-readability"
     "golang.org/x/net/html"
 )
 
-// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
-// OG, Twitter, then falls back to go-readability if needed. If after all that we
-// still have no title or no description, we return ("", "", "") so the caller
-// can skip saving it.
-//
-// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
-// 2. <meta property="og:title">, <meta property="og:description">
-// 3. <meta name="twitter:title">, <meta name="twitter:description">
-// 4. go-readability fallback (if title or description is still missing)
-// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
-func fetchPageMetadata(pageURL string) (string, string, string) {
-    userAgent, err := GetUserAgent("crawler")
+// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
+func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
+    // 1. Standard HTML parse
+    title, desc, keywords := extractStandard(pageURL, userAgent)
+
+    // 2. Fallback: go-readability
+    if title == "" || desc == "" {
+        title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
+    }
+
+    // If still empty, return ("", "", "")
+    if title == "" || desc == "" {
+        return "", "", ""
+    }
+    return sanitize(title), sanitize(desc), sanitize(keywords)
+}
+
+// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
+func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
+    // Create context
+    ctx, cancel := chromedp.NewContext(context.Background())
+    defer cancel()
+
+    var renderedHTML string
+    err := chromedp.Run(ctx,
+        emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
+        chromedp.Navigate(pageURL),
+        chromedp.Sleep(2*time.Second), // Let JS run a bit
+        chromedp.OuterHTML("html", &renderedHTML),
+    )
     if err != nil {
-        printDebug("Failed to generate User-Agent: %v", err)
+        printDebug("chromedp error for %s: %v", pageURL, err)
         return "", "", ""
     }
+    doc, err := html.Parse(strings.NewReader(renderedHTML))
+    if err != nil {
+        printDebug("chromedp parse error for %s: %v", pageURL, err)
+        return "", "", ""
+    }
+
+    return extractParsedDOM(doc)
+}
+
+// extractStandard does the normal HTML parse with OG, Twitter, etc.
+func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
     client := &http.Client{Timeout: 15 * time.Second}
     req, err := http.NewRequest("GET", pageURL, nil)
     if err != nil {
         printDebug("Failed to create request for %s: %v", pageURL, err)
-        return "", "", ""
+        return
     }
-
-    // Force English content when possible
     req.Header.Set("User-Agent", userAgent)
     req.Header.Set("Accept-Language", "en-US,en;q=0.9")
 
     resp, err := client.Do(req)
     if err != nil {
         printDebug("Failed to GET %s: %v", pageURL, err)
-        return "", "", ""
+        return
     }
     defer resp.Body.Close()
 
-    // Skip non-2xx
     if resp.StatusCode < 200 || resp.StatusCode >= 300 {
         printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
-        return "", "", ""
+        return
     }
 
-    // First pass: standard HTML parse
     doc, err := html.Parse(resp.Body)
     if err != nil {
         printDebug("HTML parse error for %s: %v", pageURL, err)
-        return "", "", ""
+        return
     }
 
-    var (
-        title, desc, keywords string
-        ogTitle, ogDesc       string
-        twTitle, twDesc       string
-        foundTitle, foundDesc bool
-    )
+    return extractParsedDOM(doc)
+}
+
+// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
+func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
+    var ogTitle, ogDesc string
+    var twTitle, twDesc string
+    var foundTitle, foundDesc bool
 
     var walk func(*html.Node)
     walk = func(n *html.Node) {
@@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
             }
         }
 
-        // Standard meta tags
         switch metaName {
         case "description":
             desc = contentVal
@@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
             twDesc = contentVal
         }
 
-        // Open Graph tags
         switch metaProperty {
         case "og:title":
             ogTitle = contentVal
@@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
     }
     walk(doc)
 
-    // Fallback to OG or Twitter if <title>/description are missing
+    // fallback to OG/Twitter if missing
     if !foundTitle {
         if ogTitle != "" {
             title = ogTitle
@@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
         }
     }
 
-    // If still missing title or desc, fallback to go-readability
-    if title == "" || desc == "" {
-        parsedURL, parseErr := url.Parse(pageURL)
-        if parseErr != nil {
-            printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
-            // We must skip if we can't parse the URL for readability
-            return "", "", ""
-        }
-
-        readResp, readErr := client.Get(pageURL)
-        if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
-            defer readResp.Body.Close()
-
-            article, rdErr := readability.FromReader(readResp.Body, parsedURL)
-            if rdErr == nil {
-                // If we still have no title, try from readability
-                if title == "" && article.Title != "" {
-                    title = article.Title
-                }
-                // If we still have no description, try article.Excerpt
-                if desc == "" && article.Excerpt != "" {
-                    desc = article.Excerpt
-                } else if desc == "" && len(article.Content) > 0 {
-                    // If excerpt is empty, use a snippet from article.Content
-                    snippet := article.Content
-                    if len(snippet) > 200 {
-                        snippet = snippet[:200] + "..."
-                    }
-                    desc = snippet
-                }
-            } else {
-                printDebug("go-readability failed for %s: %v", pageURL, rdErr)
-            }
-        }
-    }
-
-    // Heuristic: discard obviously incorrect HTML-y strings or placeholders
+    // Heuristic check
     if looksLikeRawHTML(title) {
         title = ""
     }
@@ -175,16 +167,68 @@
         desc = ""
     }
 
-    // If after all that we have no title or description, skip
-    if title == "" || desc == "" {
-        return "", "", ""
-    }
-
-    return sanitize(title), sanitize(desc), sanitize(keywords)
+    return title, desc, keywords
 }
 
-// looksLikeRawHTML is a simple heuristic to check for leftover HTML or
-// go-readability noise (e.g., "readability-page-1").
+// fallbackReadability tries go-readability if title/desc is missing.
+func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
+    if title != "" && desc != "" {
+        return title, desc, keywords
+    }
+
+    client := &http.Client{Timeout: 15 * time.Second}
+    readReq, err := http.NewRequest("GET", pageURL, nil)
+    if err != nil {
+        printDebug("Failed to create fallbackReadability request: %v", err)
+        return title, desc, keywords
+    }
+    readReq.Header.Set("User-Agent", userAgent)
+    readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
+
+    readResp, err := client.Do(readReq)
+    if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
+        if err != nil {
+            printDebug("go-readability GET error for %s: %v", pageURL, err)
+        }
+        if readResp != nil {
+            readResp.Body.Close()
+        }
+        return title, desc, keywords
+    }
+    defer readResp.Body.Close()
+
+    parsedURL, parseErr := url.Parse(pageURL)
+    if parseErr != nil {
+        printDebug("Failed to parse URL: %v", parseErr)
+        return title, desc, keywords
+    }
+
+    article, rdErr := readability.FromReader(readResp.Body, parsedURL)
+    if rdErr != nil {
+        printDebug("go-readability error for %s: %v", pageURL, rdErr)
+        return title, desc, keywords
+    }
+
+    if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
+        title = article.Title
+    }
+    if desc == "" {
+        if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
+            desc = article.Excerpt
+        } else if len(article.Content) > 0 {
+            snippet := article.Content
+            if len(snippet) > 200 {
+                snippet = snippet[:200] + "..."
+            }
+            if !looksLikeRawHTML(snippet) {
+                desc = snippet
+            }
+        }
+    }
+    return title, desc, keywords
+}
+
+// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text
 func looksLikeRawHTML(text string) bool {
     textLower := strings.ToLower(text)
     if strings.Contains(textLower, "readability-page") {
@@ -196,7 +240,7 @@
     return false
 }
 
-// sanitize removes pipes and newlines so they don't break our output format.
+// sanitize removes pipes/newlines so they don't break our output format.
 func sanitize(input string) string {
     input = strings.ReplaceAll(input, "|", " ")
     input = strings.ReplaceAll(input, "\n", " ")

diff --git a/crawler.go b/crawler.go
index 2a934f6..45dc76f 100644
--- a/crawler.go
+++ b/crawler.go
@@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
     // 2. Crawl each domain and write results to data_to_index.txt
     outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
 
-    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
+    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
         printErr("Error crawling domains: %v", err)
         return
     }
@@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
     return result, scanner.Err()
 }
 
-// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
-func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
+// crawlDomainsToFile does an async pipeline:
+// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
+// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
+func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
     existingEntries := make(map[string]bool)
-    var mu sync.Mutex // Mutex to protect access to the map
+    var mu sync.Mutex // For existingEntries + file writes
 
+    // read existing entries from outFile if it exists
     if _, err := os.Stat(outFile); err == nil {
         file, err := os.Open(outFile)
         if err != nil {
             return fmt.Errorf("unable to open %s: %v", outFile, err)
         }
         defer file.Close()
-
         scanner := bufio.NewScanner(file)
         for scanner.Scan() {
             line := scanner.Text()
@@ -104,47 +106,109 @@
     }
     defer file.Close()
 
-    semaphore := make(chan struct{}, concurrentCrawlers)
-    var wg sync.WaitGroup
+    // Prepare channels
+    standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
+    chromeCh := make(chan [2]string, 1000)
 
-    for _, d := range domains {
-        wg.Add(1)
-        semaphore <- struct{}{}
-        go func(domain [2]string) {
-            defer wg.Done()
-            defer func() { <-semaphore }()
+    // 1) Spawn standard workers
+    var wgStandard sync.WaitGroup
+    for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
+        wgStandard.Add(1)
+        go func() {
+            defer wgStandard.Done()
+            for dom := range standardCh {
+                rank := dom[0]
+                domainName := dom[1]
+                fullURL := "https://" + domainName
 
-            rank := domain[0]
-            domainName := domain[1]
-            fullURL := "https://" + domainName
-
-            mu.Lock()
-            if domainName == "" || existingEntries[fullURL] {
+                // Mark domain existing so we don't re-crawl duplicates
+                mu.Lock()
+                if domainName == "" || existingEntries[fullURL] {
+                    mu.Unlock()
+                    continue
+                }
+                existingEntries[fullURL] = true
+                mu.Unlock()
+
+                // get a standard user agent
+                userAgent, _ := GetUserAgent("crawler-std")
+                title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
+
+                if title == "" || desc == "" {
+                    // push to chromeCh
+                    chromeCh <- dom
+                    continue
+                }
+
+                // write to file
+                line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                    fullURL, title, keywords, desc, rank)
+
+                mu.Lock()
+                file.WriteString(line)
                 mu.Unlock()
-                return
             }
-            existingEntries[fullURL] = true
-            mu.Unlock()
-
-            title, desc, keywords := fetchPageMetadata(fullURL)
-
-            // Skip saving if title or description is missing
-            if title == "" || desc == "" {
-                printDebug("Skipping %s: missing title or description", fullURL)
-                return
-            }
-
-            line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-                fullURL,
-                title,
-                keywords,
-                desc,
-                rank,
-            )
-            file.WriteString(line)
-        }(d)
+        }()
     }
-    wg.Wait()
 
+    // 2) Spawn chrome workers
+    var wgChrome sync.WaitGroup
+    for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
+        wgChrome.Add(1)
+        go func() {
+            defer wgChrome.Done()
+            for dom := range chromeCh {
+                rank := dom[0]
+                domainName := dom[1]
+                if domainName == "" {
+                    continue
+                }
+                fullURL := "https://" + domainName
+
+                // The standard worker already marked this URL in existingEntries
+                // before handing it off, so don't re-check the map here or every
+                // domain queued on chromeCh would be skipped.
+
+                // get a chrome user agent
+                userAgent, _ := GetUserAgent("crawler-chrome")
+                title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
+
+                if title == "" || desc == "" {
+                    printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
+                    continue
+                }
+
+                // write to file
+                line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                    fullURL, title, keywords, desc, rank)
+
+                mu.Lock()
+                file.WriteString(line)
+                mu.Unlock()
+            }
+        }()
+    }
+
+    // Feed domains into standardCh
+    go func() {
+        for _, dom := range domains {
+            // optionally, if maxPages is relevant, you can track how many have been processed
+            standardCh <- dom
+        }
+        // close the standardCh once all are queued
+        close(standardCh)
+    }()
+
+    // Wait for standard workers to finish, then close chromeCh
+    go func() {
+        wgStandard.Wait()
+        close(chromeCh)
+    }()
+
+    // Wait for chrome workers to finish
+    wgChrome.Wait()
+
     return nil
 }

diff --git a/go.mod b/go.mod
index a293a75..c8200d3 100644
--- a/go.mod
+++ b/go.mod
@@ -41,13 +41,21 @@
     github.com/blevesearch/zapx/v14 v14.3.10 // indirect
     github.com/blevesearch/zapx/v15 v15.3.17 // indirect
     github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
+    github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect
+    github.com/chromedp/chromedp v0.11.2 // indirect
+    github.com/chromedp/sysutil v1.1.0 // indirect
     github.com/go-ole/go-ole v1.3.0 // indirect
     github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
+    github.com/gobwas/httphead v0.1.0 // indirect
+    github.com/gobwas/pool v0.2.1 // indirect
+    github.com/gobwas/ws v1.4.0 // indirect
     github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
     github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect
     github.com/golang/protobuf v1.5.4 // indirect
     github.com/golang/snappy v0.0.4 // indirect
+    github.com/josharian/intern v1.0.0 // indirect
     github.com/json-iterator/go v1.1.12 // indirect
+    github.com/mailru/easyjson v0.7.7 // indirect
     github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
     github.com/modern-go/reflect2 v1.0.2 // indirect
     github.com/mschoch/smat v0.2.0 // indirect

diff --git a/go.sum b/go.sum
index 59414b4..148146f 100644
--- a/go.sum
+++ b/go.sum
@@ -47,6 +47,12 @@ github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5Y
 github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI=
 github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk=
 github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU=
+github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb h1:noKVm2SsG4v0Yd0lHNtFYc9EUxIVvrr4kJ6hM8wvIYU=
+github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb/go.mod h1:4XqMl3iIW08jtieURWL6Tt5924w21pxirC6th662XUM=
+github.com/chromedp/chromedp v0.11.2 h1:ZRHTh7DjbNTlfIv3NFTbB7eVeu5XCNkgrpcGSpn2oX0=
+github.com/chromedp/chromedp v0.11.2/go.mod h1:lr8dFRLKsdTTWb75C/Ttol2vnBKOSnt0BW8R9Xaupi8=
+github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM=
+github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -57,6 +63,12 @@ github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziH
 github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
 github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI=
 github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w=
+github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
+github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
+github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
+github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
+github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
+github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
 github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
 github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
 github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I=
@@ -68,10 +80,14 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
 github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
@@ -137,6 +153,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
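
Usage note: fetchPageMetadataChrome in the patch drives a headless browser once per URL. The following is a minimal, self-contained sketch (not part of the patch) that exercises the same chromedp calls; the renderedHTML helper name, the example URL, and the 30-second per-page timeout are illustrative assumptions, and running it requires a local Chrome/Chromium install.

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/chromedp/cdproto/emulation"
	"github.com/chromedp/chromedp"
)

// renderedHTML fetches the post-JavaScript DOM of a page, mirroring the calls
// used by fetchPageMetadataChrome but with an explicit timeout (an assumption,
// not something the patch itself sets).
func renderedHTML(pageURL, userAgent string) (string, error) {
	base, cancelTimeout := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancelTimeout()

	ctx, cancel := chromedp.NewContext(base)
	defer cancel()

	var out string
	err := chromedp.Run(ctx,
		emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // give client-side rendering a moment
		chromedp.OuterHTML("html", &out),
	)
	return out, err
}

func main() {
	html, err := renderedHTML("https://example.com", "Mozilla/5.0")
	if err != nil {
		fmt.Println("chromedp error:", err)
		return
	}
	fmt.Println(len(html), "bytes of rendered HTML")
}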
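
For reference, here is a minimal sketch of the standard-to-Chrome handoff pattern that crawlDomainsToFile implements: standard workers push URLs they could not extract onto a second channel, and that channel is closed only after every standard worker has exited so no send can race the close. The tryStandard/tryChrome helpers, URLs, and worker counts are placeholders, not code from the repository.

package main

import (
	"fmt"
	"sync"
)

func main() {
	urls := []string{"a.example", "b.example", "c.example"}

	standardCh := make(chan string, len(urls))
	chromeCh := make(chan string, len(urls))

	tryStandard := func(u string) bool { return u == "a.example" } // placeholder extractor
	tryChrome := func(u string) bool { return true }               // placeholder extractor

	// Stage 1: cheap standard extraction; failures are handed off.
	var wgStandard sync.WaitGroup
	for i := 0; i < 3; i++ {
		wgStandard.Add(1)
		go func() {
			defer wgStandard.Done()
			for u := range standardCh {
				if !tryStandard(u) {
					chromeCh <- u // hand off to the heavier Chrome stage
				}
			}
		}()
	}

	// Stage 2: fewer, more expensive Chrome workers.
	var wgChrome sync.WaitGroup
	for i := 0; i < 2; i++ {
		wgChrome.Add(1)
		go func() {
			defer wgChrome.Done()
			for u := range chromeCh {
				if tryChrome(u) {
					fmt.Println("extracted via chrome:", u)
				}
			}
		}()
	}

	for _, u := range urls {
		standardCh <- u
	}
	close(standardCh)

	// chromeCh may only be closed once no standard worker can still send on it.
	wgStandard.Wait()
	close(chromeCh)
	wgChrome.Wait()
}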