improved crawler data extraction (added chromedp)

This commit is contained in:
partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions

3
.gitignore vendored
View file

@@ -5,4 +5,5 @@ image_cache/
cache/
*.min.js
*.min.css
qgato
qgato
test.py

101
config.go
View file

@@ -23,43 +23,45 @@ type CacheConfig struct {
}
type Config struct {
Port int // Added
AuthCode string // Added
PeerID string // Added
Peers []string
Domain string // Added
NodesEnabled bool // Added
CrawlerEnabled bool // Added
IndexerEnabled bool // Added
WebsiteEnabled bool // Added
RamCacheEnabled bool
DriveCacheEnabled bool // Added
LogLevel int // Added
ConcurrentCrawlers int // Number of concurrent crawlers
CrawlingInterval time.Duration // Refresh crawled results in...
MaxPagesPerDomain int // Max pages to crawl per domain
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
Port int // Added
AuthCode string // Added
PeerID string // Added
Peers []string
Domain string // Added
NodesEnabled bool // Added
CrawlerEnabled bool // Added
IndexerEnabled bool // Added
WebsiteEnabled bool // Added
RamCacheEnabled bool
DriveCacheEnabled bool // Added
LogLevel int // Added
ConcurrentStandardCrawlers int
ConcurrentChromeCrawlers int
CrawlingInterval time.Duration // Refresh crawled results in...
MaxPagesPerDomain int // Max pages to crawl per domain
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
DriveCache CacheConfig
RamCache CacheConfig
}
var defaultConfig = Config{
Port: 5000,
Domain: "localhost",
Peers: []string{},
AuthCode: generateStrongRandomString(64),
NodesEnabled: false,
CrawlerEnabled: true,
IndexerEnabled: false,
WebsiteEnabled: true,
RamCacheEnabled: true,
DriveCacheEnabled: false,
ConcurrentCrawlers: 5,
CrawlingInterval: 24 * time.Hour,
MaxPagesPerDomain: 10,
IndexRefreshInterval: 2 * time.Minute,
LogLevel: 1,
Port: 5000,
Domain: "localhost",
Peers: []string{},
AuthCode: generateStrongRandomString(64),
NodesEnabled: false,
CrawlerEnabled: true,
IndexerEnabled: false,
WebsiteEnabled: true,
RamCacheEnabled: true,
DriveCacheEnabled: false,
ConcurrentStandardCrawlers: 12,
ConcurrentChromeCrawlers: 4,
CrawlingInterval: 24 * time.Hour,
MaxPagesPerDomain: 10,
IndexRefreshInterval: 2 * time.Minute,
LogLevel: 1,
DriveCache: CacheConfig{
Duration: 48 * time.Hour, // Added
Path: "./cache", // Added
@@ -249,7 +251,8 @@ func saveConfig(config Config) {
// Indexer section
indexerSec := cfg.Section("Indexer")
indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
@@ -296,7 +299,8 @@ func loadConfig() Config {
driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
// Indexing
concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
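getConfigValue itself is not part of this diff; below is a minimal sketch of the shape these call sites imply, assuming gopkg.in/ini.v1 (its Section(...).Key(...) API matches the usage above) and a generic helper. The real implementation may differ.

package main

import (
	"strings"

	"gopkg.in/ini.v1"
)

// Sketch: return the parsed key value, falling back to the default when the key
// is empty or fails to parse. Fits parsers like strconv.Atoi and time.ParseDuration.
func getConfigValue[T any](key *ini.Key, defaultVal T, parse func(string) (T, error)) T {
	raw := strings.TrimSpace(key.String())
	if raw == "" {
		return defaultVal
	}
	if v, err := parse(raw); err == nil {
		return v
	}
	return defaultVal
}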
@@ -315,21 +319,22 @@ func loadConfig() Config {
ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
return Config{
Port: port,
Domain: domain,
LogLevel: logLevel,
AuthCode: authCode,
Peers: peers,
NodesEnabled: nodesEnabled,
CrawlerEnabled: crawlerEnabled,
IndexerEnabled: indexerEnabled,
WebsiteEnabled: websiteEnabled,
RamCacheEnabled: ramCacheEnabled,
DriveCacheEnabled: driveCacheEnabled,
ConcurrentCrawlers: concurrentCrawlers,
CrawlingInterval: crawlingInterval,
MaxPagesPerDomain: maxPagesPerDomain,
IndexRefreshInterval: indexRefreshInterval,
Port: port,
Domain: domain,
LogLevel: logLevel,
AuthCode: authCode,
Peers: peers,
NodesEnabled: nodesEnabled,
CrawlerEnabled: crawlerEnabled,
IndexerEnabled: indexerEnabled,
WebsiteEnabled: websiteEnabled,
RamCacheEnabled: ramCacheEnabled,
DriveCacheEnabled: driveCacheEnabled,
ConcurrentStandardCrawlers: concurrentStandardCrawlers,
ConcurrentChromeCrawlers: concurrentChromeCrawlers,
CrawlingInterval: crawlingInterval,
MaxPagesPerDomain: maxPagesPerDomain,
IndexRefreshInterval: indexRefreshInterval,
DriveCache: CacheConfig{
Duration: driveDuration,
MaxUsageBytes: driveMaxUsage,

View file

@@ -1,69 +1,99 @@
package main
import (
"context"
"net/http"
"net/url"
"strings"
"time"
"github.com/chromedp/cdproto/emulation"
"github.com/chromedp/chromedp"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
)
// fetchPageMetadata tries extracting title/description/keywords from standard HTML,
// OG, Twitter, then falls back to go-readability if needed. If after all that we
// still have no title or no description, we return ("", "", "") so the caller
// can skip saving it.
//
// 1. <title>, <meta name="description"/>, <meta name="keywords"/>
// 2. <meta property="og:title">, <meta property="og:description">
// 3. <meta name="twitter:title">, <meta name="twitter:description">
// 4. go-readability fallback (if title or description is still missing)
// 5. Basic heuristic to detect “wrong” content from readability (e.g. raw HTML or “readability-page-1”).
func fetchPageMetadata(pageURL string) (string, string, string) {
userAgent, err := GetUserAgent("crawler")
// fetchPageMetadataStandard tries standard HTML parse + go-readability only.
func fetchPageMetadataStandard(pageURL, userAgent string) (string, string, string) {
// 1. Standard HTML parse
title, desc, keywords := extractStandard(pageURL, userAgent)
// 2. Fallback: go-readability
if title == "" || desc == "" {
title, desc, keywords = fallbackReadability(pageURL, userAgent, title, desc, keywords)
}
// If still empty, return ("", "", "")
if title == "" || desc == "" {
return "", "", ""
}
return sanitize(title), sanitize(desc), sanitize(keywords)
}
// fetchPageMetadataChrome uses Chromedp to handle JavaScript-rendered pages.
func fetchPageMetadataChrome(pageURL, userAgent string) (string, string, string) {
// Create context
ctx, cancel := chromedp.NewContext(context.Background())
defer cancel()
var renderedHTML string
err := chromedp.Run(ctx,
emulation.SetUserAgentOverride(userAgent).WithAcceptLanguage("en-US,en;q=0.9"),
chromedp.Navigate(pageURL),
chromedp.Sleep(2*time.Second), // Let JS run a bit
chromedp.OuterHTML("html", &renderedHTML),
)
if err != nil {
printDebug("Failed to generate User-Agent: %v", err)
printDebug("chromedp error for %s: %v", pageURL, err)
return "", "", ""
}
doc, err := html.Parse(strings.NewReader(renderedHTML))
if err != nil {
printDebug("chromedp parse error for %s: %v", pageURL, err)
return "", "", ""
}
return extractParsedDOM(doc)
}
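One caveat in the chromedp path (not addressed by this commit): the context has no deadline, so a page that never finishes loading can pin a Chrome worker. A hedged sketch of the same pattern with an overall timeout; fetchRenderedHTML is a hypothetical name, not code from the repository:

package main

import (
	"context"
	"time"

	"github.com/chromedp/chromedp"
)

// Sketch: navigate, let client-side JS settle, return the rendered outer HTML,
// all bounded by a 30-second timeout so a stalled page cannot block the worker.
func fetchRenderedHTML(pageURL string) (string, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	var rendered string
	err := chromedp.Run(ctx,
		chromedp.Navigate(pageURL),
		chromedp.Sleep(2*time.Second), // let JS run a bit, as above
		chromedp.OuterHTML("html", &rendered),
	)
	return rendered, err
}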
// extractStandard does the normal HTML parse with OG, Twitter, etc.
func extractStandard(pageURL, userAgent string) (title, desc, keywords string) {
client := &http.Client{Timeout: 15 * time.Second}
req, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
printDebug("Failed to create request for %s: %v", pageURL, err)
return "", "", ""
return
}
// Force English content when possible
req.Header.Set("User-Agent", userAgent)
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
resp, err := client.Do(req)
if err != nil {
printDebug("Failed to GET %s: %v", pageURL, err)
return "", "", ""
return
}
defer resp.Body.Close()
// Skip non-2xx
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
printDebug("Skipping %s due to HTTP status: %d", pageURL, resp.StatusCode)
return "", "", ""
return
}
// First pass: standard HTML parse
doc, err := html.Parse(resp.Body)
if err != nil {
printDebug("HTML parse error for %s: %v", pageURL, err)
return "", "", ""
return
}
var (
title, desc, keywords string
ogTitle, ogDesc string
twTitle, twDesc string
foundTitle, foundDesc bool
)
return extractParsedDOM(doc)
}
// extractParsedDOM uses the same logic to parse <title>, meta, OG, Twitter.
func extractParsedDOM(doc *html.Node) (title, desc, keywords string) {
var ogTitle, ogDesc string
var twTitle, twDesc string
var foundTitle, foundDesc bool
var walk func(*html.Node)
walk = func(n *html.Node) {
@@ -87,7 +117,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
}
}
// Standard meta tags
switch metaName {
case "description":
desc = contentVal
@@ -100,7 +129,6 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
twDesc = contentVal
}
// Open Graph tags
switch metaProperty {
case "og:title":
ogTitle = contentVal
@@ -115,7 +143,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
}
walk(doc)
// Fallback to OG or Twitter if <title>/description are missing
// fallback to OG/Twitter if missing
if !foundTitle {
if ogTitle != "" {
title = ogTitle
@@ -131,43 +159,7 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
}
}
// If still missing title or desc, fallback to go-readability
if title == "" || desc == "" {
parsedURL, parseErr := url.Parse(pageURL)
if parseErr != nil {
printDebug("Failed to parse URL %s: %v", pageURL, parseErr)
// We must skip if we can't parse the URL for readability
return "", "", ""
}
readResp, readErr := client.Get(pageURL)
if readErr == nil && readResp.StatusCode >= 200 && readResp.StatusCode < 300 {
defer readResp.Body.Close()
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
if rdErr == nil {
// If we still have no title, try from readability
if title == "" && article.Title != "" {
title = article.Title
}
// If we still have no description, try article.Excerpt
if desc == "" && article.Excerpt != "" {
desc = article.Excerpt
} else if desc == "" && len(article.Content) > 0 {
// If excerpt is empty, use a snippet from article.Content
snippet := article.Content
if len(snippet) > 200 {
snippet = snippet[:200] + "..."
}
desc = snippet
}
} else {
printDebug("go-readability failed for %s: %v", pageURL, rdErr)
}
}
}
// Heuristic: discard obviously incorrect HTML-y strings or placeholders
// Heuristic check
if looksLikeRawHTML(title) {
title = ""
}
@@ -175,16 +167,68 @@ func fetchPageMetadata(pageURL string) (string, string, string) {
desc = ""
}
// If after all that we have no title or description, skip
if title == "" || desc == "" {
return "", "", ""
}
return sanitize(title), sanitize(desc), sanitize(keywords)
return title, desc, keywords
}
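A quick in-package illustration of the precedence (sketch only; the expected title assumes the <title> branch, not shown in this hunk, behaves as the doc comment above describes):

const sample = `<html><head>
<title>Parsed Title</title>
<meta property="og:title" content="OG Title">
<meta name="description" content="Meta description">
</head><body></body></html>`

doc, _ := html.Parse(strings.NewReader(sample))
title, desc, _ := extractParsedDOM(doc)
// expected: title == "Parsed Title" (og:title is only a fallback),
//           desc  == "Meta description"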
// looksLikeRawHTML is a simple heuristic to check for leftover HTML or
// go-readability noise (e.g., "readability-page-1").
// fallbackReadability tries go-readability if title/desc is missing.
func fallbackReadability(pageURL, userAgent, title, desc, keywords string) (string, string, string) {
if title != "" && desc != "" {
return title, desc, keywords
}
client := &http.Client{Timeout: 15 * time.Second}
readReq, err := http.NewRequest("GET", pageURL, nil)
if err != nil {
printDebug("Failed to create fallbackReadability request: %v", err)
return title, desc, keywords
}
readReq.Header.Set("User-Agent", userAgent)
readReq.Header.Set("Accept-Language", "en-US,en;q=0.9")
readResp, err := client.Do(readReq)
if err != nil || readResp.StatusCode < 200 || readResp.StatusCode >= 300 {
if err != nil {
printDebug("go-readability GET error for %s: %v", pageURL, err)
}
if readResp != nil {
readResp.Body.Close()
}
return title, desc, keywords
}
defer readResp.Body.Close()
parsedURL, parseErr := url.Parse(pageURL)
if parseErr != nil {
printDebug("Failed to parse URL: %v", parseErr)
return title, desc, keywords
}
article, rdErr := readability.FromReader(readResp.Body, parsedURL)
if rdErr != nil {
printDebug("go-readability error for %s: %v", pageURL, rdErr)
return title, desc, keywords
}
if title == "" && article.Title != "" && !looksLikeRawHTML(article.Title) {
title = article.Title
}
if desc == "" {
if article.Excerpt != "" && !looksLikeRawHTML(article.Excerpt) {
desc = article.Excerpt
} else if len(article.Content) > 0 {
snippet := article.Content
if len(snippet) > 200 {
snippet = snippet[:200] + "..."
}
if !looksLikeRawHTML(snippet) {
desc = snippet
}
}
}
return title, desc, keywords
}
// looksLikeRawHTML is a simple heuristic check for leftover or invalid HTML text
func looksLikeRawHTML(text string) bool {
textLower := strings.ToLower(text)
if strings.Contains(textLower, "readability-page") {
@@ -196,7 +240,7 @@ func looksLikeRawHTML(text string) bool {
return false
}
// sanitize removes pipes and newlines so they don't break our output format.
// sanitize removes pipes/newlines so they don't break our output format.
func sanitize(input string) string {
input = strings.ReplaceAll(input, "|", " ")
input = strings.ReplaceAll(input, "\n", " ")

View file

@@ -35,7 +35,7 @@ func runCrawlerAndIndexer() {
// 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
printErr("Error crawling domains: %v", err)
return
}
@@ -75,18 +75,20 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
return result, scanner.Err()
}
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
// crawlDomainsToFile runs a two-stage async pipeline (see the condensed sketch after this function):
// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
existingEntries := make(map[string]bool)
var mu sync.Mutex // Mutex to protect access to the map
var mu sync.Mutex // For existingEntries + file writes
// read existing entries from outFile if it exists
if _, err := os.Stat(outFile); err == nil {
file, err := os.Open(outFile)
if err != nil {
return fmt.Errorf("unable to open %s: %v", outFile, err)
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
@@ -104,47 +106,109 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concu
}
defer file.Close()
semaphore := make(chan struct{}, concurrentCrawlers)
var wg sync.WaitGroup
// Prepare channels
standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
chromeCh := make(chan [2]string, 1000)
for _, d := range domains {
wg.Add(1)
semaphore <- struct{}{}
go func(domain [2]string) {
defer wg.Done()
defer func() { <-semaphore }()
// 1) Spawn standard workers
var wgStandard sync.WaitGroup
for i := 0; i < config.ConcurrentStandardCrawlers; i++ {
wgStandard.Add(1)
go func() {
defer wgStandard.Done()
for dom := range standardCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
rank := domain[0]
domainName := domain[1]
fullURL := "https://" + domainName
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
// Mark domain existing so we don't re-crawl duplicates
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a standard user agent
userAgent, _ := GetUserAgent("crawler-std")
title, desc, keywords := fetchPageMetadataStandard(fullURL, userAgent)
if title == "" || desc == "" {
// push to chromeCh
chromeCh <- dom
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
return
}
existingEntries[fullURL] = true
mu.Unlock()
title, desc, keywords := fetchPageMetadata(fullURL)
// Skip saving if title or description is missing
if title == "" || desc == "" {
printDebug("Skipping %s: missing title or description", fullURL)
return
}
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL,
title,
keywords,
desc,
rank,
)
file.WriteString(line)
}(d)
}()
}
wg.Wait()
// 2) Spawn chrome workers
var wgChrome sync.WaitGroup
for i := 0; i < config.ConcurrentChromeCrawlers; i++ {
wgChrome.Add(1)
go func() {
defer wgChrome.Done()
for dom := range chromeCh {
rank := dom[0]
domainName := dom[1]
fullURL := "https://" + domainName
// Mark domain existing if not already
mu.Lock()
if domainName == "" || existingEntries[fullURL] {
mu.Unlock()
continue
}
existingEntries[fullURL] = true
mu.Unlock()
// get a chrome user agent
userAgent, _ := GetUserAgent("crawler-chrome")
title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
if title == "" || desc == "" {
printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
continue
}
// write to file
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
fullURL, title, keywords, desc, rank)
mu.Lock()
file.WriteString(line)
mu.Unlock()
}
}()
}
// Feed domains into standardCh
go func() {
for _, dom := range domains {
// optionally, if maxPages is relevant, you can track how many have been processed
standardCh <- dom
}
// close the standardCh once all are queued
close(standardCh)
}()
// Wait for standard workers to finish, then close chromeCh
go func() {
wgStandard.Wait()
close(chromeCh)
}()
// Wait for chrome workers to finish
wgChrome.Wait()
return nil
}
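A condensed, hypothetical sketch of the two-stage worker pattern used above (names are illustrative; the 12/4 worker counts mirror the new defaults): stage-1 workers try the cheap path and forward failures to a smaller pool of expensive stage-2 workers.

package main

import "sync"

// Sketch: fan items out to "fast" workers; anything the fast path cannot handle
// is forwarded to "slow" workers. Closing slowCh only after the fast pool exits
// guarantees there is never a send on a closed channel.
func twoStagePipeline(items []string, fast func(string) bool, slow func(string)) {
	fastCh := make(chan string, len(items))
	slowCh := make(chan string, len(items))

	var wgFast sync.WaitGroup
	for i := 0; i < 12; i++ { // cf. ConcurrentStandardCrawlers
		wgFast.Add(1)
		go func() {
			defer wgFast.Done()
			for it := range fastCh {
				if !fast(it) {
					slowCh <- it // fall back to the expensive path
				}
			}
		}()
	}

	var wgSlow sync.WaitGroup
	for i := 0; i < 4; i++ { // cf. ConcurrentChromeCrawlers
		wgSlow.Add(1)
		go func() {
			defer wgSlow.Done()
			for it := range slowCh {
				slow(it)
			}
		}()
	}

	for _, it := range items {
		fastCh <- it
	}
	close(fastCh)

	wgFast.Wait() // after this, nothing else can write to slowCh
	close(slowCh)
	wgSlow.Wait()
}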

8
go.mod
View file

@@ -41,13 +41,21 @@ require (
github.com/blevesearch/zapx/v14 v14.3.10 // indirect
github.com/blevesearch/zapx/v15 v15.3.17 // indirect
github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b // indirect
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb // indirect
github.com/chromedp/chromedp v0.11.2 // indirect
github.com/chromedp/sysutil v1.1.0 // indirect
github.com/go-ole/go-ole v1.3.0 // indirect
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f // indirect
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/mschoch/smat v0.2.0 // indirect

17
go.sum
View file

@@ -47,6 +47,12 @@ github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b h1:ju9Az5Y
github.com/blevesearch/zapx/v16 v16.1.9-0.20241217210638-a0519e7caf3b/go.mod h1:BlrYNpOu4BvVRslmIG+rLtKhmjIaRhIbG8sb9scGTwI=
github.com/chai2010/webp v1.1.1 h1:jTRmEccAJ4MGrhFOrPMpNGIJ/eybIgwKpcACsrTEapk=
github.com/chai2010/webp v1.1.1/go.mod h1:0XVwvZWdjjdxpUEIf7b9g9VkHFnInUSYujwqTLEuldU=
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb h1:noKVm2SsG4v0Yd0lHNtFYc9EUxIVvrr4kJ6hM8wvIYU=
github.com/chromedp/cdproto v0.0.0-20241022234722-4d5d5faf59fb/go.mod h1:4XqMl3iIW08jtieURWL6Tt5924w21pxirC6th662XUM=
github.com/chromedp/chromedp v0.11.2 h1:ZRHTh7DjbNTlfIv3NFTbB7eVeu5XCNkgrpcGSpn2oX0=
github.com/chromedp/chromedp v0.11.2/go.mod h1:lr8dFRLKsdTTWb75C/Ttol2vnBKOSnt0BW8R9Xaupi8=
github.com/chromedp/sysutil v1.1.0 h1:PUFNv5EcprjqXZD9nJb9b/c9ibAbxiYo4exNWZyipwM=
github.com/chromedp/sysutil v1.1.0/go.mod h1:WiThHUdltqCNKGc4gaU50XgYjwjYIhKWoHGPTUfWTJ8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@@ -57,6 +63,12 @@ github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c h1:wpkoddUomPfHiOziH
github.com/go-shiori/dom v0.0.0-20230515143342-73569d674e1c/go.mod h1:oVDCh3qjJMLVUSILBRwrm+Bc6RNXGZYtoh9xdvf1ffM=
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f h1:cypj7SJh+47G9J3VCPdMzT3uWcXWAWDJA54ErTfOigI=
github.com/go-shiori/go-readability v0.0.0-20241012063810-92284fa8a71f/go.mod h1:YWa00ashoPZMAOElrSn4E1cJErhDVU6PWAll4Hxzn+w=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
github.com/golang/geo v0.0.0-20230421003525-6adc56603217 h1:HKlyj6in2JV6wVkmQ4XmG/EIm+SCYlPZ+V4GWit7Z+I=
@@ -68,10 +80,14 @@ github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/leonelquinteros/gotext v1.7.0 h1:jcJmF4AXqyamP7vuw2MMIKs+O3jAEmvrc5JQiI8Ht/8=
github.com/leonelquinteros/gotext v1.7.0/go.mod h1:qJdoQuERPpccw7L70uoU+K/BvTfRBHYsisCQyFLXyvw=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mattn/go-runewidth v0.0.10/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
@@ -137,6 +153,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=