improved crawler data extraction (added chromedp)

partisan 2025-01-01 14:50:12 +01:00
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions

config.go

@@ -23,43 +23,45 @@ type CacheConfig struct {
 }
 type Config struct {
-	Port int // Added
-	AuthCode string // Added
-	PeerID string // Added
-	Peers []string
-	Domain string // Added
-	NodesEnabled bool // Added
-	CrawlerEnabled bool // Added
-	IndexerEnabled bool // Added
-	WebsiteEnabled bool // Added
-	RamCacheEnabled bool
-	DriveCacheEnabled bool // Added
-	LogLevel int // Added
-	ConcurrentCrawlers int // Number of concurrent crawlers
-	CrawlingInterval time.Duration // Refresh crawled results in...
-	MaxPagesPerDomain int // Max pages to crawl per domain
-	IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
+	Port int // Added
+	AuthCode string // Added
+	PeerID string // Added
+	Peers []string
+	Domain string // Added
+	NodesEnabled bool // Added
+	CrawlerEnabled bool // Added
+	IndexerEnabled bool // Added
+	WebsiteEnabled bool // Added
+	RamCacheEnabled bool
+	DriveCacheEnabled bool // Added
+	LogLevel int // Added
+	ConcurrentStandardCrawlers int
+	ConcurrentChromeCrawlers int
+	CrawlingInterval time.Duration // Refresh crawled results in...
+	MaxPagesPerDomain int // Max pages to crawl per domain
+	IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
 	DriveCache CacheConfig
 	RamCache CacheConfig
 }
 var defaultConfig = Config{
-	Port: 5000,
-	Domain: "localhost",
-	Peers: []string{},
-	AuthCode: generateStrongRandomString(64),
-	NodesEnabled: false,
-	CrawlerEnabled: true,
-	IndexerEnabled: false,
-	WebsiteEnabled: true,
-	RamCacheEnabled: true,
-	DriveCacheEnabled: false,
-	ConcurrentCrawlers: 5,
-	CrawlingInterval: 24 * time.Hour,
-	MaxPagesPerDomain: 10,
-	IndexRefreshInterval: 2 * time.Minute,
-	LogLevel: 1,
+	Port: 5000,
+	Domain: "localhost",
+	Peers: []string{},
+	AuthCode: generateStrongRandomString(64),
+	NodesEnabled: false,
+	CrawlerEnabled: true,
+	IndexerEnabled: false,
+	WebsiteEnabled: true,
+	RamCacheEnabled: true,
+	DriveCacheEnabled: false,
+	ConcurrentStandardCrawlers: 12,
+	ConcurrentChromeCrawlers: 4,
+	CrawlingInterval: 24 * time.Hour,
+	MaxPagesPerDomain: 10,
+	IndexRefreshInterval: 2 * time.Minute,
+	LogLevel: 1,
 	DriveCache: CacheConfig{
 		Duration: 48 * time.Hour, // Added
 		Path: "./cache", // Added
@@ -249,7 +251,8 @@ func saveConfig(config Config) {
 	// Indexer section
 	indexerSec := cfg.Section("Indexer")
-	indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
+	indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
+	indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers))
 	indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
 	indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
 	indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
@@ -296,7 +299,8 @@ func loadConfig() Config {
 	driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
 	// Indexing
-	concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
+	concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
+	concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
 	crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
 	maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
 	indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
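getConfigValue is called here with an ini key, a default, and a parse function, but its definition is not part of this diff. A generic helper with that call shape could look roughly like the sketch below; this is an assumption for illustration only, and the project's actual helper (alongside getConfigValueBool and getConfigValueString) may well be implemented differently.

```go
package config

import "gopkg.in/ini.v1"

// Hypothetical sketch matching the call sites above: return the parsed key
// value, or fall back to the default when the key is missing, empty, or
// fails to parse.
func getConfigValue[T any](key *ini.Key, defaultValue T, parse func(string) (T, error)) T {
	if key == nil || key.String() == "" {
		return defaultValue
	}
	if v, err := parse(key.String()); err == nil {
		return v
	}
	return defaultValue
}
```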
@@ -315,21 +319,22 @@ func loadConfig() Config {
 	ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
 	return Config{
-		Port: port,
-		Domain: domain,
-		LogLevel: logLevel,
-		AuthCode: authCode,
-		Peers: peers,
-		NodesEnabled: nodesEnabled,
-		CrawlerEnabled: crawlerEnabled,
-		IndexerEnabled: indexerEnabled,
-		WebsiteEnabled: websiteEnabled,
-		RamCacheEnabled: ramCacheEnabled,
-		DriveCacheEnabled: driveCacheEnabled,
-		ConcurrentCrawlers: concurrentCrawlers,
-		CrawlingInterval: crawlingInterval,
-		MaxPagesPerDomain: maxPagesPerDomain,
-		IndexRefreshInterval: indexRefreshInterval,
+		Port: port,
+		Domain: domain,
+		LogLevel: logLevel,
+		AuthCode: authCode,
+		Peers: peers,
+		NodesEnabled: nodesEnabled,
+		CrawlerEnabled: crawlerEnabled,
+		IndexerEnabled: indexerEnabled,
+		WebsiteEnabled: websiteEnabled,
+		RamCacheEnabled: ramCacheEnabled,
+		DriveCacheEnabled: driveCacheEnabled,
+		ConcurrentStandardCrawlers: concurrentStandardCrawlers,
+		ConcurrentChromeCrawlers: concurrentChromeCrawlers,
+		CrawlingInterval: crawlingInterval,
+		MaxPagesPerDomain: maxPagesPerDomain,
+		IndexRefreshInterval: indexRefreshInterval,
 		DriveCache: CacheConfig{
 			Duration: driveDuration,
 			MaxUsageBytes: driveMaxUsage,
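config.go only adds the configuration knobs; the chromedp-based extraction named in the commit message lives in the other changed files. For reference, fetching a rendered page with chromedp looks roughly like this minimal, self-contained example; the URL and timeout are placeholders, not values from this commit.

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/chromedp/chromedp"
)

func main() {
	// Fresh headless-Chrome context; each Chrome crawler worker would hold one.
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()

	// Bound the fetch so a stuck page cannot hold a crawler slot forever.
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	var html string
	err := chromedp.Run(ctx,
		chromedp.Navigate("https://example.com"), // placeholder URL
		// Capture the rendered DOM rather than the raw HTTP response body.
		chromedp.OuterHTML("html", &html, chromedp.ByQuery),
	)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("rendered page is %d bytes\n", len(html))
}
```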