improved crawler data extraction (added chromedp)
parent 3494457336
commit c71808aa1e
6 changed files with 305 additions and 166 deletions
config.go (101 changed lines)
@@ -23,43 +23,45 @@ type CacheConfig struct {
 }
 
 type Config struct {
-	Port                 int    // Added
-	AuthCode             string // Added
-	PeerID               string // Added
-	Peers                []string
-	Domain               string // Added
-	NodesEnabled         bool   // Added
-	CrawlerEnabled       bool   // Added
-	IndexerEnabled       bool   // Added
-	WebsiteEnabled       bool   // Added
-	RamCacheEnabled      bool
-	DriveCacheEnabled    bool // Added
-	LogLevel             int  // Added
-	ConcurrentCrawlers   int           // Number of concurrent crawlers
-	CrawlingInterval     time.Duration // Refresh crawled results in...
-	MaxPagesPerDomain    int           // Max pages to crawl per domain
-	IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
+	Port                       int    // Added
+	AuthCode                   string // Added
+	PeerID                     string // Added
+	Peers                      []string
+	Domain                     string // Added
+	NodesEnabled               bool   // Added
+	CrawlerEnabled             bool   // Added
+	IndexerEnabled             bool   // Added
+	WebsiteEnabled             bool   // Added
+	RamCacheEnabled            bool
+	DriveCacheEnabled          bool // Added
+	LogLevel                   int  // Added
+	ConcurrentStandardCrawlers int
+	ConcurrentChromeCrawlers   int
+	CrawlingInterval           time.Duration // Refresh crawled results in...
+	MaxPagesPerDomain          int           // Max pages to crawl per domain
+	IndexRefreshInterval       time.Duration // Interval for periodic index refresh (e.g., "10m")
 
 	DriveCache CacheConfig
 	RamCache   CacheConfig
 }
 
 var defaultConfig = Config{
-	Port:                 5000,
-	Domain:               "localhost",
-	Peers:                []string{},
-	AuthCode:             generateStrongRandomString(64),
-	NodesEnabled:         false,
-	CrawlerEnabled:       true,
-	IndexerEnabled:       false,
-	WebsiteEnabled:       true,
-	RamCacheEnabled:      true,
-	DriveCacheEnabled:    false,
-	ConcurrentCrawlers:   5,
-	CrawlingInterval:     24 * time.Hour,
-	MaxPagesPerDomain:    10,
-	IndexRefreshInterval: 2 * time.Minute,
-	LogLevel:             1,
+	Port:                       5000,
+	Domain:                     "localhost",
+	Peers:                      []string{},
+	AuthCode:                   generateStrongRandomString(64),
+	NodesEnabled:               false,
+	CrawlerEnabled:             true,
+	IndexerEnabled:             false,
+	WebsiteEnabled:             true,
+	RamCacheEnabled:            true,
+	DriveCacheEnabled:          false,
+	ConcurrentStandardCrawlers: 12,
+	ConcurrentChromeCrawlers:   4,
+	CrawlingInterval:           24 * time.Hour,
+	MaxPagesPerDomain:          10,
+	IndexRefreshInterval:       2 * time.Minute,
+	LogLevel:                   1,
 	DriveCache: CacheConfig{
 		Duration: 48 * time.Hour, // Added
 		Path:     "./cache",      // Added
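The single ConcurrentCrawlers knob is split into separate standard and Chrome pools, so many cheap plain-HTTP crawlers can run alongside a few expensive headless-Chrome (chromedp) ones. The crawler code itself is in the other changed files, not shown on this page; purely for orientation, a minimal sketch of sizing two worker pools from these settings (only the two default values come from this commit, everything else is hypothetical):

package main

import (
	"fmt"
	"sync"
)

func main() {
	// The two defaults introduced by this commit.
	concurrentStandardCrawlers := 12
	concurrentChromeCrawlers := 4

	stdJobs := make(chan string)
	chromeJobs := make(chan string)
	var wg sync.WaitGroup

	// worker drains one queue; a real crawler would fetch and parse here.
	worker := func(kind string, jobs <-chan string) {
		defer wg.Done()
		for url := range jobs {
			fmt.Println(kind, "crawler got", url)
		}
	}

	for i := 0; i < concurrentStandardCrawlers; i++ {
		wg.Add(1)
		go worker("standard", stdJobs) // cheap plain-HTTP fetches
	}
	for i := 0; i < concurrentChromeCrawlers; i++ {
		wg.Add(1)
		go worker("chrome", chromeJobs) // expensive rendered fetches
	}

	stdJobs <- "https://example.com/static-page"
	chromeJobs <- "https://example.com/js-heavy-page"
	close(stdJobs)
	close(chromeJobs)
	wg.Wait()
}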
@@ -249,7 +251,8 @@ func saveConfig(config Config) {
 
 	// Indexer section
 	indexerSec := cfg.Section("Indexer")
-	indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
+	indexerSec.Key("ConcurrentStandardCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
+	indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentChromeCrawlers))
 	indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
 	indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
 	indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
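With the new defaults, the Indexer section that saveConfig writes through the ini library should come out roughly like this (a sketch; other sections omitted, exact spacing depends on the library's formatting, and the duration strings are what time.Duration.String() produces for 24h and 2m):

[Indexer]
ConcurrentStandardCrawlers = 12
ConcurrentChromeCrawlers = 4
CrawlingInterval = 24h0m0s
MaxPagesPerDomain = 10
IndexRefreshInterval = 2m0s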
@@ -296,7 +299,8 @@ func loadConfig() Config {
 	driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
 
 	// Indexing
-	concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
+	concurrentStandardCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentStandardCrawlers"), defaultConfig.ConcurrentStandardCrawlers, strconv.Atoi)
+	concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
 	crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
 	maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
 	indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
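The getConfigValue helper itself is not part of this hunk. Judging only from the call sites above, it is presumably a generic parse-or-fall-back helper along these lines (a reconstruction consistent with the strconv.Atoi and time.ParseDuration usages, not the project's actual code):

// Hypothetical sketch of getConfigValue, assuming the key type comes
// from gopkg.in/ini.v1. It parses the key's raw string with the given
// parse function and falls back to defaultValue when the key is empty
// or unparsable.
func getConfigValue[T any](key *ini.Key, defaultValue T, parse func(string) (T, error)) T {
	raw := key.String()
	if raw == "" {
		return defaultValue
	}
	value, err := parse(raw)
	if err != nil {
		return defaultValue
	}
	return value
}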
@@ -315,21 +319,22 @@ func loadConfig() Config {
 	ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
 
 	return Config{
-		Port:                 port,
-		Domain:               domain,
-		LogLevel:             logLevel,
-		AuthCode:             authCode,
-		Peers:                peers,
-		NodesEnabled:         nodesEnabled,
-		CrawlerEnabled:       crawlerEnabled,
-		IndexerEnabled:       indexerEnabled,
-		WebsiteEnabled:       websiteEnabled,
-		RamCacheEnabled:      ramCacheEnabled,
-		DriveCacheEnabled:    driveCacheEnabled,
-		ConcurrentCrawlers:   concurrentCrawlers,
-		CrawlingInterval:     crawlingInterval,
-		MaxPagesPerDomain:    maxPagesPerDomain,
-		IndexRefreshInterval: indexRefreshInterval,
+		Port:                       port,
+		Domain:                     domain,
+		LogLevel:                   logLevel,
+		AuthCode:                   authCode,
+		Peers:                      peers,
+		NodesEnabled:               nodesEnabled,
+		CrawlerEnabled:             crawlerEnabled,
+		IndexerEnabled:             indexerEnabled,
+		WebsiteEnabled:             websiteEnabled,
+		RamCacheEnabled:            ramCacheEnabled,
+		DriveCacheEnabled:          driveCacheEnabled,
+		ConcurrentStandardCrawlers: concurrentStandardCrawlers,
+		ConcurrentChromeCrawlers:   concurrentChromeCrawlers,
+		CrawlingInterval:           crawlingInterval,
+		MaxPagesPerDomain:          maxPagesPerDomain,
+		IndexRefreshInterval:       indexRefreshInterval,
 		DriveCache: CacheConfig{
 			Duration:      driveDuration,
 			MaxUsageBytes: driveMaxUsage,
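Per the commit title, data extraction now goes through chromedp, i.e. pages can be rendered in headless Chrome before their content is extracted; the crawler changes themselves are in the changed files not shown on this page. For orientation only, a minimal self-contained chromedp fetch looks like this (assumes github.com/chromedp/chromedp and a local Chrome install; fetchRendered is a made-up name, not the project's function):

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/chromedp/chromedp"
)

// fetchRendered loads a page in headless Chrome and returns the rendered
// HTML, so JavaScript-generated content is visible to the extractor.
func fetchRendered(parent context.Context, url string) (string, error) {
	ctx, cancel := chromedp.NewContext(parent)
	defer cancel()
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	var html string
	err := chromedp.Run(ctx,
		chromedp.Navigate(url),
		chromedp.OuterHTML("html", &html),
	)
	return html, err
}

func main() {
	html, err := fetchRendered(context.Background(), "https://example.com")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Println(len(html), "bytes of rendered HTML")
}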