diff --git a/config.go b/config.go
index 2e5d805..4ea4eb2 100644
--- a/config.go
+++ b/config.go
@@ -23,35 +23,43 @@ type CacheConfig struct {
 }
 
 type Config struct {
-    Port              int    // Added
-    AuthCode          string // Added
-    PeerID            string // Added
-    Peers             []string
-    Domain            string // Added
-    NodesEnabled      bool   // Added
-    CrawlerEnabled    bool   // Added
-    IndexerEnabled    bool   // Added
-    WebsiteEnabled    bool   // Added
-    RamCacheEnabled   bool
-    DriveCacheEnabled bool // Added
-    LogLevel          int  // Added
+    Port                 int    // Added
+    AuthCode             string // Added
+    PeerID               string // Added
+    Peers                []string
+    Domain               string // Added
+    NodesEnabled         bool   // Added
+    CrawlerEnabled       bool   // Added
+    IndexerEnabled       bool   // Added
+    WebsiteEnabled       bool   // Added
+    RamCacheEnabled      bool
+    DriveCacheEnabled    bool // Added
+    LogLevel             int  // Added
+    ConcurrentCrawlers   int           // Number of concurrent crawlers
+    CrawlingInterval     time.Duration // Refresh crawled results in...
+    MaxPagesPerDomain    int           // Max pages to crawl per domain
+    IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
 
     DriveCache CacheConfig
     RamCache   CacheConfig
 }
 
 var defaultConfig = Config{
-    Port:              5000,
-    Domain:            "localhost",
-    Peers:             []string{},
-    AuthCode:          generateStrongRandomString(64),
-    NodesEnabled:      false,
-    CrawlerEnabled:    true,
-    IndexerEnabled:    false,
-    WebsiteEnabled:    true,
-    RamCacheEnabled:   true,
-    DriveCacheEnabled: false,
-    LogLevel:          1,
+    Port:                 5000,
+    Domain:               "localhost",
+    Peers:                []string{},
+    AuthCode:             generateStrongRandomString(64),
+    NodesEnabled:         false,
+    CrawlerEnabled:       true,
+    IndexerEnabled:       false,
+    WebsiteEnabled:       true,
+    RamCacheEnabled:      true,
+    DriveCacheEnabled:    false,
+    ConcurrentCrawlers:   5,
+    CrawlingInterval:     24 * time.Hour,
+    MaxPagesPerDomain:    10,
+    IndexRefreshInterval: 2 * time.Minute,
+    LogLevel:             1,
     DriveCache: CacheConfig{
         Duration: 48 * time.Hour, // Added
         Path:     "./cache",      // Added
@@ -238,8 +246,13 @@ func saveConfig(config Config) {
     featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
     featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
     featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
-    featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled))
-    featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled))
+
+    // Indexer section
+    indexerSec := cfg.Section("Indexer")
+    indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
+    indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
+    indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
+    indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
 
     // DriveCache section
     driveSec := cfg.Section("DriveCache")
@@ -266,53 +279,61 @@ func loadConfig() Config {
     }
 
     // Server
-    port, _ := cfg.Section("Server").Key("Port").Int()
-    domain := cfg.Section("Server").Key("Domain").String()
-    logLevel, _ := cfg.Section("Server").Key("LogLevel").Int()
+    port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi)
+    domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain)
+    logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi)
 
     // Peers
-    authCode := cfg.Section("Peers").Key("AuthCode").String()
-    peersStr := cfg.Section("Peers").Key("Peers").String()
-    peers := strings.Split(peersStr, ",")
+    authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode)
+    peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",")
 
     // Features
-    nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool()
-    crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool()
-    indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool()
-    websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool()
-    ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool()
-    driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool()
+    nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled)
+    crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled)
+    indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled)
+    websiteEnabled := getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled)
+    ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled)
+    driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
+
+    // Indexing
+    concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
+    crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
+    maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
+    indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
 
     // DriveCache
-    driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String())
-    drivePath := cfg.Section("DriveCache").Key("Path").String()
-    driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath)
+    driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
+    drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path)
+    driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath)
     // maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
     // if maxConcurrentDownloads == 0 {
     //     maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
     // }
 
     // RamCache
-    ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String())
-    ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String())
+    ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration)
+    ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
 
     return Config{
-        Port:              port,
-        Domain:            domain,
-        LogLevel:          logLevel,
-        AuthCode:          authCode,
-        Peers:             peers,
-        NodesEnabled:      nodesEnabled,
-        CrawlerEnabled:    crawlerEnabled,
-        IndexerEnabled:    indexerEnabled,
-        WebsiteEnabled:    websiteEnabled,
-        RamCacheEnabled:   ramCacheEnabled,
-        DriveCacheEnabled: driveCacheEnabled,
+        Port:                 port,
+        Domain:               domain,
+        LogLevel:             logLevel,
+        AuthCode:             authCode,
+        Peers:                peers,
+        NodesEnabled:         nodesEnabled,
+        CrawlerEnabled:       crawlerEnabled,
+        IndexerEnabled:       indexerEnabled,
+        WebsiteEnabled:       websiteEnabled,
+        RamCacheEnabled:      ramCacheEnabled,
+        DriveCacheEnabled:    driveCacheEnabled,
+        ConcurrentCrawlers:   concurrentCrawlers,
+        CrawlingInterval:     crawlingInterval,
+        MaxPagesPerDomain:    maxPagesPerDomain,
+        IndexRefreshInterval: indexRefreshInterval,
         DriveCache: CacheConfig{
             Duration:      driveDuration,
             MaxUsageBytes: driveMaxUsage,
             Path:          drivePath,
-            // MaxConcurrentThumbnailDownloads: maxConcurrentDownloads,
         },
         RamCache: CacheConfig{
             Duration:      ramDuration,
@@ -321,6 +342,34 @@ func loadConfig() Config {
     }
 }
 
+// getConfigValue retrieves a configuration value or returns a default value from defaultConfig.
+func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    value, err := parseFunc(key.String())
+    if err != nil {
+        return defaultValue
+    }
+    return value
+}
+
+// getConfigValueString retrieves a string value or falls back to the default.
+func getConfigValueString(key *ini.Key, defaultValue string) string {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    return key.String()
+}
+
+// getConfigValueBool retrieves a boolean value or falls back to the default.
+func getConfigValueBool(key *ini.Key, defaultValue bool) bool {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    return key.MustBool(defaultValue)
+}
+
 // Helper to parse MaxUsage string into bytes
 func parseMaxUsageRam(value string) uint64 {
     const GiB = 1024 * 1024 * 1024
diff --git a/crawler.go b/crawler.go
index bbe3540..fbb5b5e 100644
--- a/crawler.go
+++ b/crawler.go
@@ -7,6 +7,7 @@ import (
     "os"
     "path/filepath"
     "strings"
+    "sync"
     "time"
 
     "golang.org/x/net/html"
@@ -18,8 +19,8 @@ func webCrawlerInit() {
         // First run immediately
         runCrawlerAndIndexer()
 
-        // Then every 24h (adjust as needed)
-        ticker := time.NewTicker(24 * time.Hour)
+        // Then run periodically based on CrawlingInterval
+        ticker := time.NewTicker(config.CrawlingInterval)
         for range ticker.C {
             runCrawlerAndIndexer()
         }
@@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
 
     // 2. Crawl each domain and write results to data_to_index.txt
     outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-    if err := crawlDomainsToFile(domains, outFile); err != nil {
+    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
         printErr("Error crawling domains: %v", err)
         return
     }
 
-    // 3. Re-index data_to_index.txt
-    if err := IndexFile(outFile); err != nil {
-        printErr("Error indexing data_to_index.txt: %v", err)
-        return
-    }
+    // 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
+    startPeriodicIndexing(outFile, config.IndexRefreshInterval)
 
     printDebug("Crawl + index refresh completed.")
 }
@@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 }
 
 // crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
-func crawlDomainsToFile(domains [][2]string, outFile string) error {
-    // Read existing data_to_index.txt into a map to prevent duplicates
+func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
     existingEntries := make(map[string]bool)
-    if _, err := os.Stat(outFile); err == nil { // File exists
+    var mu sync.Mutex // Mutex to protect access to the map
+
+    if _, err := os.Stat(outFile); err == nil {
         file, err := os.Open(outFile)
         if err != nil {
             return fmt.Errorf("unable to open %s: %v", outFile, err)
@@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
             line := scanner.Text()
             parts := strings.SplitN(line, "|", 5)
             if len(parts) >= 1 {
-                existingEntries[parts[0]] = true // Mark existing domain
+                existingEntries[parts[0]] = true
             }
         }
     }
@@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
     }
     defer file.Close()
 
+    semaphore := make(chan struct{}, concurrentCrawlers)
+    var wg sync.WaitGroup
+
     for _, d := range domains {
-        rank := d[0]
-        domain := d[1]
-        if domain == "" || existingEntries["https://"+domain] {
-            continue
-        }
+        wg.Add(1)
+        semaphore <- struct{}{}
+        go func(domain [2]string) {
+            defer wg.Done()
+            defer func() { <-semaphore }()
 
-        fullURL := "https://" + domain
-        title, desc, keywords := fetchPageMetadata(fullURL)
-        if title == "" {
-            title = "Unknown Title"
-        }
-        if desc == "" {
-            desc = "No Description"
-        }
+            rank := domain[0]
+            domainName := domain[1]
+            fullURL := "https://" + domainName
 
-        // Write unique domain to file
-        line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-            fullURL,
-            sanitize(title),
-            sanitize(keywords),
-            sanitize(desc),
-            rank,
-        )
-        if _, err := file.WriteString(line); err != nil {
-            return err
-        }
+            mu.Lock()
+            if domainName == "" || existingEntries[fullURL] {
+                mu.Unlock()
+                return
+            }
+            existingEntries[fullURL] = true
+            mu.Unlock()
 
-        existingEntries[fullURL] = true
+            title, desc, keywords := fetchPageMetadata(fullURL)
+            if title == "" {
+                title = "Unknown Title"
+            }
+            if desc == "" {
+                desc = "No Description"
+            }
+
+            line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                fullURL,
+                sanitize(title),
+                sanitize(keywords),
+                sanitize(desc),
+                rank,
+            )
+            file.WriteString(line)
+        }(d)
     }
 
+    wg.Wait()
     return nil
 }
diff --git a/indexer.go b/indexer.go
index 66bc100..7963fc1 100644
--- a/indexer.go
+++ b/indexer.go
@@ -28,12 +28,12 @@ var (
     bleveIndex bleve.Index
 )
 
+// startPeriodicIndexing refreshes the index from a file periodically
 func startPeriodicIndexing(filePath string, interval time.Duration) {
     go func() {
         for {
             printDebug("Refreshing index from %s", filePath)
-            err := IndexFile(filePath)
-            if err != nil {
+            if err := IndexFile(filePath); err != nil {
                 printErr("Failed to refresh index: %v", err)
             }
             time.Sleep(interval)
@@ -139,7 +139,7 @@ func IndexFile(filePath string) error {
         return fmt.Errorf("error reading file: %v", err)
     }
 
-    printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath)
+    printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
     return nil
 }
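
Note: the new getConfigValue/getConfigValueString/getConfigValueBool helpers make every setting fall back to defaultConfig when a key is missing or fails to parse. Below is a minimal standalone sketch of that behaviour; the inline [Indexer] data and the main wrapper are illustrative only, not part of the project.

package main

import (
    "fmt"
    "strconv"
    "time"

    "gopkg.in/ini.v1"
)

// Same shape as the helper added to config.go: use the parsed value when the
// key is present and valid, otherwise return the supplied default.
func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
    if key == nil || key.String() == "" {
        return defaultValue
    }
    value, err := parseFunc(key.String())
    if err != nil {
        return defaultValue
    }
    return value
}

func main() {
    // CrawlingInterval is valid, MaxPagesPerDomain is malformed, ConcurrentCrawlers is absent.
    cfg, _ := ini.Load([]byte("[Indexer]\nCrawlingInterval = 12h\nMaxPagesPerDomain = ten\n"))
    sec := cfg.Section("Indexer")

    crawlingInterval := getConfigValue(sec.Key("CrawlingInterval"), 24*time.Hour, time.ParseDuration)
    maxPagesPerDomain := getConfigValue(sec.Key("MaxPagesPerDomain"), 10, strconv.Atoi)  // invalid, falls back
    concurrentCrawlers := getConfigValue(sec.Key("ConcurrentCrawlers"), 5, strconv.Atoi) // missing, falls back

    fmt.Println(crawlingInterval, maxPagesPerDomain, concurrentCrawlers) // 12h0m0s 10 5
}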
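
Note: crawlDomainsToFile now bounds parallelism with a buffered channel used as a counting semaphore plus a sync.WaitGroup, and guards the existingEntries map with a mutex. The following is a self-contained sketch of that pattern; as one possible hardening it also serializes writes to the shared file under the same mutex and checks the write error, which the patched loop currently leaves unchecked. The domain list and output filename are placeholders, not project data.

package main

import (
    "fmt"
    "os"
    "sync"
)

func main() {
    domains := [][2]string{{"1", "example.com"}, {"2", "example.org"}, {"3", "example.com"}}

    out, err := os.Create("data_to_index.txt") // placeholder output path
    if err != nil {
        panic(err)
    }
    defer out.Close()

    const concurrentCrawlers = 2
    semaphore := make(chan struct{}, concurrentCrawlers) // at most N crawls in flight
    var wg sync.WaitGroup
    var mu sync.Mutex // guards both the seen map and the shared file

    seen := make(map[string]bool)

    for _, d := range domains {
        wg.Add(1)
        semaphore <- struct{}{} // acquire a slot before spawning
        go func(d [2]string) {
            defer wg.Done()
            defer func() { <-semaphore }() // release the slot

            rank, name := d[0], d[1]
            fullURL := "https://" + name

            mu.Lock()
            if name == "" || seen[fullURL] {
                mu.Unlock()
                return
            }
            seen[fullURL] = true
            mu.Unlock()

            line := fmt.Sprintf("%s|%s\n", fullURL, rank) // metadata fetch omitted

            mu.Lock()
            _, werr := out.WriteString(line)
            mu.Unlock()
            if werr != nil {
                fmt.Fprintln(os.Stderr, "write failed:", werr)
            }
        }(d)
    }
    wg.Wait()
}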