Indexing update #1

Merged
partisan merged 9 commits from indexing into main 2025-01-05 21:17:34 +00:00
3 changed files with 153 additions and 94 deletions
Showing only changes of commit 13e1d6119b

config.go

@@ -23,35 +23,43 @@ type CacheConfig struct {
 }

 type Config struct {
     Port              int      // Added
     AuthCode          string   // Added
     PeerID            string   // Added
     Peers             []string
     Domain            string   // Added
     NodesEnabled      bool     // Added
     CrawlerEnabled    bool     // Added
     IndexerEnabled    bool     // Added
     WebsiteEnabled    bool     // Added
     RamCacheEnabled   bool
     DriveCacheEnabled bool     // Added
     LogLevel          int      // Added
+    ConcurrentCrawlers   int           // Number of concurrent crawlers
+    CrawlingInterval     time.Duration // Refresh crawled results in...
+    MaxPagesPerDomain    int           // Max pages to crawl per domain
+    IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
     DriveCache CacheConfig
     RamCache   CacheConfig
 }

 var defaultConfig = Config{
     Port:              5000,
     Domain:            "localhost",
     Peers:             []string{},
     AuthCode:          generateStrongRandomString(64),
     NodesEnabled:      false,
     CrawlerEnabled:    true,
     IndexerEnabled:    false,
     WebsiteEnabled:    true,
     RamCacheEnabled:   true,
     DriveCacheEnabled: false,
-    LogLevel:          1,
+    ConcurrentCrawlers:   5,
+    CrawlingInterval:     24 * time.Hour,
+    MaxPagesPerDomain:    10,
+    IndexRefreshInterval: 2 * time.Minute,
+    LogLevel:             1,
     DriveCache: CacheConfig{
         Duration: 48 * time.Hour, // Added
         Path:     "./cache",      // Added
@@ -238,8 +246,13 @@ func saveConfig(config Config) {
     featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
     featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
     featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
     featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled))
     featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled))

+    // Indexer section
+    indexerSec := cfg.Section("Indexer")
+    indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
+    indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
+    indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
+    indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
+
     // DriveCache section
     driveSec := cfg.Section("DriveCache")
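With the defaults defined above, the Indexer section that saveConfig writes out would look roughly like this (a sketch only: exact spacing and key order depend on the ini library, and the duration values are whatever time.Duration.String() produces):

[Indexer]
ConcurrentCrawlers   = 5
CrawlingInterval     = 24h0m0s
MaxPagesPerDomain    = 10
IndexRefreshInterval = 2m0s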
@@ -266,53 +279,61 @@ func loadConfig() Config {
     }

     // Server
-    port, _ := cfg.Section("Server").Key("Port").Int()
-    domain := cfg.Section("Server").Key("Domain").String()
-    logLevel, _ := cfg.Section("Server").Key("LogLevel").Int()
+    port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi)
+    domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain)
+    logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi)

     // Peers
-    authCode := cfg.Section("Peers").Key("AuthCode").String()
-    peersStr := cfg.Section("Peers").Key("Peers").String()
-    peers := strings.Split(peersStr, ",")
+    authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode)
+    peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",")

     // Features
-    nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool()
-    crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool()
-    indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool()
-    websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool()
-    ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool()
-    driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool()
+    nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled)
+    crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled)
+    indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled)
+    websiteEnabled := getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled)
+    ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled)
+    driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)

+    // Indexing
+    concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
+    crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
+    maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
+    indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)

     // DriveCache
-    driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String())
-    drivePath := cfg.Section("DriveCache").Key("Path").String()
-    driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath)
+    driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
+    drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path)
+    driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath)
     // maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
     // if maxConcurrentDownloads == 0 {
     //     maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
     // }

     // RamCache
-    ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String())
-    ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String())
+    ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration)
+    ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))

     return Config{
         Port:              port,
         Domain:            domain,
         LogLevel:          logLevel,
         AuthCode:          authCode,
         Peers:             peers,
         NodesEnabled:      nodesEnabled,
         CrawlerEnabled:    crawlerEnabled,
         IndexerEnabled:    indexerEnabled,
         WebsiteEnabled:    websiteEnabled,
         RamCacheEnabled:   ramCacheEnabled,
         DriveCacheEnabled: driveCacheEnabled,
+        ConcurrentCrawlers:   concurrentCrawlers,
+        CrawlingInterval:     crawlingInterval,
+        MaxPagesPerDomain:    maxPagesPerDomain,
+        IndexRefreshInterval: indexRefreshInterval,
         DriveCache: CacheConfig{
             Duration:      driveDuration,
             MaxUsageBytes: driveMaxUsage,
             Path:          drivePath,
             // MaxConcurrentThumbnailDownloads: maxConcurrentDownloads,
         },
         RamCache: CacheConfig{
             Duration: ramDuration,
@@ -321,6 +342,34 @@ func loadConfig() Config {
     }
 }

+// getConfigValue retrieves a configuration value or returns a default value from defaultConfig.
+func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    value, err := parseFunc(key.String())
+    if err != nil {
+        return defaultValue
+    }
+    return value
+}
+
+// getConfigValueString retrieves a string value or falls back to the default.
+func getConfigValueString(key *ini.Key, defaultValue string) string {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    return key.String()
+}
+
+// getConfigValueBool retrieves a boolean value or falls back to the default.
+func getConfigValueBool(key *ini.Key, defaultValue bool) bool {
+    if key == nil || key.String() == "" {
+        return defaultValue
+    }
+    return key.MustBool(defaultValue)
+}
+
 // Helper to parse MaxUsage string into bytes
 func parseMaxUsageRam(value string) uint64 {
     const GiB = 1024 * 1024 * 1024
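The generic getConfigValue helper above accepts any parser shaped like func(string) (T, error), so strconv.Atoi and time.ParseDuration can be passed in directly, and other parsers only need a thin wrapper. A usage sketch, assuming config.go's existing imports (ini, strconv, time); the FetchTimeout and SampleRate keys are hypothetical and only for illustration:

// Sketch: missing or malformed values fall back to the supplied default.
cfg := ini.Empty()
cfg.Section("Indexer").Key("FetchTimeout").SetValue("90s")

fetchTimeout := getConfigValue(cfg.Section("Indexer").Key("FetchTimeout"),
    10*time.Second, time.ParseDuration) // parses to 90s

// SampleRate is absent, so the default 1.0 is returned; strconv.ParseFloat is
// wrapped to match the func(string) (T, error) shape.
sampleRate := getConfigValue(cfg.Section("Indexer").Key("SampleRate"), 1.0,
    func(s string) (float64, error) { return strconv.ParseFloat(s, 64) })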

View file

@@ -7,6 +7,7 @@ import (
     "os"
     "path/filepath"
     "strings"
+    "sync"
     "time"

     "golang.org/x/net/html"
@@ -18,8 +19,8 @@ func webCrawlerInit() {
         // First run immediately
         runCrawlerAndIndexer()

-        // Then every 24h (adjust as needed)
-        ticker := time.NewTicker(24 * time.Hour)
+        // Then run periodically based on CrawlingInterval
+        ticker := time.NewTicker(config.CrawlingInterval)
         for range ticker.C {
             runCrawlerAndIndexer()
         }
@@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
     // 2. Crawl each domain and write results to data_to_index.txt
     outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-    if err := crawlDomainsToFile(domains, outFile); err != nil {
+    if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
         printErr("Error crawling domains: %v", err)
         return
     }

-    // 3. Re-index data_to_index.txt
-    if err := IndexFile(outFile); err != nil {
-        printErr("Error indexing data_to_index.txt: %v", err)
-        return
-    }
+    // 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
+    startPeriodicIndexing(outFile, config.IndexRefreshInterval)

     printDebug("Crawl + index refresh completed.")
 }
@@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 }

 // crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
-func crawlDomainsToFile(domains [][2]string, outFile string) error {
-    // Read existing data_to_index.txt into a map to prevent duplicates
+func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
     existingEntries := make(map[string]bool)
-    if _, err := os.Stat(outFile); err == nil { // File exists
+    var mu sync.Mutex // Mutex to protect access to the map
+    if _, err := os.Stat(outFile); err == nil {
         file, err := os.Open(outFile)
         if err != nil {
             return fmt.Errorf("unable to open %s: %v", outFile, err)
@@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
             line := scanner.Text()
             parts := strings.SplitN(line, "|", 5)
             if len(parts) >= 1 {
-                existingEntries[parts[0]] = true // Mark existing domain
+                existingEntries[parts[0]] = true
             }
         }
     }
@@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
     }
     defer file.Close()

+    semaphore := make(chan struct{}, concurrentCrawlers)
+    var wg sync.WaitGroup
+
     for _, d := range domains {
-        rank := d[0]
-        domain := d[1]
-        if domain == "" || existingEntries["https://"+domain] {
-            continue
-        }
-        fullURL := "https://" + domain
-        title, desc, keywords := fetchPageMetadata(fullURL)
-        if title == "" {
-            title = "Unknown Title"
-        }
-        if desc == "" {
-            desc = "No Description"
-        }
-
-        // Write unique domain to file
-        line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-            fullURL,
-            sanitize(title),
-            sanitize(keywords),
-            sanitize(desc),
-            rank,
-        )
-        if _, err := file.WriteString(line); err != nil {
-            return err
-        }
-        existingEntries[fullURL] = true
+        wg.Add(1)
+        semaphore <- struct{}{}
+        go func(domain [2]string) {
+            defer wg.Done()
+            defer func() { <-semaphore }()
+
+            rank := domain[0]
+            domainName := domain[1]
+            fullURL := "https://" + domainName
+
+            mu.Lock()
+            if domainName == "" || existingEntries[fullURL] {
+                mu.Unlock()
+                return
+            }
+            existingEntries[fullURL] = true
+            mu.Unlock()
+
+            title, desc, keywords := fetchPageMetadata(fullURL)
+            if title == "" {
+                title = "Unknown Title"
+            }
+            if desc == "" {
+                desc = "No Description"
+            }
+
+            line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
+                fullURL,
+                sanitize(title),
+                sanitize(keywords),
+                sanitize(desc),
+                rank,
+            )
+            file.WriteString(line)
+        }(d)
     }

+    wg.Wait()
     return nil
 }
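The new crawl loop bounds concurrency with a buffered channel used as a counting semaphore (at most concurrentCrawlers goroutines in flight), waits for completion with a sync.WaitGroup, and guards the shared existingEntries map with a mutex. A minimal standalone sketch of the same pattern follows; the names and worker body are illustrative, not taken from this PR. Note the sketch also funnels writes to its shared output through the mutex, which may be worth considering for the shared *os.File in crawlDomainsToFile:

package main

import (
    "fmt"
    "sync"
)

func main() {
    jobs := []string{"a.example", "b.example", "a.example"}

    const maxWorkers = 2
    semaphore := make(chan struct{}, maxWorkers) // at most maxWorkers goroutines at once
    var wg sync.WaitGroup
    var mu sync.Mutex
    seen := make(map[string]bool)

    for _, j := range jobs {
        wg.Add(1)
        semaphore <- struct{}{} // acquire a slot
        go func(job string) {
            defer wg.Done()
            defer func() { <-semaphore }() // release the slot

            mu.Lock()
            if seen[job] {
                mu.Unlock()
                return // skip duplicates, mirroring existingEntries
            }
            seen[job] = true
            mu.Unlock()

            // Simulated work; the real code fetches page metadata here.
            mu.Lock()
            fmt.Println("processed", job)
            mu.Unlock()
        }(j)
    }

    wg.Wait()
}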

View file

@@ -28,12 +28,12 @@ var (
     bleveIndex bleve.Index
 )

+// startPeriodicIndexing refreshes the index from a file periodically
 func startPeriodicIndexing(filePath string, interval time.Duration) {
     go func() {
         for {
             printDebug("Refreshing index from %s", filePath)
-            err := IndexFile(filePath)
-            if err != nil {
+            if err := IndexFile(filePath); err != nil {
                 printErr("Failed to refresh index: %v", err)
             }
             time.Sleep(interval)
@@ -139,7 +139,7 @@ func IndexFile(filePath string) error {
         return fmt.Errorf("error reading file: %v", err)
     }

-    printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath)
+    printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
     return nil
 }
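One consequence of this wiring is that webCrawlerInit re-runs runCrawlerAndIndexer on every CrawlingInterval tick, and each run calls startPeriodicIndexing, which launches another never-ending indexing goroutine. A hedged sketch of one way to guard against that with sync.Once; the wrapper is hypothetical, not part of this PR, and assumes a sync import in this file:

// Hypothetical guard: start the background indexing loop at most once,
// even though runCrawlerAndIndexer is invoked on every crawl tick.
var indexingOnce sync.Once

func startPeriodicIndexingOnce(filePath string, interval time.Duration) {
    indexingOnce.Do(func() {
        startPeriodicIndexing(filePath, interval)
    })
}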