Indexing update #1
3 changed files with 153 additions and 94 deletions
157
config.go
157
config.go
|
@ -23,35 +23,43 @@ type CacheConfig struct {
|
|||
}
|
||||
|
||||
type Config struct {
|
||||
Port int // Added
|
||||
AuthCode string // Added
|
||||
PeerID string // Added
|
||||
Peers []string
|
||||
Domain string // Added
|
||||
NodesEnabled bool // Added
|
||||
CrawlerEnabled bool // Added
|
||||
IndexerEnabled bool // Added
|
||||
WebsiteEnabled bool // Added
|
||||
RamCacheEnabled bool
|
||||
DriveCacheEnabled bool // Added
|
||||
LogLevel int // Added
|
||||
Port int // Added
|
||||
AuthCode string // Added
|
||||
PeerID string // Added
|
||||
Peers []string
|
||||
Domain string // Added
|
||||
NodesEnabled bool // Added
|
||||
CrawlerEnabled bool // Added
|
||||
IndexerEnabled bool // Added
|
||||
WebsiteEnabled bool // Added
|
||||
RamCacheEnabled bool
|
||||
DriveCacheEnabled bool // Added
|
||||
LogLevel int // Added
|
||||
ConcurrentCrawlers int // Number of concurrent crawlers
|
||||
CrawlingInterval time.Duration // Refresh crawled results at this interval
|
||||
MaxPagesPerDomain int // Max pages to crawl per domain
|
||||
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
|
||||
|
||||
DriveCache CacheConfig
|
||||
RamCache CacheConfig
|
||||
}
|
||||
|
||||
var defaultConfig = Config{
|
||||
Port: 5000,
|
||||
Domain: "localhost",
|
||||
Peers: []string{},
|
||||
AuthCode: generateStrongRandomString(64),
|
||||
NodesEnabled: false,
|
||||
CrawlerEnabled: true,
|
||||
IndexerEnabled: false,
|
||||
WebsiteEnabled: true,
|
||||
RamCacheEnabled: true,
|
||||
DriveCacheEnabled: false,
|
||||
LogLevel: 1,
|
||||
Port: 5000,
|
||||
Domain: "localhost",
|
||||
Peers: []string{},
|
||||
AuthCode: generateStrongRandomString(64),
|
||||
NodesEnabled: false,
|
||||
CrawlerEnabled: true,
|
||||
IndexerEnabled: false,
|
||||
WebsiteEnabled: true,
|
||||
RamCacheEnabled: true,
|
||||
DriveCacheEnabled: false,
|
||||
ConcurrentCrawlers: 5,
|
||||
CrawlingInterval: 24 * time.Hour,
|
||||
MaxPagesPerDomain: 10,
|
||||
IndexRefreshInterval: 2 * time.Minute,
|
||||
LogLevel: 1,
|
||||
DriveCache: CacheConfig{
|
||||
Duration: 48 * time.Hour, // Added
|
||||
Path: "./cache", // Added
|
||||
|
@ -238,8 +246,13 @@ func saveConfig(config Config) {
|
|||
featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
|
||||
featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
|
||||
featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
|
||||
featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled))
|
||||
featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled))
|
||||
|
||||
// Indexer section
|
||||
indexerSec := cfg.Section("Indexer")
|
||||
indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
|
||||
indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
|
||||
indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
|
||||
indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
|
||||
|
||||
// DriveCache section
|
||||
driveSec := cfg.Section("DriveCache")
|
||||
|
@ -266,53 +279,61 @@ func loadConfig() Config {
|
|||
}
|
||||
|
||||
// Server
|
||||
port, _ := cfg.Section("Server").Key("Port").Int()
|
||||
domain := cfg.Section("Server").Key("Domain").String()
|
||||
logLevel, _ := cfg.Section("Server").Key("LogLevel").Int()
|
||||
port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi)
|
||||
domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain)
|
||||
logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi)
|
||||
|
||||
// Peers
|
||||
authCode := cfg.Section("Peers").Key("AuthCode").String()
|
||||
peersStr := cfg.Section("Peers").Key("Peers").String()
|
||||
peers := strings.Split(peersStr, ",")
|
||||
authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode)
|
||||
peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",")
|
||||
|
||||
// Features
|
||||
nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool()
|
||||
crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool()
|
||||
indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool()
|
||||
websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool()
|
||||
ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool()
|
||||
driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool()
|
||||
nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled)
|
||||
crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled)
|
||||
indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled)
|
||||
websiteEnabled := getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled)
|
||||
ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled)
|
||||
driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
|
||||
|
||||
// Indexing
|
||||
concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
|
||||
crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
|
||||
maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
|
||||
indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
|
||||
|
||||
// DriveCache
|
||||
driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String())
|
||||
drivePath := cfg.Section("DriveCache").Key("Path").String()
|
||||
driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath)
|
||||
driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
|
||||
drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path)
|
||||
driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath)
|
||||
// maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
|
||||
// if maxConcurrentDownloads == 0 {
|
||||
// maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
|
||||
// }
|
||||
|
||||
// RamCache
|
||||
ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String())
|
||||
ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String())
|
||||
ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration)
|
||||
ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
|
||||
|
||||
return Config{
|
||||
Port: port,
|
||||
Domain: domain,
|
||||
LogLevel: logLevel,
|
||||
AuthCode: authCode,
|
||||
Peers: peers,
|
||||
NodesEnabled: nodesEnabled,
|
||||
CrawlerEnabled: crawlerEnabled,
|
||||
IndexerEnabled: indexerEnabled,
|
||||
WebsiteEnabled: websiteEnabled,
|
||||
RamCacheEnabled: ramCacheEnabled,
|
||||
DriveCacheEnabled: driveCacheEnabled,
|
||||
Port: port,
|
||||
Domain: domain,
|
||||
LogLevel: logLevel,
|
||||
AuthCode: authCode,
|
||||
Peers: peers,
|
||||
NodesEnabled: nodesEnabled,
|
||||
CrawlerEnabled: crawlerEnabled,
|
||||
IndexerEnabled: indexerEnabled,
|
||||
WebsiteEnabled: websiteEnabled,
|
||||
RamCacheEnabled: ramCacheEnabled,
|
||||
DriveCacheEnabled: driveCacheEnabled,
|
||||
ConcurrentCrawlers: concurrentCrawlers,
|
||||
CrawlingInterval: crawlingInterval,
|
||||
MaxPagesPerDomain: maxPagesPerDomain,
|
||||
IndexRefreshInterval: indexRefreshInterval,
|
||||
DriveCache: CacheConfig{
|
||||
Duration: driveDuration,
|
||||
MaxUsageBytes: driveMaxUsage,
|
||||
Path: drivePath,
|
||||
// MaxConcurrentThumbnailDownloads: maxConcurrentDownloads,
|
||||
},
|
||||
RamCache: CacheConfig{
|
||||
Duration: ramDuration,
|
||||
|
@ -321,6 +342,34 @@ func loadConfig() Config {
|
|||
}
|
||||
}
|
||||
|
||||
// getConfigValue retrieves a configuration value or returns a default value from defaultConfig.
|
||||
func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
|
||||
if key == nil || key.String() == "" {
|
||||
return defaultValue
|
||||
}
|
||||
value, err := parseFunc(key.String())
|
||||
if err != nil {
|
||||
return defaultValue
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
// getConfigValueString retrieves a string value or falls back to the default.
|
||||
func getConfigValueString(key *ini.Key, defaultValue string) string {
|
||||
if key == nil || key.String() == "" {
|
||||
return defaultValue
|
||||
}
|
||||
return key.String()
|
||||
}
|
||||
|
||||
// getConfigValueBool retrieves a boolean value or falls back to the default.
|
||||
func getConfigValueBool(key *ini.Key, defaultValue bool) bool {
|
||||
if key == nil || key.String() == "" {
|
||||
return defaultValue
|
||||
}
|
||||
return key.MustBool(defaultValue)
|
||||
}
|
||||
|
||||
// Helper to parse MaxUsage string into bytes
|
||||
func parseMaxUsageRam(value string) uint64 {
|
||||
const GiB = 1024 * 1024 * 1024
|
||||
|
|
84
crawler.go
84
crawler.go
|
@ -7,6 +7,7 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
@ -18,8 +19,8 @@ func webCrawlerInit() {
|
|||
// First run immediately
|
||||
runCrawlerAndIndexer()
|
||||
|
||||
// Then every 24h (adjust as needed)
|
||||
ticker := time.NewTicker(24 * time.Hour)
|
||||
// Then run periodically based on CrawlingInterval
|
||||
ticker := time.NewTicker(config.CrawlingInterval)
|
||||
for range ticker.C {
|
||||
runCrawlerAndIndexer()
|
||||
}
|
||||
|
@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
|
|||
|
||||
// 2. Crawl each domain and write results to data_to_index.txt
|
||||
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
|
||||
if err := crawlDomainsToFile(domains, outFile); err != nil {
|
||||
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
|
||||
printErr("Error crawling domains: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
// 3. Re-index data_to_index.txt
|
||||
if err := IndexFile(outFile); err != nil {
|
||||
printErr("Error indexing data_to_index.txt: %v", err)
|
||||
return
|
||||
}
|
||||
// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
|
||||
startPeriodicIndexing(outFile, config.IndexRefreshInterval)
|
||||
|
||||
printDebug("Crawl + index refresh completed.")
|
||||
}
|
||||
|
@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
|
|||
}
|
||||
|
||||
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
|
||||
func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
||||
// Read existing data_to_index.txt into a map to prevent duplicates
|
||||
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
|
||||
existingEntries := make(map[string]bool)
|
||||
if _, err := os.Stat(outFile); err == nil { // File exists
|
||||
var mu sync.Mutex // Mutex to protect access to the map
|
||||
|
||||
if _, err := os.Stat(outFile); err == nil {
|
||||
file, err := os.Open(outFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to open %s: %v", outFile, err)
|
||||
|
@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
|||
line := scanner.Text()
|
||||
parts := strings.SplitN(line, "|", 5)
|
||||
if len(parts) >= 1 {
|
||||
existingEntries[parts[0]] = true // Mark existing domain
|
||||
existingEntries[parts[0]] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
|||
}
|
||||
defer file.Close()
|
||||
|
||||
semaphore := make(chan struct{}, concurrentCrawlers)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for _, d := range domains {
|
||||
rank := d[0]
|
||||
domain := d[1]
|
||||
if domain == "" || existingEntries["https://"+domain] {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
semaphore <- struct{}{}
|
||||
go func(domain [2]string) {
|
||||
defer wg.Done()
|
||||
defer func() { <-semaphore }()
|
||||
|
||||
fullURL := "https://" + domain
|
||||
title, desc, keywords := fetchPageMetadata(fullURL)
|
||||
if title == "" {
|
||||
title = "Unknown Title"
|
||||
}
|
||||
if desc == "" {
|
||||
desc = "No Description"
|
||||
}
|
||||
rank := domain[0]
|
||||
domainName := domain[1]
|
||||
fullURL := "https://" + domainName
|
||||
|
||||
// Write unique domain to file
|
||||
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
|
||||
fullURL,
|
||||
sanitize(title),
|
||||
sanitize(keywords),
|
||||
sanitize(desc),
|
||||
rank,
|
||||
)
|
||||
if _, err := file.WriteString(line); err != nil {
|
||||
return err
|
||||
}
|
||||
mu.Lock()
|
||||
if domainName == "" || existingEntries[fullURL] {
|
||||
mu.Unlock()
|
||||
return
|
||||
}
|
||||
existingEntries[fullURL] = true
|
||||
mu.Unlock()
|
||||
|
||||
existingEntries[fullURL] = true
|
||||
title, desc, keywords := fetchPageMetadata(fullURL)
|
||||
if title == "" {
|
||||
title = "Unknown Title"
|
||||
}
|
||||
if desc == "" {
|
||||
desc = "No Description"
|
||||
}
|
||||
|
||||
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
|
||||
fullURL,
|
||||
sanitize(title),
|
||||
sanitize(keywords),
|
||||
sanitize(desc),
|
||||
rank,
|
||||
)
|
||||
file.WriteString(line)
|
||||
}(d)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -28,12 +28,12 @@ var (
|
|||
bleveIndex bleve.Index
|
||||
)
|
||||
|
||||
// startPeriodicIndexing refreshes the index from a file periodically
|
||||
func startPeriodicIndexing(filePath string, interval time.Duration) {
|
||||
go func() {
|
||||
for {
|
||||
printDebug("Refreshing index from %s", filePath)
|
||||
err := IndexFile(filePath)
|
||||
if err != nil {
|
||||
if err := IndexFile(filePath); err != nil {
|
||||
printErr("Failed to refresh index: %v", err)
|
||||
}
|
||||
time.Sleep(interval)
|
||||
|
@ -139,7 +139,7 @@ func IndexFile(filePath string) error {
|
|||
return fmt.Errorf("error reading file: %v", err)
|
||||
}
|
||||
|
||||
printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath)
|
||||
printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue