Indexing update #1
3 changed files with 153 additions and 94 deletions
157
config.go
157
config.go
|
@ -23,35 +23,43 @@ type CacheConfig struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
Port int // Added
|
Port int // Added
|
||||||
AuthCode string // Added
|
AuthCode string // Added
|
||||||
PeerID string // Added
|
PeerID string // Added
|
||||||
Peers []string
|
Peers []string
|
||||||
Domain string // Added
|
Domain string // Added
|
||||||
NodesEnabled bool // Added
|
NodesEnabled bool // Added
|
||||||
CrawlerEnabled bool // Added
|
CrawlerEnabled bool // Added
|
||||||
IndexerEnabled bool // Added
|
IndexerEnabled bool // Added
|
||||||
WebsiteEnabled bool // Added
|
WebsiteEnabled bool // Added
|
||||||
RamCacheEnabled bool
|
RamCacheEnabled bool
|
||||||
DriveCacheEnabled bool // Added
|
DriveCacheEnabled bool // Added
|
||||||
LogLevel int // Added
|
LogLevel int // Added
|
||||||
|
ConcurrentCrawlers int // Number of concurrent crawlers
|
||||||
|
CrawlingInterval time.Duration // Refresh crawled results at this interval
|
||||||
|
MaxPagesPerDomain int // Max pages to crawl per domain
|
||||||
|
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m")
|
||||||
|
|
||||||
DriveCache CacheConfig
|
DriveCache CacheConfig
|
||||||
RamCache CacheConfig
|
RamCache CacheConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
var defaultConfig = Config{
|
var defaultConfig = Config{
|
||||||
Port: 5000,
|
Port: 5000,
|
||||||
Domain: "localhost",
|
Domain: "localhost",
|
||||||
Peers: []string{},
|
Peers: []string{},
|
||||||
AuthCode: generateStrongRandomString(64),
|
AuthCode: generateStrongRandomString(64),
|
||||||
NodesEnabled: false,
|
NodesEnabled: false,
|
||||||
CrawlerEnabled: true,
|
CrawlerEnabled: true,
|
||||||
IndexerEnabled: false,
|
IndexerEnabled: false,
|
||||||
WebsiteEnabled: true,
|
WebsiteEnabled: true,
|
||||||
RamCacheEnabled: true,
|
RamCacheEnabled: true,
|
||||||
DriveCacheEnabled: false,
|
DriveCacheEnabled: false,
|
||||||
LogLevel: 1,
|
ConcurrentCrawlers: 5,
|
||||||
|
CrawlingInterval: 24 * time.Hour,
|
||||||
|
MaxPagesPerDomain: 10,
|
||||||
|
IndexRefreshInterval: 2 * time.Minute,
|
||||||
|
LogLevel: 1,
|
||||||
DriveCache: CacheConfig{
|
DriveCache: CacheConfig{
|
||||||
Duration: 48 * time.Hour, // Added
|
Duration: 48 * time.Hour, // Added
|
||||||
Path: "./cache", // Added
|
Path: "./cache", // Added
|
||||||
|
@ -238,8 +246,13 @@ func saveConfig(config Config) {
|
||||||
featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
|
featuresSec.Key("Crawler").SetValue(strconv.FormatBool(config.CrawlerEnabled))
|
||||||
featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
|
featuresSec.Key("Indexer").SetValue(strconv.FormatBool(config.IndexerEnabled))
|
||||||
featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
|
featuresSec.Key("Website").SetValue(strconv.FormatBool(config.WebsiteEnabled))
|
||||||
featuresSec.Key("RamCache").SetValue(strconv.FormatBool(config.RamCacheEnabled))
|
|
||||||
featuresSec.Key("DriveCache").SetValue(strconv.FormatBool(config.DriveCacheEnabled))
|
// Indexer section
|
||||||
|
indexerSec := cfg.Section("Indexer")
|
||||||
|
indexerSec.Key("ConcurrentCrawlers").SetValue(strconv.Itoa(config.ConcurrentCrawlers))
|
||||||
|
indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
|
||||||
|
indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
|
||||||
|
indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
|
||||||
|
|
||||||
// DriveCache section
|
// DriveCache section
|
||||||
driveSec := cfg.Section("DriveCache")
|
driveSec := cfg.Section("DriveCache")
|
||||||
|
@ -266,53 +279,61 @@ func loadConfig() Config {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Server
|
// Server
|
||||||
port, _ := cfg.Section("Server").Key("Port").Int()
|
port := getConfigValue(cfg.Section("Server").Key("Port"), defaultConfig.Port, strconv.Atoi)
|
||||||
domain := cfg.Section("Server").Key("Domain").String()
|
domain := getConfigValueString(cfg.Section("Server").Key("Domain"), defaultConfig.Domain)
|
||||||
logLevel, _ := cfg.Section("Server").Key("LogLevel").Int()
|
logLevel := getConfigValue(cfg.Section("Server").Key("LogLevel"), defaultConfig.LogLevel, strconv.Atoi)
|
||||||
|
|
||||||
// Peers
|
// Peers
|
||||||
authCode := cfg.Section("Peers").Key("AuthCode").String()
|
authCode := getConfigValueString(cfg.Section("Peers").Key("AuthCode"), defaultConfig.AuthCode)
|
||||||
peersStr := cfg.Section("Peers").Key("Peers").String()
|
peers := strings.Split(getConfigValueString(cfg.Section("Peers").Key("Peers"), ""), ",")
|
||||||
peers := strings.Split(peersStr, ",")
|
|
||||||
|
|
||||||
// Features
|
// Features
|
||||||
nodesEnabled, _ := cfg.Section("Features").Key("Nodes").Bool()
|
nodesEnabled := getConfigValueBool(cfg.Section("Features").Key("Nodes"), defaultConfig.NodesEnabled)
|
||||||
crawlerEnabled, _ := cfg.Section("Features").Key("Crawler").Bool()
|
crawlerEnabled := getConfigValueBool(cfg.Section("Features").Key("Crawler"), defaultConfig.CrawlerEnabled)
|
||||||
indexerEnabled, _ := cfg.Section("Features").Key("Indexer").Bool()
|
indexerEnabled := getConfigValueBool(cfg.Section("Features").Key("Indexer"), defaultConfig.IndexerEnabled)
|
||||||
websiteEnabled, _ := cfg.Section("Features").Key("Website").Bool()
|
websiteEnabled := getConfigValueBool(cfg.Section("Features").Key("Website"), defaultConfig.WebsiteEnabled)
|
||||||
ramCacheEnabled, _ := cfg.Section("Features").Key("RamCache").Bool()
|
ramCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("RamCache"), defaultConfig.RamCacheEnabled)
|
||||||
driveCacheEnabled, _ := cfg.Section("Features").Key("DriveCache").Bool()
|
driveCacheEnabled := getConfigValueBool(cfg.Section("Features").Key("DriveCache"), defaultConfig.DriveCacheEnabled)
|
||||||
|
|
||||||
|
// Indexing
|
||||||
|
concurrentCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentCrawlers"), defaultConfig.ConcurrentCrawlers, strconv.Atoi)
|
||||||
|
crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
|
||||||
|
maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
|
||||||
|
indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
|
||||||
|
|
||||||
// DriveCache
|
// DriveCache
|
||||||
driveDuration, _ := time.ParseDuration(cfg.Section("DriveCache").Key("Duration").String())
|
driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
|
||||||
drivePath := cfg.Section("DriveCache").Key("Path").String()
|
drivePath := getConfigValueString(cfg.Section("DriveCache").Key("Path"), defaultConfig.DriveCache.Path)
|
||||||
driveMaxUsage := parseMaxUsageDrive(cfg.Section("DriveCache").Key("MaxUsage").String(), drivePath)
|
driveMaxUsage := parseMaxUsageDrive(getConfigValueString(cfg.Section("DriveCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.DriveCache.MaxUsageBytes)), drivePath)
|
||||||
// maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
|
// maxConcurrentDownloads, _ := cfg.Section("DriveCache").Key("MaxConcurrentDownloads.Thumbnail").Int()
|
||||||
// if maxConcurrentDownloads == 0 {
|
// if maxConcurrentDownloads == 0 {
|
||||||
// maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
|
// maxConcurrentDownloads = defaultConfig.DriveCache.MaxConcurrentThumbnailDownloads
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// RamCache
|
// RamCache
|
||||||
ramDuration, _ := time.ParseDuration(cfg.Section("RamCache").Key("Duration").String())
|
ramDuration := getConfigValue(cfg.Section("RamCache").Key("Duration"), defaultConfig.RamCache.Duration, time.ParseDuration)
|
||||||
ramMaxUsage := parseMaxUsageRam(cfg.Section("RamCache").Key("MaxUsage").String())
|
ramMaxUsage := parseMaxUsageRam(getConfigValueString(cfg.Section("RamCache").Key("MaxUsage"), formatMaxUsage(defaultConfig.RamCache.MaxUsageBytes)))
|
||||||
|
|
||||||
return Config{
|
return Config{
|
||||||
Port: port,
|
Port: port,
|
||||||
Domain: domain,
|
Domain: domain,
|
||||||
LogLevel: logLevel,
|
LogLevel: logLevel,
|
||||||
AuthCode: authCode,
|
AuthCode: authCode,
|
||||||
Peers: peers,
|
Peers: peers,
|
||||||
NodesEnabled: nodesEnabled,
|
NodesEnabled: nodesEnabled,
|
||||||
CrawlerEnabled: crawlerEnabled,
|
CrawlerEnabled: crawlerEnabled,
|
||||||
IndexerEnabled: indexerEnabled,
|
IndexerEnabled: indexerEnabled,
|
||||||
WebsiteEnabled: websiteEnabled,
|
WebsiteEnabled: websiteEnabled,
|
||||||
RamCacheEnabled: ramCacheEnabled,
|
RamCacheEnabled: ramCacheEnabled,
|
||||||
DriveCacheEnabled: driveCacheEnabled,
|
DriveCacheEnabled: driveCacheEnabled,
|
||||||
|
ConcurrentCrawlers: concurrentCrawlers,
|
||||||
|
CrawlingInterval: crawlingInterval,
|
||||||
|
MaxPagesPerDomain: maxPagesPerDomain,
|
||||||
|
IndexRefreshInterval: indexRefreshInterval,
|
||||||
DriveCache: CacheConfig{
|
DriveCache: CacheConfig{
|
||||||
Duration: driveDuration,
|
Duration: driveDuration,
|
||||||
MaxUsageBytes: driveMaxUsage,
|
MaxUsageBytes: driveMaxUsage,
|
||||||
Path: drivePath,
|
Path: drivePath,
|
||||||
// MaxConcurrentThumbnailDownloads: maxConcurrentDownloads,
|
|
||||||
},
|
},
|
||||||
RamCache: CacheConfig{
|
RamCache: CacheConfig{
|
||||||
Duration: ramDuration,
|
Duration: ramDuration,
|
||||||
|
@ -321,6 +342,34 @@ func loadConfig() Config {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getConfigValue retrieves a configuration value or returns a default value from defaultConfig.
|
||||||
|
func getConfigValue[T any](key *ini.Key, defaultValue T, parseFunc func(string) (T, error)) T {
|
||||||
|
if key == nil || key.String() == "" {
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
value, err := parseFunc(key.String())
|
||||||
|
if err != nil {
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
// getConfigValueString retrieves a string value or falls back to the default.
|
||||||
|
func getConfigValueString(key *ini.Key, defaultValue string) string {
|
||||||
|
if key == nil || key.String() == "" {
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
return key.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// getConfigValueBool retrieves a boolean value or falls back to the default.
|
||||||
|
func getConfigValueBool(key *ini.Key, defaultValue bool) bool {
|
||||||
|
if key == nil || key.String() == "" {
|
||||||
|
return defaultValue
|
||||||
|
}
|
||||||
|
return key.MustBool(defaultValue)
|
||||||
|
}
|
||||||
|
|
||||||
// Helper to parse MaxUsage string into bytes
|
// Helper to parse MaxUsage string into bytes
|
||||||
func parseMaxUsageRam(value string) uint64 {
|
func parseMaxUsageRam(value string) uint64 {
|
||||||
const GiB = 1024 * 1024 * 1024
|
const GiB = 1024 * 1024 * 1024
|
||||||
|
|
84
crawler.go
84
crawler.go
|
@ -7,6 +7,7 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
|
@ -18,8 +19,8 @@ func webCrawlerInit() {
|
||||||
// First run immediately
|
// First run immediately
|
||||||
runCrawlerAndIndexer()
|
runCrawlerAndIndexer()
|
||||||
|
|
||||||
// Then every 24h (adjust as needed)
|
// Then run periodically based on CrawlingInterval
|
||||||
ticker := time.NewTicker(24 * time.Hour)
|
ticker := time.NewTicker(config.CrawlingInterval)
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
runCrawlerAndIndexer()
|
runCrawlerAndIndexer()
|
||||||
}
|
}
|
||||||
|
@ -37,16 +38,13 @@ func runCrawlerAndIndexer() {
|
||||||
|
|
||||||
// 2. Crawl each domain and write results to data_to_index.txt
|
// 2. Crawl each domain and write results to data_to_index.txt
|
||||||
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
|
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
|
||||||
if err := crawlDomainsToFile(domains, outFile); err != nil {
|
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain, config.ConcurrentCrawlers); err != nil {
|
||||||
printErr("Error crawling domains: %v", err)
|
printErr("Error crawling domains: %v", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Re-index data_to_index.txt
|
// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
|
||||||
if err := IndexFile(outFile); err != nil {
|
startPeriodicIndexing(outFile, config.IndexRefreshInterval)
|
||||||
printErr("Error indexing data_to_index.txt: %v", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
printDebug("Crawl + index refresh completed.")
|
printDebug("Crawl + index refresh completed.")
|
||||||
}
|
}
|
||||||
|
@ -81,10 +79,11 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
|
// crawlDomainsToFile visits each domain, extracts minimal data, writes results to outFile
|
||||||
func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int, concurrentCrawlers int) error {
|
||||||
// Read existing data_to_index.txt into a map to prevent duplicates
|
|
||||||
existingEntries := make(map[string]bool)
|
existingEntries := make(map[string]bool)
|
||||||
if _, err := os.Stat(outFile); err == nil { // File exists
|
var mu sync.Mutex // Mutex to protect access to the map
|
||||||
|
|
||||||
|
if _, err := os.Stat(outFile); err == nil {
|
||||||
file, err := os.Open(outFile)
|
file, err := os.Open(outFile)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("unable to open %s: %v", outFile, err)
|
return fmt.Errorf("unable to open %s: %v", outFile, err)
|
||||||
|
@ -96,7 +95,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
||||||
line := scanner.Text()
|
line := scanner.Text()
|
||||||
parts := strings.SplitN(line, "|", 5)
|
parts := strings.SplitN(line, "|", 5)
|
||||||
if len(parts) >= 1 {
|
if len(parts) >= 1 {
|
||||||
existingEntries[parts[0]] = true // Mark existing domain
|
existingEntries[parts[0]] = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -108,37 +107,48 @@ func crawlDomainsToFile(domains [][2]string, outFile string) error {
|
||||||
}
|
}
|
||||||
defer file.Close()
|
defer file.Close()
|
||||||
|
|
||||||
|
semaphore := make(chan struct{}, concurrentCrawlers)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
for _, d := range domains {
|
for _, d := range domains {
|
||||||
rank := d[0]
|
wg.Add(1)
|
||||||
domain := d[1]
|
semaphore <- struct{}{}
|
||||||
if domain == "" || existingEntries["https://"+domain] {
|
go func(domain [2]string) {
|
||||||
continue
|
defer wg.Done()
|
||||||
}
|
defer func() { <-semaphore }()
|
||||||
|
|
||||||
fullURL := "https://" + domain
|
rank := domain[0]
|
||||||
title, desc, keywords := fetchPageMetadata(fullURL)
|
domainName := domain[1]
|
||||||
if title == "" {
|
fullURL := "https://" + domainName
|
||||||
title = "Unknown Title"
|
|
||||||
}
|
|
||||||
if desc == "" {
|
|
||||||
desc = "No Description"
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write unique domain to file
|
mu.Lock()
|
||||||
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
|
if domainName == "" || existingEntries[fullURL] {
|
||||||
fullURL,
|
mu.Unlock()
|
||||||
sanitize(title),
|
return
|
||||||
sanitize(keywords),
|
}
|
||||||
sanitize(desc),
|
existingEntries[fullURL] = true
|
||||||
rank,
|
mu.Unlock()
|
||||||
)
|
|
||||||
if _, err := file.WriteString(line); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
existingEntries[fullURL] = true
|
title, desc, keywords := fetchPageMetadata(fullURL)
|
||||||
|
if title == "" {
|
||||||
|
title = "Unknown Title"
|
||||||
|
}
|
||||||
|
if desc == "" {
|
||||||
|
desc = "No Description"
|
||||||
|
}
|
||||||
|
|
||||||
|
line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
|
||||||
|
fullURL,
|
||||||
|
sanitize(title),
|
||||||
|
sanitize(keywords),
|
||||||
|
sanitize(desc),
|
||||||
|
rank,
|
||||||
|
)
|
||||||
|
file.WriteString(line)
|
||||||
|
}(d)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,12 +28,12 @@ var (
|
||||||
bleveIndex bleve.Index
|
bleveIndex bleve.Index
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// startPeriodicIndexing refreshes the index from a file periodically
|
||||||
func startPeriodicIndexing(filePath string, interval time.Duration) {
|
func startPeriodicIndexing(filePath string, interval time.Duration) {
|
||||||
go func() {
|
go func() {
|
||||||
for {
|
for {
|
||||||
printDebug("Refreshing index from %s", filePath)
|
printDebug("Refreshing index from %s", filePath)
|
||||||
err := IndexFile(filePath)
|
if err := IndexFile(filePath); err != nil {
|
||||||
if err != nil {
|
|
||||||
printErr("Failed to refresh index: %v", err)
|
printErr("Failed to refresh index: %v", err)
|
||||||
}
|
}
|
||||||
time.Sleep(interval)
|
time.Sleep(interval)
|
||||||
|
@ -139,7 +139,7 @@ func IndexFile(filePath string) error {
|
||||||
return fmt.Errorf("error reading file: %v", err)
|
return fmt.Errorf("error reading file: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
printDebug("Indexed %d unique domains from %s\n", len(indexedDomains), filePath)
|
printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue