Changed indexing buffer to save to RAM instead of to a file

This commit is contained in:
partisan 2025-01-02 12:55:44 +01:00
parent 918e1823df
commit 61266c461a
4 changed files with 155 additions and 62 deletions

View file

@ -39,7 +39,7 @@ type Config struct {
ConcurrentChromeCrawlers int ConcurrentChromeCrawlers int
CrawlingInterval time.Duration // Refresh crawled results in... CrawlingInterval time.Duration // Refresh crawled results in...
MaxPagesPerDomain int // Max pages to crawl per domain MaxPagesPerDomain int // Max pages to crawl per domain
IndexRefreshInterval time.Duration // Interval for periodic index refresh (e.g., "10m") IndexBatchSize int
DriveCache CacheConfig DriveCache CacheConfig
RamCache CacheConfig RamCache CacheConfig
@ -60,7 +60,7 @@ var defaultConfig = Config{
ConcurrentChromeCrawlers: 4, ConcurrentChromeCrawlers: 4,
CrawlingInterval: 24 * time.Hour, CrawlingInterval: 24 * time.Hour,
MaxPagesPerDomain: 10, MaxPagesPerDomain: 10,
IndexRefreshInterval: 2 * time.Minute, IndexBatchSize: 50,
LogLevel: 1, LogLevel: 1,
DriveCache: CacheConfig{ DriveCache: CacheConfig{
Duration: 48 * time.Hour, // Added Duration: 48 * time.Hour, // Added
@ -255,7 +255,7 @@ func saveConfig(config Config) {
indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers)) indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String()) indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain)) indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String()) indexerSec.Key("IndexBatchSize").SetValue(strconv.Itoa(config.IndexBatchSize))
// DriveCache section // DriveCache section
driveSec := cfg.Section("DriveCache") driveSec := cfg.Section("DriveCache")
@ -303,7 +303,7 @@ func loadConfig() Config {
concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi) concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration) crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi) maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration) indexBatchSize := getConfigValue(cfg.Section("Indexer").Key("IndexBatchSize"), defaultConfig.IndexBatchSize, strconv.Atoi)
// DriveCache // DriveCache
driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration) driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
@ -334,7 +334,7 @@ func loadConfig() Config {
ConcurrentChromeCrawlers: concurrentChromeCrawlers, ConcurrentChromeCrawlers: concurrentChromeCrawlers,
CrawlingInterval: crawlingInterval, CrawlingInterval: crawlingInterval,
MaxPagesPerDomain: maxPagesPerDomain, MaxPagesPerDomain: maxPagesPerDomain,
IndexRefreshInterval: indexRefreshInterval, IndexBatchSize: indexBatchSize,
DriveCache: CacheConfig{ DriveCache: CacheConfig{
Duration: driveDuration, Duration: driveDuration,
MaxUsageBytes: driveMaxUsage, MaxUsageBytes: driveMaxUsage,

View file

@ -2,7 +2,6 @@ package main
import ( import (
"bufio" "bufio"
"fmt"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
@ -45,14 +44,20 @@ func runCrawlerAndIndexer() {
} }
// 2. Crawl each domain and write results to data_to_index.txt // 2. Crawl each domain and write results to data_to_index.txt
outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt") if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil {
if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
printErr("Error crawling domains: %v", err) printErr("Error crawling domains: %v", err)
return return
} }
// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval // After finishing crawling, flush any pending visited-urls
startPeriodicIndexing(outFile, config.IndexRefreshInterval) if visitedStore != nil {
if err := visitedStore.Flush(); err != nil {
printErr("Failed to flush visitedStore: %v", err)
}
}
// 3. Re-index data_to_index.txt based on IndexRefreshInterval
//startPeriodicIndexing(outFile, config.IndexRefreshInterval)
printDebug("Crawl + index refresh completed.") printDebug("Crawl + index refresh completed.")
} }
@ -89,16 +94,10 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
// crawlDomainsToFile does an async pipeline: // crawlDomainsToFile does an async pipeline:
// 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
// 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error { //
// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...).
var mu sync.Mutex func crawlDomainsToFile(domains [][2]string, maxPages int) error {
var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.)
// Open file for writing (truncate if existing)
file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
if err != nil {
return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
}
defer file.Close()
// Prepare channels // Prepare channels
standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
@ -110,6 +109,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
wgStandard.Add(1) wgStandard.Add(1)
go func() { go func() {
defer wgStandard.Done() defer wgStandard.Done()
for dom := range standardCh { for dom := range standardCh {
rank := dom[0] rank := dom[0]
domainName := dom[1] domainName := dom[1]
@ -118,14 +118,17 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
} }
fullURL := "https://" + domainName fullURL := "https://" + domainName
// 1. Check if we've already visited this URL // Mark visited so we don't re-crawl duplicates
mu.Lock()
added, err := visitedStore.MarkVisited(fullURL) added, err := visitedStore.MarkVisited(fullURL)
mu.Unlock()
if err != nil { if err != nil {
printErr("MarkVisited error for %s: %v", fullURL, err) printErr("MarkVisited error for %s: %v", fullURL, err)
continue continue
} }
if !added { if !added {
// Already visited // Already visited, skip
continue continue
} }
@ -139,13 +142,11 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
continue continue
} }
// 3. Write to file // 3. Directly index
line := fmt.Sprintf("%s|%s|%s|%s|%s\n", err = indexDocImmediately(fullURL, title, keywords, desc, rank)
fullURL, title, keywords, desc, rank) if err != nil {
printErr("Index error for %s: %v", fullURL, err)
mu.Lock() }
_, _ = file.WriteString(line)
mu.Unlock()
} }
}() }()
} }
@ -156,6 +157,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
wgChrome.Add(1) wgChrome.Add(1)
go func() { go func() {
defer wgChrome.Done() defer wgChrome.Done()
for dom := range chromeCh { for dom := range chromeCh {
rank := dom[0] rank := dom[0]
domainName := dom[1] domainName := dom[1]
@ -164,28 +166,19 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
} }
fullURL := "https://" + domainName fullURL := "https://" + domainName
// We already marked it visited in the standard pass
// but you may re-check if you prefer:
//
// added, err := visitedStore.MarkVisited(fullURL)
// if err != nil { ... }
// if !added { continue }
// 3. Chromedp fallback extraction // 3. Chromedp fallback extraction
userAgent, _ := GetUserAgent("crawler-chrome") userAgent, _ := GetUserAgent("crawler-chrome")
title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent) title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
if title == "" || desc == "" { if title == "" || desc == "" {
printWarn("Skipping (Chrome) %s: missing title/desc", fullURL) printWarn("Skipping %s: unable to get title/desc data", fullURL)
continue continue
} }
// 4. Write to file // 4. Directly index the doc
line := fmt.Sprintf("%s|%s|%s|%s|%s\n", err := indexDocImmediately(fullURL, title, keywords, desc, rank)
fullURL, title, keywords, desc, rank) if err != nil {
printErr("Index error for %s: %v", fullURL, err)
mu.Lock() }
_, _ = file.WriteString(line)
mu.Unlock()
} }
}() }()
} }
@ -195,7 +188,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
for _, dom := range domains { for _, dom := range domains {
standardCh <- dom standardCh <- dom
} }
// close the standardCh once all are queued
close(standardCh) close(standardCh)
}() }()
@ -208,7 +200,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
// Wait for chrome workers to finish // Wait for chrome workers to finish
wgChrome.Wait() wgChrome.Wait()
// Optionally flush the visited store once more // Flush visitedStore
if visitedStore != nil { if visitedStore != nil {
if err := visitedStore.Flush(); err != nil { if err := visitedStore.Flush(); err != nil {
printErr("visitedStore flush error: %v", err) printErr("visitedStore flush error: %v", err)

View file

@ -8,6 +8,7 @@ import (
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"sync"
"time" "time"
"github.com/blevesearch/bleve/v2" "github.com/blevesearch/bleve/v2"
@ -27,21 +28,122 @@ type Document struct {
var ( var (
// Global Bleve index handle // Global Bleve index handle
bleveIndex bleve.Index bleveIndex bleve.Index
docBuffer []Document
docBufferMu sync.Mutex
) )
// startPeriodicIndexing refreshes the index from a file periodically // // startPeriodicIndexing refreshes the index from a file periodically
func startPeriodicIndexing(filePath string, interval time.Duration) { // func startPeriodicIndexing(filePath string, interval time.Duration) {
go func() { // go func() {
for { // for {
printDebug("Refreshing index from %s", filePath) // printDebug("Refreshing index from %s", filePath)
if err := IndexFile(filePath); err != nil { // if err := IndexFile(filePath); err != nil {
printErr("Failed to refresh index: %v", err) // printErr("Failed to refresh index: %v", err)
// }
// time.Sleep(interval)
// }
// }()
// }
// indexDocImmediately indexes a single document into the Bleve index.
func indexDocImmediately(link, title, tags, desc, rank string) error {
pop, _ := strconv.ParseInt(rank, 10, 64)
normalized := normalizeDomain(link)
doc := Document{
ID: normalized,
Link: link,
Title: title,
Tags: tags,
Description: desc,
Popularity: pop,
} }
time.Sleep(interval)
// Insert directly into the Bleve index
err := bleveIndex.Index(doc.ID, map[string]interface{}{
"title": doc.Title,
"description": doc.Description,
"link": doc.Link,
"tags": doc.Tags,
"popularity": doc.Popularity,
})
if err != nil {
return fmt.Errorf("failed to index doc %s: %v", link, err)
}
return nil
}
// StartBatchIndexing spawns a goroutine that flushes the buffer every interval.
func StartBatchIndexing() {
go func() {
ticker := time.NewTicker(config.IndexRefreshInterval)
defer ticker.Stop()
for range ticker.C {
flushDocBuffer()
} }
}() }()
} }
// flushDocBuffer commits every document currently queued in docBuffer to the
// Bleve index as a single batch and leaves the buffer empty.
//
// The queued documents are detached from docBuffer while docBufferMu is held,
// and the lock is released before any Bleve call is made, so callers that
// append to docBuffer are not stalled for the duration of the index commit.
//
// Failures are logged via printErr and never returned: a document that cannot
// be added to the batch is skipped, and if the batch commit itself fails the
// detached documents are dropped (they are not re-queued). If bleveIndex is
// nil, nothing is detached and the documents stay queued.
func flushDocBuffer() {
	docBufferMu.Lock()
	if len(docBuffer) == 0 {
		docBufferMu.Unlock()
		return
	}
	if bleveIndex == nil {
		// Calling a method on a nil index would panic; keep the documents
		// queued and report the problem instead.
		docBufferMu.Unlock()
		printErr("cannot flush doc buffer: index is not initialized")
		return
	}
	// Take ownership of the queued documents. docBuffer is reset to nil rather
	// than docBuffer[:0] because pending still refers to the old backing array.
	pending := docBuffer
	docBuffer = nil
	docBufferMu.Unlock()

	batch := bleveIndex.NewBatch()
	for i := range pending {
		doc := &pending[i]
		err := batch.Index(doc.ID, map[string]interface{}{
			"title":       doc.Title,
			"description": doc.Description,
			"link":        doc.Link,
			"tags":        doc.Tags,
			"popularity":  doc.Popularity,
		})
		if err != nil {
			printErr("batch index error for %s: %v", doc.Link, err)
		}
	}

	// Attempt to commit the batch
	if err := bleveIndex.Batch(batch); err != nil {
		printErr("error committing batch: %v", err)
	}
}
// indexDocBatch queues a single document into memory, which gets flushed by the ticker.
func indexDocBatch(link, title, tags, desc, rank string) error {
pop, _ := strconv.ParseInt(rank, 10, 64)
normalized := normalizeDomain(link)
doc := Document{
ID: normalized,
Link: link,
Title: title,
Tags: tags,
Description: desc,
Popularity: pop,
}
docBufferMu.Lock()
docBuffer = append(docBuffer, doc)
// Optional: if we exceed config.IndexBatchSize, flush immediately
if len(docBuffer) >= config.IndexBatchSize {
go func() {
// flush in a separate goroutine to avoid blocking
flushDocBuffer()
}()
}
docBufferMu.Unlock()
return nil
}
// InitIndex ensures that the Bleve index is created or opened. // InitIndex ensures that the Bleve index is created or opened.
func InitIndex() error { func InitIndex() error {
idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve")) idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve"))

13
init.go
View file

@ -3,8 +3,6 @@ package main
import ( import (
"flag" "flag"
"os" "os"
"path/filepath"
"time"
) )
var config Config var config Config
@ -109,16 +107,17 @@ func main() {
return return
} }
webCrawlerInit()
err := InitIndex() err := InitIndex()
if err != nil { if err != nil {
printErr("Failed to initialize index:", err) printErr("Failed to initialize index:", err)
} }
// Start periodic indexing (every 2 minutes) webCrawlerInit()
dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
startPeriodicIndexing(dataFilePath, 2*time.Minute) // No longer needed as crawled data are indexed imidietly
// // Start periodic indexing (every 2 minutes)
// dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
// startPeriodicIndexing(dataFilePath, 2*time.Minute)
printInfo("Indexer is enabled.") printInfo("Indexer is enabled.")
} else { } else {