changed indexing buffer to save to ram not to file
parent 918e1823df
commit 61266c461a

4 changed files with 155 additions and 62 deletions
config.go (10 changes)

@@ -39,7 +39,7 @@ type Config struct {
 	ConcurrentChromeCrawlers int
 	CrawlingInterval         time.Duration // Refres crawled results in...
 	MaxPagesPerDomain        int           // Max pages to crawl per domain
-	IndexRefreshInterval     time.Duration // Interval for periodic index refresh (e.g., "10m")
+	IndexBatchSize           int
 
 	DriveCache CacheConfig
 	RamCache   CacheConfig
@@ -60,7 +60,7 @@ var defaultConfig = Config{
 	ConcurrentChromeCrawlers: 4,
 	CrawlingInterval:         24 * time.Hour,
 	MaxPagesPerDomain:        10,
-	IndexRefreshInterval:     2 * time.Minute,
+	IndexBatchSize:           50,
 	LogLevel:                 1,
 	DriveCache: CacheConfig{
 		Duration: 48 * time.Hour, // Added
@@ -255,7 +255,7 @@ func saveConfig(config Config) {
 	indexerSec.Key("ConcurrentChromeCrawlers").SetValue(strconv.Itoa(config.ConcurrentStandardCrawlers))
 	indexerSec.Key("CrawlingInterval").SetValue(config.CrawlingInterval.String())
 	indexerSec.Key("MaxPagesPerDomain").SetValue(strconv.Itoa(config.MaxPagesPerDomain))
-	indexerSec.Key("IndexRefreshInterval").SetValue(config.IndexRefreshInterval.String())
+	indexerSec.Key("IndexBatchSize").SetValue(strconv.Itoa(config.IndexBatchSize))
 
 	// DriveCache section
 	driveSec := cfg.Section("DriveCache")
@@ -303,7 +303,7 @@ func loadConfig() Config {
 	concurrentChromeCrawlers := getConfigValue(cfg.Section("Indexer").Key("ConcurrentChromeCrawlers"), defaultConfig.ConcurrentChromeCrawlers, strconv.Atoi)
 	crawlingInterval := getConfigValue(cfg.Section("Indexer").Key("CrawlingInterval"), defaultConfig.CrawlingInterval, time.ParseDuration)
 	maxPagesPerDomain := getConfigValue(cfg.Section("Indexer").Key("MaxPagesPerDomain"), defaultConfig.MaxPagesPerDomain, strconv.Atoi)
-	indexRefreshInterval := getConfigValue(cfg.Section("Indexer").Key("IndexRefreshInterval"), defaultConfig.IndexRefreshInterval, time.ParseDuration)
+	indexBatchSize := getConfigValue(cfg.Section("Indexer").Key("IndexBatchSize"), defaultConfig.IndexBatchSize, strconv.Atoi)
 
 	// DriveCache
 	driveDuration := getConfigValue(cfg.Section("DriveCache").Key("Duration"), defaultConfig.DriveCache.Duration, time.ParseDuration)
@@ -334,7 +334,7 @@ func loadConfig() Config {
 		ConcurrentChromeCrawlers: concurrentChromeCrawlers,
 		CrawlingInterval:         crawlingInterval,
 		MaxPagesPerDomain:        maxPagesPerDomain,
-		IndexRefreshInterval:     indexRefreshInterval,
+		IndexBatchSize:           indexBatchSize,
 		DriveCache: CacheConfig{
 			Duration:      driveDuration,
 			MaxUsageBytes: driveMaxUsage,
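Taken together, these hunks drop IndexRefreshInterval from the config struct and introduce IndexBatchSize in its place. Assuming the default values above and the key names written by saveConfig, the Indexer section of the saved INI file would presumably end up roughly like this (a sketch only; keys handled outside this diff are omitted):

    [Indexer]
    CrawlingInterval = 24h0m0s
    MaxPagesPerDomain = 10
    IndexBatchSize = 50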
crawler.go (74 changes)

@@ -2,7 +2,6 @@ package main
 
 import (
 	"bufio"
-	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
@@ -45,14 +44,20 @@ func runCrawlerAndIndexer() {
 	}
 
 	// 2. Crawl each domain and write results to data_to_index.txt
-	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
+	if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil {
 		printErr("Error crawling domains: %v", err)
 		return
 	}
 
-	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
-	startPeriodicIndexing(outFile, config.IndexRefreshInterval)
+	// After finishing crawling, flush any pending visited-urls
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("Failed to flush visitedStore: %v", err)
+		}
+	}
+
+	// 3. Re-index data_to_index.txt based on IndexRefreshInterval
+	//startPeriodicIndexing(outFile, config.IndexRefreshInterval)
 
 	printDebug("Crawl + index refresh completed.")
 }
@@ -89,16 +94,10 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 
 // crawlDomainsToFile does an async pipeline:
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
-func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-	var mu sync.Mutex
-	// Open file for writing (truncate if existing)
-	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
-	if err != nil {
-		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
-	}
-	defer file.Close()
-
+//
+// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...).
+func crawlDomainsToFile(domains [][2]string, maxPages int) error {
+	var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.)
 	// Prepare channels
 	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
@@ -110,6 +109,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgStandard.Add(1)
 		go func() {
 			defer wgStandard.Done()
+
 			for dom := range standardCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -118,14 +118,17 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 				}
 				fullURL := "https://" + domainName
 
-				// 1. Check if we've already visited this URL
+				// Mark visited so we don't re-crawl duplicates
+				mu.Lock()
 				added, err := visitedStore.MarkVisited(fullURL)
+				mu.Unlock()
+
 				if err != nil {
 					printErr("MarkVisited error for %s: %v", fullURL, err)
 					continue
 				}
 				if !added {
-					// Already visited
+					// Already visited, skip
 					continue
 				}
 
@@ -139,13 +142,11 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 					continue
 				}
 
-				// 3. Write to file
-				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-					fullURL, title, keywords, desc, rank)
-
-				mu.Lock()
-				_, _ = file.WriteString(line)
-				mu.Unlock()
+				// 3. Directly index
+				err = indexDocImmediately(fullURL, title, keywords, desc, rank)
+				if err != nil {
+					printErr("Index error for %s: %v", fullURL, err)
+				}
 			}
 		}()
 	}
@@ -156,6 +157,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgChrome.Add(1)
 		go func() {
 			defer wgChrome.Done()
+
 			for dom := range chromeCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -164,28 +166,19 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 				}
 				fullURL := "https://" + domainName
 
-				// We already marked it visited in the standard pass
-				// but you may re-check if you prefer:
-				//
-				// added, err := visitedStore.MarkVisited(fullURL)
-				// if err != nil { ... }
-				// if !added { continue }
-
 				// 3. Chromedp fallback extraction
 				userAgent, _ := GetUserAgent("crawler-chrome")
 				title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
 				if title == "" || desc == "" {
-					printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
+					printWarn("Skipping %s: unable to get title/desc data", fullURL)
 					continue
 				}
 
-				// 4. Write to file
-				line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-					fullURL, title, keywords, desc, rank)
-
-				mu.Lock()
-				_, _ = file.WriteString(line)
-				mu.Unlock()
+				// 4. Directly index the doc
+				err := indexDocImmediately(fullURL, title, keywords, desc, rank)
+				if err != nil {
+					printErr("Index error for %s: %v", fullURL, err)
+				}
 			}
 		}()
 	}
@@ -195,7 +188,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for _, dom := range domains {
 			standardCh <- dom
 		}
-		// close the standardCh once all are queued
 		close(standardCh)
 	}()
 
@@ -208,7 +200,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
-	// Optionally flush the visited store once more
+	// Flush visitedStore
 	if visitedStore != nil {
 		if err := visitedStore.Flush(); err != nil {
 			printErr("visitedStore flush error: %v", err)
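The practical effect in both worker loops: where each result was previously serialized as a pipe-delimited line and appended to data_to_index.txt under the mutex, the same five fields are now handed straight to the indexer, and the mutex only guards visitedStore.MarkVisited. A minimal sketch of the per-page step, using purely hypothetical values (example.com etc.) and the repo's own printErr/indexDocImmediately helpers:

    // Hypothetical crawl result, for illustration only.
    fullURL, title, keywords, desc, rank := "https://example.com", "Example", "demo,site", "An example page", "42"

    // Before: buffered on disk for a later periodic IndexFile() pass.
    //   line := fmt.Sprintf("%s|%s|%s|%s|%s\n", fullURL, title, keywords, desc, rank)
    //   file.WriteString(line)

    // After: indexed into Bleve right away (see indexer.go below).
    if err := indexDocImmediately(fullURL, title, keywords, desc, rank); err != nil {
        printErr("Index error for %s: %v", fullURL, err)
    }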
indexer.go (118 changes)

@@ -8,6 +8,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/blevesearch/bleve/v2"
@@ -27,21 +28,122 @@ type Document struct {
 var (
 	// Global Bleve index handle
 	bleveIndex bleve.Index
+	docBuffer   []Document
+	docBufferMu sync.Mutex
 )
 
-// startPeriodicIndexing refreshes the index from a file periodically
-func startPeriodicIndexing(filePath string, interval time.Duration) {
-	go func() {
-		for {
-			printDebug("Refreshing index from %s", filePath)
-			if err := IndexFile(filePath); err != nil {
-				printErr("Failed to refresh index: %v", err)
-			}
-			time.Sleep(interval)
-		}
-	}()
-}
+// // startPeriodicIndexing refreshes the index from a file periodically
+// func startPeriodicIndexing(filePath string, interval time.Duration) {
+// 	go func() {
+// 		for {
+// 			printDebug("Refreshing index from %s", filePath)
+// 			if err := IndexFile(filePath); err != nil {
+// 				printErr("Failed to refresh index: %v", err)
+// 			}
+// 			time.Sleep(interval)
+// 		}
+// 	}()
+// }
+
+// indexDocImmediately indexes a single document into the Bleve index.
+func indexDocImmediately(link, title, tags, desc, rank string) error {
+	pop, _ := strconv.ParseInt(rank, 10, 64)
+	normalized := normalizeDomain(link)
+
+	doc := Document{
+		ID:          normalized,
+		Link:        link,
+		Title:       title,
+		Tags:        tags,
+		Description: desc,
+		Popularity:  pop,
+	}
+
+	// Insert directly into the Bleve index
+	err := bleveIndex.Index(doc.ID, map[string]interface{}{
+		"title":       doc.Title,
+		"description": doc.Description,
+		"link":        doc.Link,
+		"tags":        doc.Tags,
+		"popularity":  doc.Popularity,
+	})
+	if err != nil {
+		return fmt.Errorf("failed to index doc %s: %v", link, err)
+	}
+	return nil
+}
+
+// StartBatchIndexing spawns a goroutine that flushes the buffer every interval.
+func StartBatchIndexing() {
+	go func() {
+		ticker := time.NewTicker(config.IndexRefreshInterval)
+		defer ticker.Stop()
+
+		for range ticker.C {
+			flushDocBuffer()
+		}
+	}()
+}
+
+func flushDocBuffer() {
+	docBufferMu.Lock()
+	defer docBufferMu.Unlock()
+
+	if len(docBuffer) == 0 {
+		return
+	}
+
+	batch := bleveIndex.NewBatch()
+	for _, doc := range docBuffer {
+		err := batch.Index(doc.ID, map[string]interface{}{
+			"title":       doc.Title,
+			"description": doc.Description,
+			"link":        doc.Link,
+			"tags":        doc.Tags,
+			"popularity":  doc.Popularity,
+		})
+		if err != nil {
+			printErr("batch index error for %s: %v", doc.Link, err)
+		}
+	}
+	// Attempt to commit the batch
+	if err := bleveIndex.Batch(batch); err != nil {
+		printErr("error committing batch: %v", err)
+	}
+
+	// Clear the buffer
+	docBuffer = docBuffer[:0]
+}
+
+// indexDocBatch queues a single document into memory, which gets flushed by the ticker.
+func indexDocBatch(link, title, tags, desc, rank string) error {
+	pop, _ := strconv.ParseInt(rank, 10, 64)
+	normalized := normalizeDomain(link)
+
+	doc := Document{
+		ID:          normalized,
+		Link:        link,
+		Title:       title,
+		Tags:        tags,
+		Description: desc,
+		Popularity:  pop,
+	}
+
+	docBufferMu.Lock()
+	docBuffer = append(docBuffer, doc)
+
+	// Optional: if we exceed config.IndexBatchSize, flush immediately
+	if len(docBuffer) >= config.IndexBatchSize {
+		go func() {
+			// flush in a separate goroutine to avoid blocking
+			flushDocBuffer()
+		}()
+	}
+	docBufferMu.Unlock()
+
+	return nil
+}
 
 // InitIndex ensures that the Bleve index is created or opened.
 func InitIndex() error {
 	idx, err := bleve.Open(filepath.Join(config.DriveCache.Path, "index.bleve"))
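indexer.go now offers two paths into the index: indexDocImmediately writes a single document synchronously (this is what crawler.go calls), while indexDocBatch and StartBatchIndexing implement the in-RAM buffer the commit title refers to, accumulating documents in docBuffer and committing them as a Bleve batch on a ticker or once config.IndexBatchSize entries have queued up. Nothing in this commit calls the batched pair yet, so the wiring below is a hypothetical sketch of how it would presumably be used:

    // At startup, after InitIndex() has opened the Bleve index:
    StartBatchIndexing() // background flush on every ticker tick

    // Per crawled page, queue instead of indexing synchronously:
    if err := indexDocBatch(fullURL, title, keywords, desc, rank); err != nil {
        printErr("queue error for %s: %v", fullURL, err)
    }
    // flushDocBuffer() then commits the queued docs via bleveIndex.Batch(batch).

Two details worth noting: indexDocBatch launches flushDocBuffer in a goroutine while still holding docBufferMu, so an early flush simply blocks on the mutex and runs right after the append is unlocked; and StartBatchIndexing still reads config.IndexRefreshInterval even though config.go above replaces that field with IndexBatchSize, so as far as this diff shows, the batched path depends on a setting the same commit removes.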
init.go (13 changes)

@@ -3,8 +3,6 @@ package main
 import (
 	"flag"
 	"os"
-	"path/filepath"
-	"time"
 )
 
 var config Config
@@ -109,16 +107,17 @@ func main() {
 			return
 		}
 
-		webCrawlerInit()
-
 		err := InitIndex()
 		if err != nil {
 			printErr("Failed to initialize index:", err)
 		}
 
-		// Start periodic indexing (every 2 minutes)
-		dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-		startPeriodicIndexing(dataFilePath, 2*time.Minute)
+		webCrawlerInit()
+
+		// No longer needed as crawled data are indexed imidietly
+		// // Start periodic indexing (every 2 minutes)
+		// dataFilePath := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
+		// startPeriodicIndexing(dataFilePath, 2*time.Minute)
 
 		printInfo("Indexer is enabled.")
 	} else {
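The reordering in main() matches the new flow: crawl workers now write into bleveIndex as they go, so the index presumably has to be opened before webCrawlerInit() starts them. Condensed from the hunk above, the startup order is now:

    err := InitIndex() // open/create index.bleve under config.DriveCache.Path
    if err != nil {
        printErr("Failed to initialize index:", err)
    }
    webCrawlerInit() // crawlers start and call indexDocImmediately(...) directly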