changed indexing buffer to save to ram not to file
parent 918e1823df
commit 61266c461a
4 changed files with 155 additions and 62 deletions
crawler.go (74 changed lines)
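The crawler.go hunks below stop writing pipe-delimited lines to data_to_index.txt and instead hand each crawled page straight to indexDocImmediately(...), which, per the new comment in the diff, feeds the document into Bleve. That helper is defined in one of the other changed files and is not shown here; the following is only a minimal sketch of what direct, in-RAM Bleve indexing of the same five fields could look like. The package-level index idx, the mem-only setup, and the field names are assumptions for illustration, not the project's actual code.

package main

import (
	"github.com/blevesearch/bleve/v2"
)

// idx is assumed to be opened once at startup. A mem-only index keeps all
// indexed documents in RAM, which matches the intent of this commit.
var idx bleve.Index

func openIndex() error {
	var err error
	idx, err = bleve.NewMemOnly(bleve.NewIndexMapping())
	return err
}

// indexDocImmediately is a hypothetical stand-in for the helper this diff calls.
func indexDocImmediately(link, title, tags, desc, rank string) error {
	doc := map[string]interface{}{
		"title":       title,
		"keywords":    tags,
		"description": desc,
		"rank":        rank,
	}
	// Bleve's Index(id, data) analyzes and stores the document under the URL as its ID.
	return idx.Index(link, doc)
}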
@@ -2,7 +2,6 @@ package main
 import (
 	"bufio"
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
@@ -45,14 +44,20 @@ func runCrawlerAndIndexer() {
 	}
 
 	// 2. Crawl each domain and write results to data_to_index.txt
-	outFile := filepath.Join(config.DriveCache.Path, "data_to_index.txt")
-	if err := crawlDomainsToFile(domains, outFile, config.MaxPagesPerDomain); err != nil {
+	if err := crawlDomainsToFile(domains, config.MaxPagesPerDomain); err != nil {
 		printErr("Error crawling domains: %v", err)
 		return
 	}
 
-	// 3. Re-index data_to_index.txt periodically based on IndexRefreshInterval
-	startPeriodicIndexing(outFile, config.IndexRefreshInterval)
+	// After finishing crawling, flush any pending visited-urls
+	if visitedStore != nil {
+		if err := visitedStore.Flush(); err != nil {
+			printErr("Failed to flush visitedStore: %v", err)
+		}
+	}
+
+	// 3. Re-index data_to_index.txt based on IndexRefreshInterval
+	//startPeriodicIndexing(outFile, config.IndexRefreshInterval)
 
 	printDebug("Crawl + index refresh completed.")
 }
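With documents now indexed as soon as they are crawled, the periodic re-index of data_to_index.txt is disabled above rather than removed. startPeriodicIndexing itself is not part of this diff; a loop of roughly this shape is what the commented-out call would have kept running. This is a sketch only: the reindex callback, the duration parameter, and the use of the standard library's time and log packages are assumptions.

import (
	"log"
	"time"
)

// startPeriodicIndexingSketch shows the general shape of a ticker-driven
// re-index loop; it is not the project's implementation.
func startPeriodicIndexingSketch(outFile string, interval time.Duration, reindex func(string) error) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for range ticker.C {
			if err := reindex(outFile); err != nil {
				log.Printf("re-index of %s failed: %v", outFile, err)
			}
		}
	}()
}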
@@ -89,16 +94,10 @@ func readDomainsCSV(csvPath string) ([][2]string, error) {
 // crawlDomainsToFile does an async pipeline:
 // 1. "standard" goroutines read from standardCh -> attempt standard extraction -> if fails, push to chromeCh
 // 2. "chrome" goroutines read from chromeCh -> attempt chromedp extraction -> if fails, skip
-func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error {
-
-	var mu sync.Mutex
-
-	// Open file for writing (truncate if existing)
-	file, err := os.OpenFile(outFile, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
-	if err != nil {
-		return fmt.Errorf("unable to open %s for writing: %v", outFile, err)
-	}
-	defer file.Close()
+//
+// Now, instead of writing to a file, we directly index each result into Bleve via indexDocImmediately(...).
+func crawlDomainsToFile(domains [][2]string, maxPages int) error {
+	var mu sync.Mutex // Used if needed to protect shared data. (Here mainly for visitedStore.)
 
 	// Prepare channels
 	standardCh := make(chan [2]string, 1000) // buffered channels help avoid blocking
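The comment block kept at the top of crawlDomainsToFile is the clearest description of the crawl pipeline: a buffered standardCh feeds cheap extraction workers, and anything they cannot handle is pushed to chromeCh for the chromedp workers. The later hunks only show fragments of that flow, so here is a condensed skeleton of the same wiring and shutdown order (queue domains, close standardCh, wait, close chromeCh, wait). The worker count and the two try* helpers are placeholders, not the project's code; only sync from the standard library is used.

import "sync"

// pipelineSketch is a skeleton of the two-stage worker pipeline described by the comments above.
func pipelineSketch(domains [][2]string, workers int) {
	standardCh := make(chan [2]string, 1000)
	chromeCh := make(chan [2]string, 1000)

	var wgStandard, wgChrome sync.WaitGroup

	// Stage 1: cheap standard extraction; failures fall through to chromeCh.
	for i := 0; i < workers; i++ {
		wgStandard.Add(1)
		go func() {
			defer wgStandard.Done()
			for dom := range standardCh {
				if !tryStandardExtract(dom) {
					chromeCh <- dom
				}
			}
		}()
	}

	// Stage 2: heavier chromedp extraction for whatever stage 1 rejected.
	for i := 0; i < workers; i++ {
		wgChrome.Add(1)
		go func() {
			defer wgChrome.Done()
			for dom := range chromeCh {
				tryChromeExtract(dom)
			}
		}()
	}

	// Feed the pipeline, then close each stage in order so workers drain and exit.
	for _, dom := range domains {
		standardCh <- dom
	}
	close(standardCh)
	wgStandard.Wait()
	close(chromeCh)
	wgChrome.Wait()
}

// Placeholder extractors standing in for the real fetchPageMetadata* helpers.
func tryStandardExtract(dom [2]string) bool { return false }
func tryChromeExtract(dom [2]string)        {}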
@@ -110,6 +109,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgStandard.Add(1)
 		go func() {
 			defer wgStandard.Done()
+
 			for dom := range standardCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -118,14 +118,17 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			}
 			fullURL := "https://" + domainName
 
-			// 1. Check if we've already visited this URL
+			// Mark visited so we don't re-crawl duplicates
+			mu.Lock()
 			added, err := visitedStore.MarkVisited(fullURL)
+			mu.Unlock()
+
 			if err != nil {
 				printErr("MarkVisited error for %s: %v", fullURL, err)
 				continue
 			}
 			if !added {
-				// Already visited
+				// Already visited, skip
 				continue
 			}
 
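The hunk above wraps visitedStore.MarkVisited in the worker-shared mutex, which is what the new comment on var mu refers to. The store itself (MarkVisited, Flush) is defined outside this file; below is a rough sketch of a store with that interface, assuming it keeps visited URLs in memory, only touches disk on Flush, and leaves all locking to its callers as the diff does. Type and field names are hypothetical.

import (
	"fmt"
	"os"
)

// visitedSketch is a hypothetical stand-in for the project's visitedStore.
type visitedSketch struct {
	seen    map[string]bool
	pending []string
	path    string
}

// MarkVisited reports true when the URL is new; callers hold a mutex, so no locking here.
func (s *visitedSketch) MarkVisited(u string) (bool, error) {
	if s.seen[u] {
		return false, nil
	}
	s.seen[u] = true
	s.pending = append(s.pending, u)
	return true, nil
}

// Flush appends the buffered URLs to the backing file and clears the buffer.
func (s *visitedSketch) Flush() error {
	f, err := os.OpenFile(s.path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
	if err != nil {
		return err
	}
	defer f.Close()
	for _, u := range s.pending {
		if _, err := fmt.Fprintln(f, u); err != nil {
			return err
		}
	}
	s.pending = s.pending[:0]
	return nil
}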
@@ -139,13 +142,11 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 				continue
 			}
 
-			// 3. Write to file
-			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-				fullURL, title, keywords, desc, rank)
-
-			mu.Lock()
-			_, _ = file.WriteString(line)
-			mu.Unlock()
+			// 3. Directly index
+			err = indexDocImmediately(fullURL, title, keywords, desc, rank)
+			if err != nil {
+				printErr("Index error for %s: %v", fullURL, err)
+			}
 		}
 	}()
 }
@@ -156,6 +157,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		wgChrome.Add(1)
 		go func() {
 			defer wgChrome.Done()
+
 			for dom := range chromeCh {
 				rank := dom[0]
 				domainName := dom[1]
@@ -164,28 +166,19 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 			}
 			fullURL := "https://" + domainName
 
-			// We already marked it visited in the standard pass
-			// but you may re-check if you prefer:
-			//
-			// added, err := visitedStore.MarkVisited(fullURL)
-			// if err != nil { ... }
-			// if !added { continue }
-
 			// 3. Chromedp fallback extraction
 			userAgent, _ := GetUserAgent("crawler-chrome")
 			title, desc, keywords := fetchPageMetadataChrome(fullURL, userAgent)
 			if title == "" || desc == "" {
-				printWarn("Skipping (Chrome) %s: missing title/desc", fullURL)
+				printWarn("Skipping %s: unable to get title/desc data", fullURL)
 				continue
 			}
 
-			// 4. Write to file
-			line := fmt.Sprintf("%s|%s|%s|%s|%s\n",
-				fullURL, title, keywords, desc, rank)
-
-			mu.Lock()
-			_, _ = file.WriteString(line)
-			mu.Unlock()
+			// 4. Directly index the doc
+			err := indexDocImmediately(fullURL, title, keywords, desc, rank)
+			if err != nil {
+				printErr("Index error for %s: %v", fullURL, err)
+			}
 		}
 	}()
 }
@@ -195,7 +188,6 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 		for _, dom := range domains {
 			standardCh <- dom
 		}
-		// close the standardCh once all are queued
 		close(standardCh)
 	}()
 
@@ -208,7 +200,7 @@ func crawlDomainsToFile(domains [][2]string, outFile string, maxPages int) error
 	// Wait for chrome workers to finish
 	wgChrome.Wait()
 
-	// Optionally flush the visited store once more
+	// Flush visitedStore
 	if visitedStore != nil {
 		if err := visitedStore.Flush(); err != nil {
 			printErr("visitedStore flush error: %v", err)