107 lines
2.4 KiB
Go
107 lines
2.4 KiB
Go
|
package main
|
|||
|
|
|||
|
import (
|
|||
|
"bufio"
|
|||
|
"fmt"
|
|||
|
"os"
|
|||
|
"sync"
|
|||
|
)
|
|||
|
|
|||
|
// VisitedStore is a set of visited URLs kept in memory and appended to a file
// in batches, so that the set can be reloaded from that file later.
type VisitedStore struct {
	// Persistence settings.
	filePath  string // file the URLs are loaded from and appended to, one per line
	batchSize int    // number of pending URLs that triggers a write to filePath

	// mu is held by the exported methods while they touch the fields below.
	mu      sync.Mutex
	visited map[string]bool // URLs known to the store; only true values are stored
	toFlush []string        // URLs marked since the last successful write, in order
}
|
|||
|
|
|||
|
// NewVisitedStore creates or loads the visited URLs from filePath.
|
|||
|
func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
|
|||
|
store := &VisitedStore{
|
|||
|
visited: make(map[string]bool),
|
|||
|
filePath: filePath,
|
|||
|
batchSize: batchSize,
|
|||
|
}
|
|||
|
|
|||
|
// Attempt to load existing visited URLs (if file exists).
|
|||
|
if _, err := os.Stat(filePath); err == nil {
|
|||
|
if err := store.loadFromFile(); err != nil {
|
|||
|
return nil, fmt.Errorf("loadFromFile error: %w", err)
|
|||
|
}
|
|||
|
}
|
|||
|
return store, nil
|
|||
|
}
|
|||
|
|
|||
|
// loadFromFile loads visited URLs from the store’s file. One URL per line.
|
|||
|
func (s *VisitedStore) loadFromFile() error {
|
|||
|
f, err := os.Open(s.filePath)
|
|||
|
if err != nil {
|
|||
|
return err
|
|||
|
}
|
|||
|
defer f.Close()
|
|||
|
|
|||
|
scanner := bufio.NewScanner(f)
|
|||
|
for scanner.Scan() {
|
|||
|
url := scanner.Text()
|
|||
|
s.visited[url] = true
|
|||
|
}
|
|||
|
return scanner.Err()
|
|||
|
}
|
|||
|
|
|||
|
// AlreadyVisited returns true if the URL is in the store.
|
|||
|
func (s *VisitedStore) AlreadyVisited(url string) bool {
|
|||
|
s.mu.Lock()
|
|||
|
defer s.mu.Unlock()
|
|||
|
return s.visited[url]
|
|||
|
}
|
|||
|
|
|||
|
// MarkVisited adds the URL to the store if not already present, and triggers a flush if batchSize is reached.
|
|||
|
func (s *VisitedStore) MarkVisited(url string) (added bool, err error) {
|
|||
|
s.mu.Lock()
|
|||
|
defer s.mu.Unlock()
|
|||
|
|
|||
|
if s.visited[url] {
|
|||
|
return false, nil
|
|||
|
}
|
|||
|
// Mark in memory
|
|||
|
s.visited[url] = true
|
|||
|
s.toFlush = append(s.toFlush, url)
|
|||
|
|
|||
|
// Flush if we have enough new URLs
|
|||
|
if len(s.toFlush) >= s.batchSize {
|
|||
|
if err := s.flushToFileUnlocked(); err != nil {
|
|||
|
return false, err
|
|||
|
}
|
|||
|
}
|
|||
|
return true, nil
|
|||
|
}
|
|||
|
|
|||
|
// Flush everything in s.toFlush to file, then clear the buffer.
|
|||
|
func (s *VisitedStore) Flush() error {
|
|||
|
s.mu.Lock()
|
|||
|
defer s.mu.Unlock()
|
|||
|
return s.flushToFileUnlocked()
|
|||
|
}
|
|||
|
|
|||
|
// flushToFileUnlocked writes s.toFlush lines to the store file, then clears s.toFlush.
|
|||
|
func (s *VisitedStore) flushToFileUnlocked() error {
|
|||
|
if len(s.toFlush) == 0 {
|
|||
|
return nil
|
|||
|
}
|
|||
|
f, err := os.OpenFile(s.filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
|
|||
|
if err != nil {
|
|||
|
return err
|
|||
|
}
|
|||
|
defer f.Close()
|
|||
|
|
|||
|
for _, url := range s.toFlush {
|
|||
|
if _, err := fmt.Fprintln(f, url); err != nil {
|
|||
|
return err
|
|||
|
}
|
|||
|
}
|
|||
|
s.toFlush = nil
|
|||
|
return nil
|
|||
|
}
|