package main

import (
	"bufio"
	"errors"
	"fmt"
	"os"
	"sync"
)

// VisitedStore deduplicates visited URLs with an in-memory set and persists
// newly seen URLs to a file in batches, one URL per line.
//
// AlreadyVisited, MarkVisited and Flush all serialize on mu, so they may be
// called from multiple goroutines.
type VisitedStore struct {
	mu        sync.Mutex
	visited   map[string]bool // every URL known so far (loaded from disk or marked)
	toFlush   []string        // URLs marked since the last successful flush
	filePath  string          // backing file; only ever appended to
	batchSize int             // number of pending URLs that triggers a flush
}

// NewVisitedStore returns a store backed by filePath, pre-populated with any
// URLs already recorded in that file. A missing file is not an error: the
// store simply starts empty and the file is created on the first flush.
//
// batchSize is the number of newly marked URLs that are buffered before they
// are appended to the file; a value of 1 or less flushes on every new URL.
func NewVisitedStore(filePath string, batchSize int) (*VisitedStore, error) {
	store := &VisitedStore{
		visited:   make(map[string]bool),
		filePath:  filePath,
		batchSize: batchSize,
	}

	// Open the file directly rather than Stat-then-Open: this avoids a race
	// between the two calls and surfaces failures other than "does not exist"
	// (for example permission errors) instead of silently starting empty.
	if err := store.loadFromFile(); err != nil && !errors.Is(err, os.ErrNotExist) {
		return nil, fmt.Errorf("loadFromFile error: %w", err)
	}
	return store, nil
}

// loadFromFile reads the store's file and marks every line as a visited URL.
// It returns the error from os.Open unchanged, so callers can detect a missing
// file with errors.Is(err, os.ErrNotExist). It does not take s.mu.
func (s *VisitedStore) loadFromFile() error {
	// bufio.Scanner rejects lines longer than 64 KiB by default, which would
	// make a store containing a single very long URL (e.g. a data: URI)
	// impossible to reload. Allow lines of up to 1 MiB instead.
	const maxLineSize = 1 << 20

	f, err := os.Open(s.filePath)
	if err != nil {
		return err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineSize)
	for scanner.Scan() {
		s.visited[scanner.Text()] = true
	}
	return scanner.Err()
}

// AlreadyVisited reports whether url has been loaded from disk or marked.
func (s *VisitedStore) AlreadyVisited(url string) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.visited[url]
}

// MarkVisited records url as visited if it is not already known and flushes
// the pending batch to disk once batchSize new URLs have accumulated.
//
// added is true when url was newly recorded and false when it was already
// known. If the triggered flush fails, MarkVisited returns (false, err); the
// URL nevertheless stays marked in memory and queued for the next flush.
func (s *VisitedStore) MarkVisited(url string) (added bool, err error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if s.visited[url] {
		return false, nil
	}

	s.visited[url] = true
	s.toFlush = append(s.toFlush, url)

	if len(s.toFlush) >= s.batchSize {
		if err := s.flushToFileUnlocked(); err != nil {
			return false, err
		}
	}
	return true, nil
}

// Flush appends every pending URL to the store file and clears the pending
// buffer. It is a no-op when nothing is pending.
func (s *VisitedStore) Flush() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.flushToFileUnlocked()
}

// flushToFileUnlocked appends the URLs in s.toFlush to the store file, one per
// line, and clears s.toFlush on success. The caller must hold s.mu.
//
// On failure the pending URLs are kept so a later flush can retry them. Part
// of the batch may already have reached the file, so a retry can leave
// duplicate lines behind; those collapse again when the file is loaded.
func (s *VisitedStore) flushToFileUnlocked() error {
	if len(s.toFlush) == 0 {
		return nil
	}

	f, err := os.OpenFile(s.filePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}

	// Buffer the writes so a batch costs a handful of syscalls instead of one
	// per URL. bufio.Writer remembers the first write error and returns it
	// from Flush, so the individual write results need no separate check.
	w := bufio.NewWriter(f)
	for _, url := range s.toFlush {
		w.WriteString(url)
		w.WriteByte('\n')
	}
	if err := w.Flush(); err != nil {
		_ = f.Close() // the write error is the one worth reporting
		return err
	}

	// Close can report a deferred write failure, so check it before treating
	// the batch as persisted.
	if err := f.Close(); err != nil {
		return err
	}

	s.toFlush = s.toFlush[:0]
	return nil
}