Added website crawling and indexing of crawled results

This commit is contained in:
partisan 2024-12-29 22:54:55 +01:00
parent 5b90a372a1
commit 047cccd19f
10 changed files with 819 additions and 97 deletions

View file

@ -24,15 +24,15 @@ import (
)
var (
cachingImages = make(map[string]*sync.Mutex)
cachingImagesMu sync.Mutex
// cachingSemaphore = make(chan struct{}, 100) // Limit to concurrent downloads
cachingImages = make(map[string]*sync.Mutex)
cachingImagesMu sync.Mutex
cachingSemaphore = make(chan struct{}, 100)
invalidImageIDs = make(map[string]struct{})
invalidImageIDsMu sync.Mutex
imageURLMap = make(map[string]string) // mapping from imageID_type to imageURL
imageURLMapMu sync.RWMutex // mutex for thread-safe access
imageURLMap = make(map[string]string)
imageURLMapMu sync.RWMutex
)
func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error) {
@ -49,7 +49,13 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error
filename = fmt.Sprintf("%s_full.webp", imageID)
}
cachedImagePath := filepath.Join(config.DriveCache.Path, filename)
// Make sure we store inside: config.DriveCache.Path / images
imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
if err := os.MkdirAll(imageCacheDir, 0755); err != nil {
return "", false, fmt.Errorf("couldn't create images folder: %v", err)
}
cachedImagePath := filepath.Join(imageCacheDir, filename)
tempImagePath := cachedImagePath + ".tmp"
// Check if the image is already cached
@ -73,9 +79,8 @@ func cacheImage(imageURL, imageID string, isThumbnail bool) (string, bool, error
return cachedImagePath, true, nil
}
// // Limit max concurrent downloads
// cachingSemaphore <- struct{}{} // Acquire a token
// defer func() { <-cachingSemaphore }() // Release the token
cachingSemaphore <- struct{}{}
defer func() { <-cachingSemaphore }()
// Create a custom http.Client that skips SSL certificate verification
client := &http.Client{
@ -217,7 +222,8 @@ func handleImageServe(w http.ResponseWriter, r *http.Request) {
imageType = parts[1]
filename := fmt.Sprintf("%s_%s.webp", imageID, imageType)
cachedImagePath := filepath.Join(config.DriveCache.Path, filename)
// Adjust to read from config.DriveCache.Path / images
cachedImagePath := filepath.Join(config.DriveCache.Path, "images", filename)
if hasExtension && imageType == "thumb" {
// Requesting cached image (thumbnail or full)
@ -329,7 +335,7 @@ func handleImageStatus(w http.ResponseWriter, r *http.Request) {
// Check thumbnail first
for _, ext := range extensions {
thumbFilename := fmt.Sprintf("%s_thumb.%s", id, ext)
thumbPath := filepath.Join(config.DriveCache.Path, thumbFilename)
thumbPath := filepath.Join(config.DriveCache.Path, "images", thumbFilename)
if _, err := os.Stat(thumbPath); err == nil {
statusMap[id] = fmt.Sprintf("/image/%s_thumb.%s", id, ext)
@ -342,7 +348,7 @@ func handleImageStatus(w http.ResponseWriter, r *http.Request) {
if !imageReady {
for _, ext := range extensions {
fullFilename := fmt.Sprintf("%s_full.%s", id, ext)
fullPath := filepath.Join(config.DriveCache.Path, fullFilename)
fullPath := filepath.Join(config.DriveCache.Path, "images", fullFilename)
if _, err := os.Stat(fullPath); err == nil {
statusMap[id] = fmt.Sprintf("/image/%s_full.%s", id, ext)
@ -447,7 +453,9 @@ func cleanExpiredCachedImages() {
}
func cleanupCache() {
files, err := os.ReadDir(config.DriveCache.Path)
// Read from: config.DriveCache.Path / images
imageCacheDir := filepath.Join(config.DriveCache.Path, "images")
files, err := os.ReadDir(imageCacheDir)
if err != nil {
printErr("Failed to read DriveCache directory: %v", err)
return
@ -462,19 +470,17 @@ func cleanupCache() {
continue
}
filePath := filepath.Join(config.DriveCache.Path, file.Name())
filePath := filepath.Join(imageCacheDir, file.Name())
// Check for expired files based on modification time
if config.DriveCache.Duration > 0 && time.Since(info.ModTime()) > config.DriveCache.Duration {
if err := os.Remove(filePath); err == nil {
printDebug("Removed expired cache file: %s", filePath)
} else {
printErr("Failed to remove expired cache file: %s", filePath)
}
continue // Skip adding this file to the list
continue
}
// Accumulate total size and store file info for potential deletion
totalSize += uint64(info.Size())
fileInfos = append(fileInfos, info)
}
@ -491,7 +497,7 @@ func cleanupCache() {
break
}
filePath := filepath.Join(config.DriveCache.Path, info.Name())
filePath := filepath.Join(imageCacheDir, info.Name())
fileSize := uint64(info.Size())
if err := os.Remove(filePath); err == nil {