added website crawling and indexing crawled results
This commit is contained in:
parent
5b90a372a1
commit
047cccd19f
10 changed files with 819 additions and 97 deletions
118
get-domains-csv.go
Normal file
118
get-domains-csv.go
Normal file
|
@ -0,0 +1,118 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"archive/zip"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
func downloadAndSetupDomainsCSV() error {
|
||||
targetFilePath := filepath.Join(config.DriveCache.Path, "domains.csv")
|
||||
|
||||
// Check if domains.csv already exists
|
||||
if _, err := os.Stat(targetFilePath); err == nil {
|
||||
printDebug("domains.csv already exists at %s", targetFilePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
downloadURL := "https://www.domcop.com/files/top/top10milliondomains.csv.zip"
|
||||
zipFilePath := filepath.Join(config.DriveCache.Path, "top10milliondomains.csv.zip")
|
||||
|
||||
// Download the file
|
||||
printDebug("Downloading file from %s", downloadURL)
|
||||
resp, err := http.Get(downloadURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to download file: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("failed to download file: received status code %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Create the zip file locally
|
||||
zipFile, err := os.Create(zipFilePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create local zip file: %v", err)
|
||||
}
|
||||
defer zipFile.Close()
|
||||
|
||||
_, err = io.Copy(zipFile, resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to write downloaded zip file: %v", err)
|
||||
}
|
||||
|
||||
// Unzip the file
|
||||
printDebug("Unzipping file %s", zipFilePath)
|
||||
if err := unzipFile(zipFilePath, config.DriveCache.Path); err != nil {
|
||||
return fmt.Errorf("failed to unzip file: %v", err)
|
||||
}
|
||||
|
||||
// Find the .csv file and rename/move it to domains.csv
|
||||
csvFound := false
|
||||
dirEntries, err := os.ReadDir(config.DriveCache.Path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read directory: %v", err)
|
||||
}
|
||||
|
||||
for _, entry := range dirEntries {
|
||||
if !entry.IsDir() && filepath.Ext(entry.Name()) == ".csv" {
|
||||
csvPath := filepath.Join(config.DriveCache.Path, entry.Name())
|
||||
if err := os.Rename(csvPath, targetFilePath); err != nil {
|
||||
return fmt.Errorf("failed to move %s to %s: %v", csvPath, targetFilePath, err)
|
||||
}
|
||||
csvFound = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !csvFound {
|
||||
return fmt.Errorf("no .csv file found in the downloaded archive")
|
||||
}
|
||||
|
||||
// Clean up zip file
|
||||
if err := os.Remove(zipFilePath); err != nil {
|
||||
printWarn("failed to remove zip file %s: %v", zipFilePath, err)
|
||||
}
|
||||
|
||||
printDebug("domains.csv successfully downloaded and placed at %s", targetFilePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// unzipFile extracts every entry of the zip archive at zipFile into destDir.
//
// Directory entries are created with os.MkdirAll; for file entries the parent
// directory is created first, so archives without explicit directory entries
// extract correctly. Entries whose name would resolve to a location outside
// destDir (for example names containing "../") are rejected with an error
// instead of being written. Extraction stops at the first error, which is
// returned to the caller.
func unzipFile(zipFile, destDir string) error {
	reader, err := zip.OpenReader(zipFile)
	if err != nil {
		return err
	}
	defer reader.Close()

	for _, file := range reader.File {
		// Each entry is handled in its own call so its file handles are
		// released as soon as the entry is done, not at the end of the loop.
		if err := extractZipEntry(file, destDir); err != nil {
			return err
		}
	}

	return nil
}

// extractZipEntry writes a single archive entry below destDir and returns any
// error from path validation, directory creation, copying, or closing the
// written file.
func extractZipEntry(file *zip.File, destDir string) error {
	filePath := filepath.Join(destDir, file.Name)

	// Guard against "zip slip": after joining and cleaning, the path relative
	// to destDir must not climb out of destDir.
	rel, err := filepath.Rel(destDir, filePath)
	parentPrefix := ".." + string(filepath.Separator)
	if err != nil || rel == ".." || (len(rel) >= len(parentPrefix) && rel[:len(parentPrefix)] == parentPrefix) {
		return fmt.Errorf("illegal file path in archive: %s", file.Name)
	}

	if file.FileInfo().IsDir() {
		return os.MkdirAll(filePath, os.ModePerm)
	}

	if err := os.MkdirAll(filepath.Dir(filePath), os.ModePerm); err != nil {
		return err
	}

	srcFile, err := file.Open()
	if err != nil {
		return err
	}
	defer srcFile.Close()

	destFile, err := os.Create(filePath)
	if err != nil {
		return err
	}

	if _, err := io.Copy(destFile, srcFile); err != nil {
		// The copy error is the one to report; the close result adds nothing.
		_ = destFile.Close()
		return err
	}
	// A failing Close can mean the data never reached disk, so surface it.
	return destFile.Close()
}
|
Loading…
Add table
Add a link
Reference in a new issue