updated indexing & user agent generator

parent 13e1d6119b
commit a9a6948a44

3 changed files with 73 additions and 35 deletions
agent.go (43 changes)

@@ -3,7 +3,7 @@ package main
 import (
     "encoding/json"
     "fmt"
-    "io/ioutil"
+    "io"
     "math/rand"
     "net/http"
     "sort"
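Reviewer note: io/ioutil has been deprecated since Go 1.16; io.ReadAll is the drop-in replacement for ioutil.ReadAll, so this import swap (paired with the call-site change below) carries no behavior change.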
@@ -40,13 +40,33 @@ var (
 func fetchLatestBrowserVersions() (BrowserData, error) {
     url := "https://raw.githubusercontent.com/Fyrd/caniuse/master/fulldata-json/data-2.0.json"

-    resp, err := http.Get(url)
+    // Optional: skip TLS verification to avoid certificate errors
+    // transport := &http.Transport{
+    //     TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+    // }
+
+    // Increase the HTTP client timeout
+    client := &http.Client{
+        Timeout: 30 * time.Second,
+        // Transport: transport,
+    }
+
+    // Build the request manually to set headers
+    req, err := http.NewRequest("GET", url, nil)
+    if err != nil {
+        return BrowserData{}, err
+    }
+    // Custom user agent and English language preference
+    req.Header.Set("User-Agent", "MyCustomAgent/1.0 (compatible; +https://example.com)")
+    req.Header.Set("Accept-Language", "en-US,en;q=0.9")
+
+    resp, err := client.Do(req)
     if err != nil {
         return BrowserData{}, err
     }
     defer resp.Body.Close()

-    body, err := ioutil.ReadAll(resp.Body)
+    body, err := io.ReadAll(resp.Body)
     if err != nil {
         return BrowserData{}, err
     }
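Reviewer note: replacing the bare http.Get with an explicit client makes the timeout and the headers visible at the call site. A minimal standalone sketch of the same pattern, with the URL and header values taken from the diff and error handling trimmed to essentials (fetchJSON is a hypothetical helper, not a function in this repo):

    package main

    import (
        "fmt"
        "io"
        "net/http"
        "time"
    )

    // fetchJSON mirrors the request pattern adopted in the hunk above.
    func fetchJSON(url string) ([]byte, error) {
        client := &http.Client{Timeout: 30 * time.Second}
        req, err := http.NewRequest("GET", url, nil)
        if err != nil {
            return nil, err
        }
        req.Header.Set("User-Agent", "MyCustomAgent/1.0 (compatible; +https://example.com)")
        req.Header.Set("Accept-Language", "en-US,en;q=0.9")
        resp, err := client.Do(req)
        if err != nil {
            return nil, err
        }
        defer resp.Body.Close()
        return io.ReadAll(resp.Body)
    }

    func main() {
        body, err := fetchJSON("https://raw.githubusercontent.com/Fyrd/caniuse/master/fulldata-json/data-2.0.json")
        if err != nil {
            fmt.Println("fetch failed:", err)
            return
        }
        fmt.Printf("fetched %d bytes\n", len(body))
    }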
@@ -109,7 +129,7 @@ func randomUserAgent() (string, error) {
         return "", err
     }

-    rand.Seed(time.Now().UnixNano())
+    rand := rand.New(rand.NewSource(time.Now().UnixNano()))

     // Simulated browser usage statistics (in percentages)
     usageStats := map[string]float64{
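Reviewer note: rand.Seed is deprecated as of Go 1.20, and a local generator avoids mutating the package-global source. One subtlety: the new line shadows the rand package name inside this function, so later rand.Intn calls in the same scope hit the local generator, while other functions (the rand.Intn in periodicAgentUpdate, for instance) still use the global source. A minimal sketch of the non-shadowing variant:

    package main

    import (
        "fmt"
        "math/rand"
        "time"
    )

    func main() {
        // Local generator: no global state touched, no deprecation
        // warning on Go 1.20+. Naming it rng avoids shadowing the package.
        rng := rand.New(rand.NewSource(time.Now().UnixNano()))
        fmt.Println(rng.Intn(100))
    }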
@@ -161,6 +181,7 @@ func randomUserAgent() (string, error) {
         }
     }

+    // Fallback to the last version if none matched
     if version == "" {
         version = versions[len(versions)-1].Version
     }
@@ -240,11 +261,11 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string {
         browserType = "Firefox"
     }

-    // Get the latest version for the browser type
+    // Get the latest version for that browser
     var latestVersion string
-    if browserType == "Firefox" {
+    if browserType == "Firefox" && len(newVersions.Firefox) > 0 {
         latestVersion = newVersions.Firefox[0].Version
-    } else if browserType == "Chromium" {
+    } else if browserType == "Chromium" && len(newVersions.Chromium) > 0 {
         latestVersion = newVersions.Chromium[0].Version
     }

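Reviewer note: the added len checks prevent an index-out-of-range panic when the fetched data has no entries for a browser. If both slices are empty, latestVersion stays "", so it may be worth confirming that generateUserAgent (not shown in this hunk) tolerates an empty version string.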
@@ -252,7 +273,7 @@ func updateUserAgentVersion(userAgent string, newVersions BrowserData) string {
     return generateUserAgent(browserType, latestVersion)
 }

-func periodicUpdate() {
+func periodicAgentUpdate() {
     for {
         // Sleep for a random interval between 1 and 2 days
         time.Sleep(time.Duration(24+rand.Intn(24)) * time.Hour)
@@ -309,12 +330,8 @@ func GetNewUserAgent(cacheKey string) (string, error) {
     return userAgent, nil
 }

-func init() {
-    go periodicUpdate()
-}
-
 // func main() {
-//     go periodicUpdate() // not needed here
+//     go periodicAgentUpdate() // not needed here

 //     cacheKey := "image-search"
 //     userAgent, err := GetUserAgent(cacheKey)
indexer.go (60 changes)

@@ -3,6 +3,7 @@ package main
 import (
     "bufio"
     "fmt"
+    "net/url"
     "os"
     "path/filepath"
     "strconv"
@@ -10,10 +11,10 @@ import (
     "time"

     "github.com/blevesearch/bleve/v2"
+    "golang.org/x/net/publicsuffix"
 )

 // Document represents a single document to be indexed.
-// You can add more fields if needed.
 type Document struct {
     ID   string `json:"id"`
     Link string `json:"link"`
@@ -48,16 +49,20 @@ func InitIndex() error {
     // Index doesn't exist, create a new one
     mapping := bleve.NewIndexMapping()

-    // Custom mapping for the document
     docMapping := bleve.NewDocumentMapping()

-    // Text fields with custom analyzers for better tokenization
-    textFieldMapping := bleve.NewTextFieldMapping()
-    textFieldMapping.Analyzer = "standard" // Use standard analyzer for partial and fuzzy matches
+    // Text fields
+    titleFieldMapping := bleve.NewTextFieldMapping()
+    titleFieldMapping.Analyzer = "standard"
+    docMapping.AddFieldMappingsAt("title", titleFieldMapping)

-    docMapping.AddFieldMappingsAt("title", textFieldMapping)
-    docMapping.AddFieldMappingsAt("description", textFieldMapping)
-    docMapping.AddFieldMappingsAt("tags", textFieldMapping)
+    descFieldMapping := bleve.NewTextFieldMapping()
+    descFieldMapping.Analyzer = "standard"
+    docMapping.AddFieldMappingsAt("description", descFieldMapping)
+
+    tagFieldMapping := bleve.NewTextFieldMapping()
+    tagFieldMapping.Analyzer = "standard"
+    docMapping.AddFieldMappingsAt("tags", tagFieldMapping)

     // Numeric field for popularity
     popularityMapping := bleve.NewNumericFieldMapping()
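Reviewer note: behavior is unchanged here, since all three fields still use the standard analyzer, but splitting the shared mapping into per-field mappings leaves room for them to diverge later. A hypothetical follow-up, not part of this commit ("en" and "keyword" are stock bleve analyzer names, registered only if their analysis packages are imported):

    import (
        _ "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
        _ "github.com/blevesearch/bleve/v2/analysis/lang/en"
    )

    titleFieldMapping.Analyzer = "en"    // language-aware stemming for titles
    tagFieldMapping.Analyzer = "keyword" // treat each tag as one exact token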
@@ -77,8 +82,19 @@ func InitIndex() error {
     return nil
 }

+func normalizeDomain(rawURL string) string {
+    parsed, err := url.Parse(rawURL)
+    if err != nil {
+        return rawURL
+    }
+    domain, err := publicsuffix.EffectiveTLDPlusOne(parsed.Hostname())
+    if err != nil {
+        return parsed.Hostname() // fallback
+    }
+    return domain
+}
+
 // IndexFile reads a file line-by-line and indexes each line as a document.
-// Each line represents a simple document. Adjust parsing as needed.
 func IndexFile(filePath string) error {
     file, err := os.Open(filePath)
     if err != nil {
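Reviewer note: publicsuffix.EffectiveTLDPlusOne collapses a hostname to its registrable domain, so different subdomains of one site share a dedup key. A quick sketch of the behavior (hostnames are illustrative):

    package main

    import (
        "fmt"

        "golang.org/x/net/publicsuffix"
    )

    func main() {
        for _, host := range []string{"blog.example.com", "www.example.com", "foo.example.co.uk"} {
            d, err := publicsuffix.EffectiveTLDPlusOne(host)
            if err != nil {
                fmt.Println(host, "->", err)
                continue
            }
            fmt.Println(host, "->", d) // example.com, example.com, example.co.uk
        }
    }

One caveat worth checking: if a link in the input file has no scheme ("example.com/page" rather than "https://example.com/page"), url.Parse puts the whole string in Path, parsed.Hostname() is empty, EffectiveTLDPlusOne errors, and the fallback returns "" — so all scheme-less lines would dedupe against each other under the empty key.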
@@ -88,27 +104,29 @@ func IndexFile(filePath string) error {

     scanner := bufio.NewScanner(file)
     batch := bleveIndex.NewBatch()
-    indexedDomains := make(map[string]bool) // Track indexed domains
+    // Map to track normalized domains we’ve already indexed
+    indexedDomains := make(map[string]bool)

     for scanner.Scan() {
         line := scanner.Text()

-        // Split the line into 5 fields: link|title|tags|description|popularity
+        // link|title|tags|description|popularity
         parts := strings.SplitN(line, "|", 5)
         if len(parts) < 5 {
-            continue // Skip malformed lines
+            continue
         }

-        domain := parts[0]
+        // Normalize domain part so duplicates share the same “key”
+        normalized := normalizeDomain(parts[0])
         popularity, _ := strconv.ParseInt(parts[4], 10, 64)

-        // Skip if the domain is already indexed
-        if indexedDomains[domain] {
+        if indexedDomains[normalized] {
             continue
         }

         doc := Document{
-            ID:    domain, // Use the domain as the unique ID
+            ID:    normalized,
             Link:  parts[0],
             Title: parts[1],
             Tags:  parts[2],
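Reviewer note: with the normalized domain used as both the dedup key and the document ID, every subdomain of a site now collapses into one indexed document, and the first line seen wins. If distinct subdomains should remain individually searchable, an alternative is to dedup on the normalized domain but keep parts[0] as the document ID.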
@@ -127,10 +145,9 @@ func IndexFile(filePath string) error {
             return fmt.Errorf("failed to index document: %v", err)
         }

-        indexedDomains[domain] = true // Mark the domain as indexed
+        indexedDomains[normalized] = true
     }

-    // Commit the batch
     if err := bleveIndex.Batch(batch); err != nil {
         return fmt.Errorf("error committing batch: %v", err)
     }
@@ -139,13 +156,12 @@ func IndexFile(filePath string) error {
         return fmt.Errorf("error reading file: %v", err)
     }

-    printDebug("Indexed %d unique domains from %s", len(indexedDomains), filePath)
+    printDebug("Indexed %d unique normalized domains from %s", len(indexedDomains), filePath)
     return nil
 }

 // SearchIndex performs a full-text search on the indexed data.
 func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
-    // Create compound query
     exactMatch := bleve.NewMatchQuery(queryStr) // Exact match
     fuzzyMatch := bleve.NewFuzzyQuery(queryStr) // Fuzzy match
     fuzzyMatch.Fuzziness = 2
@@ -160,8 +176,8 @@ func SearchIndex(queryStr string, page, pageSize int) ([]Document, error) {
     req.Size = pageSize
     req.From = (page - 1) * pageSize

-    // Sort by popularity
-    req.SortBy([]string{"popularity"})
+    // Sort primarily by relevance (score), then by popularity descending
+    req.SortBy([]string{"-_score", "-popularity"})

     res, err := bleveIndex.Search(req)
     if err != nil {
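Reviewer note: in bleve's sort-string syntax a leading "-" means descending and "_score" is the relevance score, so the new order ranks by score first and breaks ties by descending popularity; the old []string{"popularity"} sorted ascending on popularity alone, which would bury popular results. A minimal sketch of the resulting request shape (searchTop is a hypothetical helper; assumes an opened index):

    import "github.com/blevesearch/bleve/v2"

    func searchTop(idx bleve.Index, q string) (*bleve.SearchResult, error) {
        req := bleve.NewSearchRequestOptions(bleve.NewMatchQuery(q), 10, 0, false)
        req.SortBy([]string{"-_score", "-popularity"})
        return idx.Search(req)
    }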
init.go (5 changes)

@@ -61,6 +61,11 @@ func main() {
     }
     config.PeerID = hostID

+    // Initiate Browser Agent updater
+    if config.CrawlerEnabled || config.IndexerEnabled {
+        go periodicAgentUpdate()
+    }
+
     InitializeLanguage("en") // Initialize language before generating OpenSearch
     generateOpenSearchXML(config)
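Reviewer note: this block replaces the init() removed from agent.go, which is an improvement on two counts: the goroutine's start point is now explicit in main rather than hidden in package initialization, and the updater only runs when crawling or indexing is actually enabled.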