mirror of
https://codeberg.org/pluja/kycnot.me
synced 2025-03-12 12:36:28 -04:00
121 lines
3.3 KiB
Go
121 lines
3.3 KiB
Go
package tos_scraper
|
|
|
|
import (
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
|
|
"pluja.dev/kycnot.me/config"
|
|
"pluja.dev/kycnot.me/database"
|
|
"pluja.dev/kycnot.me/utils/ai"
|
|
)
|
|
|
|
func InitTosScraperDaemon() {
|
|
if !config.Conf.Scraper {
|
|
log.Warn().Msg("Scraper is disabled")
|
|
return
|
|
}
|
|
go func() {
|
|
// Calculate the duration until the first day of the next month
|
|
nextMonth := time.Now().AddDate(0, 1, 0)
|
|
firstDayNextMonth := time.Date(nextMonth.Year(), nextMonth.Month(), 1, 0, 0, 0, 0, nextMonth.Location())
|
|
duration := time.Until(firstDayNextMonth)
|
|
if config.Conf.Dev {
|
|
log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Running scraper.")
|
|
trigerScraping()
|
|
}
|
|
for {
|
|
|
|
if config.Conf.Dev {
|
|
//duration = 1 * time.Hour
|
|
log.Debug().Bool("DevMode", config.Conf.Dev).Msgf("Next scraping in %v", duration)
|
|
}
|
|
|
|
// Set the ticker for that duration
|
|
ticker := time.NewTicker(duration)
|
|
|
|
// Wait for the ticker to tick
|
|
<-ticker.C
|
|
|
|
// Stop the ticker before resetting it
|
|
ticker.Stop()
|
|
|
|
// Add your code here to run on the first day of every month
|
|
if config.Conf.Dev {
|
|
log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Will not run scraper again.")
|
|
} else {
|
|
trigerScraping()
|
|
}
|
|
|
|
// Reset ticker for next month
|
|
firstDayNextMonth = firstDayNextMonth.AddDate(0, 1, 0)
|
|
duration = time.Until(firstDayNextMonth)
|
|
log.Info().Msgf("Next scraping in %v, at %v", duration, firstDayNextMonth)
|
|
ticker.Reset(duration)
|
|
}
|
|
}()
|
|
}
|
|
|
|
func trigerScraping() {
|
|
log.Debug().Msg("Starting scraper...")
|
|
// Get all the Services from the DB
|
|
services, err := database.Pb.GetServices("pending=false && listed=true", "")
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Could not get services from DB")
|
|
return
|
|
}
|
|
|
|
log.Debug().Msgf("Found %v services", len(services))
|
|
|
|
// For each service, run the scraper
|
|
for _, service := range services {
|
|
log.Debug().Str("Name", service.Name).Msgf("Scraping ToS")
|
|
if len(service.TosUrls) == 0 {
|
|
log.Error().Msgf("Service %v has no ToS URL", service.Name)
|
|
continue
|
|
}
|
|
|
|
html, err := GetBody(service.TosUrls[0])
|
|
if err != nil {
|
|
log.Error().Err(err).Msgf("Could not get HTML from %v", service.TosUrls[0])
|
|
continue
|
|
}
|
|
|
|
// Check if service.LastTosReview is not older than one month, skip
|
|
if service.LastTosReview != "" {
|
|
log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
|
|
t, _ := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
|
|
if time.Since(t) < 30*24*time.Hour {
|
|
log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
|
|
continue
|
|
}
|
|
}
|
|
|
|
html = strings.ReplaceAll(html, "\n", " ")
|
|
html = strings.ReplaceAll(html, "\t", " ")
|
|
html = strings.ReplaceAll(html, "\r", " ")
|
|
html = strings.TrimSpace(html)
|
|
|
|
highlights, err := ai.GetTosReviews(html)
|
|
if err != nil {
|
|
log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
|
|
continue
|
|
}
|
|
|
|
log.Debug().Msgf("Found %v highlights", len(highlights))
|
|
service.TosReviews = highlights
|
|
service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
|
|
err = database.Pb.UpdateService(service.ID, service)
|
|
if err != nil {
|
|
log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
|
|
continue
|
|
}
|
|
|
|
// Sleep 1 second
|
|
time.Sleep(1 * time.Second)
|
|
}
|
|
|
|
log.Info().Msg("Finished scraping ToS")
|
|
}
|