2024-02-20 15:03:56 +01:00

121 lines
3.3 KiB
Go

package tos_scraper
import (
"strings"
"time"
"github.com/rs/zerolog/log"
"pluja.dev/kycnot.me/config"
"pluja.dev/kycnot.me/database"
"pluja.dev/kycnot.me/utils/ai"
)
// InitTosScraperDaemon starts a background goroutine that triggers the
// ToS scraper on the first day of every month. It returns immediately
// and is a no-op when scraping is disabled in the configuration. In dev
// mode it runs one scrape at startup and then never again.
func InitTosScraperDaemon() {
	if !config.Conf.Scraper {
		log.Warn().Msg("Scraper is disabled")
		return
	}
	go func() {
		// Compute the instant of the first day of the next month; the
		// daemon fires once at each such month boundary.
		nextMonth := time.Now().AddDate(0, 1, 0)
		firstDayNextMonth := time.Date(nextMonth.Year(), nextMonth.Month(), 1, 0, 0, 0, 0, nextMonth.Location())
		duration := time.Until(firstDayNextMonth)

		if config.Conf.Dev {
			// In dev mode run one scrape immediately so changes can be tested.
			log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Running scraper.")
			trigerScraping()
		}

		// Use a single reusable timer instead of allocating a new ticker on
		// every iteration (the original created a fresh ticker per loop and
		// then Reset a ticker that was about to be discarded).
		timer := time.NewTimer(duration)
		defer timer.Stop()
		for {
			if config.Conf.Dev {
				log.Debug().Bool("DevMode", config.Conf.Dev).Msgf("Next scraping in %v", duration)
			}
			// Block until the start of the next month.
			<-timer.C
			if config.Conf.Dev {
				// Dev mode only scrapes once, at startup.
				log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Will not run scraper again.")
			} else {
				trigerScraping()
			}
			// Schedule the next run for the following month boundary.
			firstDayNextMonth = firstDayNextMonth.AddDate(0, 1, 0)
			duration = time.Until(firstDayNextMonth)
			log.Info().Msgf("Next scraping in %v, at %v", duration, firstDayNextMonth)
			timer.Reset(duration)
		}
	}()
}
// trigerScraping fetches the Terms-of-Service page of every listed,
// non-pending service, asks the AI reviewer for highlights, and stores
// them back to the database. Services reviewed within the last 30 days
// and services without a ToS URL are skipped. Errors are logged per
// service and never abort the whole run.
func trigerScraping() {
	log.Debug().Msg("Starting scraper...")

	// Get all the Services from the DB.
	services, err := database.Pb.GetServices("pending=false && listed=true", "")
	if err != nil {
		log.Error().Err(err).Msg("Could not get services from DB")
		return
	}
	log.Debug().Msgf("Found %v services", len(services))

	// Collapse all whitespace variants in one pass before handing the
	// document to the AI reviewer.
	whitespace := strings.NewReplacer("\n", " ", "\t", " ", "\r", " ")

	for _, service := range services {
		log.Debug().Str("Name", service.Name).Msgf("Scraping ToS")
		if len(service.TosUrls) == 0 {
			log.Error().Msgf("Service %v has no ToS URL", service.Name)
			continue
		}

		// Skip recently-reviewed services BEFORE fetching the page, so a
		// skipped service costs no network request (the original fetched
		// first and checked afterwards).
		if service.LastTosReview != "" {
			log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
			t, err := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
			if err != nil {
				// A malformed timestamp is treated as "never reviewed";
				// the original silently discarded this error.
				log.Warn().Err(err).Msgf("Could not parse LastTosReview for %v", service.Name)
			} else if time.Since(t) < 30*24*time.Hour {
				log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
				continue
			}
		}

		html, err := GetBody(service.TosUrls[0])
		if err != nil {
			log.Error().Err(err).Msgf("Could not get HTML from %v", service.TosUrls[0])
			continue
		}
		html = strings.TrimSpace(whitespace.Replace(html))

		highlights, err := ai.GetTosReviews(html)
		if err != nil {
			log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
			continue
		}
		log.Debug().Msgf("Found %v highlights", len(highlights))

		service.TosReviews = highlights
		service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
		if err := database.Pb.UpdateService(service.ID, service); err != nil {
			log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
			continue
		}
		// Brief pause between services to avoid hammering target sites
		// and the AI endpoint.
		time.Sleep(1 * time.Second)
	}
	log.Info().Msg("Finished scraping ToS")
}