update scraper to run with jina and claude

This commit is contained in:
pluja 2024-04-22 12:10:09 +02:00
parent 69b6e8bf3b
commit 717d56d747
7 changed files with 277 additions and 377 deletions

View File

@ -29,7 +29,6 @@ func init() {
config.Conf = config.Config{
Dev: *dev,
Cache: *nocache,
Scraper: *scraper,
ListenAddr: *listen,
}
@ -68,10 +67,9 @@ func init() {
log.Info().Msg("Initializing database.")
database.InitPocketbase()
// AI ToS Scraper init
if config.Conf.Scraper {
log.Info().Msg("Initializing AI ToS scraper.")
tos_scraper.InitTosScraperDaemon()
if *scraper {
tos_scraper.TrigerScraping()
os.Exit(0)
}
// Maintenance init

126
src/utils/ai/claude.go Normal file
View File

@ -0,0 +1,126 @@
package ai
import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"time"

	"github.com/rs/zerolog/log"

	"pluja.dev/kycnot.me/database"
	"pluja.dev/kycnot.me/utils"
)
// AnthropicMessage is a single chat message in an Anthropic Messages API
// request: a role ("user"/"assistant") plus its text content.
type AnthropicMessage struct {
	Role    string `json:"role"`
	Content string `json:"content"`
}
// AnthropicMessagePartResponse is one element of the "content" array in an
// Anthropic Messages API response (typically a "text" part).
type AnthropicMessagePartResponse struct {
	Type string `json:"type"`
	Text string `json:"text"`
	// Optional fields, only present for tools responses
	ID    string                 `json:"id,omitempty"`
	Name  string                 `json:"name,omitempty"`
	Input map[string]interface{} `json:"input,omitempty"`
}
// AnthropicMessageResponse mirrors the top-level JSON object returned by a
// non-streaming call to the Anthropic Messages API.
type AnthropicMessageResponse struct {
	ID      string                         `json:"id"`
	Type    string                         `json:"type"`
	Model   string                         `json:"model"`
	Role    string                         `json:"role"`
	Content []AnthropicMessagePartResponse `json:"content"`
	// StopReason reports why generation ended (e.g. "end_turn").
	StopReason string `json:"stop_reason"`
	// NOTE(review): the Messages API documents "stop_sequence" but no plain
	// "stop" field — confirm these two tags against the response schema.
	Stop         string `json:"stop"`
	StopSequence string `json:"stop_sequence"`
}
// GetTosReviewsClaude sends the given ToS/privacy-policy text to the
// Anthropic Claude API and parses the JSON analysis it returns into a
// slice of TosReview highlights.
//
// It returns an error when text is too short to analyze, when the API call
// fails, or when the response cannot be parsed as the expected JSON.
func GetTosReviewsClaude(text string) ([]database.TosReview, error) {
	if len(text) < 10 {
		return nil, errors.New("empty text")
	}

	var highlights []database.TosReview
	messages := []AnthropicMessage{
		{
			Role:    "user",
			Content: text,
		},
	}

	resp, err := getResponse(messages)
	if err != nil {
		// Fix: this path talks to Anthropic, not OpenAI — the old log
		// message was copied from the OpenAI client.
		log.Debug().Err(err).Msg("Could not get response from Anthropic")
		return highlights, err
	}

	// An empty content list means the model produced no analysis; treat it
	// as "no highlights" rather than an error.
	if len(resp.Content) == 0 {
		return highlights, nil
	}

	log.Debug().Msgf("TEXT RESP: \n %s", resp.Content[0].Text)
	highlights, err = jsonToHighlights(resp.Content[0].Text)
	if err != nil {
		log.Debug().Err(err).Msg("Json error")
		return highlights, err
	}
	return highlights, nil
}
// getResponse performs a single non-streaming call to the Anthropic
// Messages API (POST https://api.anthropic.com/v1/messages), using
// sysPrompt as the system prompt and the given conversation messages.
//
// The API key is read from ANTHROPIC_API_KEY; the model from
// ANTHROPIC_MODEL (default "claude-3-sonnet-20240229").
func getResponse(messages []AnthropicMessage) (AnthropicMessageResponse, error) {
	apiKey := os.Getenv("ANTHROPIC_API_KEY")
	// Fail fast with a clear message instead of a cryptic 401 from the API.
	if apiKey == "" {
		return AnthropicMessageResponse{}, errors.New("ANTHROPIC_API_KEY is not set")
	}
	model := utils.Getenv("ANTHROPIC_MODEL", "claude-3-sonnet-20240229")
	url := "https://api.anthropic.com/v1/messages"

	data := map[string]interface{}{
		"model":      model,
		"system":     sysPrompt,
		"max_tokens": 4096,
		"messages":   messages,
	}
	jsonData, err := json.Marshal(data)
	if err != nil {
		log.Debug().Err(err).Msg("Could not marshal data into json")
		return AnthropicMessageResponse{}, err
	}

	req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		log.Debug().Err(err).Msg("Request error")
		return AnthropicMessageResponse{}, err
	}
	req.Header.Set("x-api-key", apiKey)
	req.Header.Set("anthropic-version", "2023-06-01")
	req.Header.Set("content-type", "application/json")

	// Fix: bound the request so a hung connection cannot stall the scraper
	// forever. Long ToS documents can legitimately take a while, hence the
	// generous limit.
	client := &http.Client{Timeout: 5 * time.Minute}
	resp, err := client.Do(req)
	if err != nil {
		log.Debug().Err(err).Msg("Request error")
		return AnthropicMessageResponse{}, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Debug().Err(err).Msg("Body read error")
		return AnthropicMessageResponse{}, err
	}

	// Fix: non-2xx responses carry an error payload with no "content";
	// previously they were silently decoded into an empty struct, making
	// API failures indistinguishable from "no highlights".
	if resp.StatusCode != http.StatusOK {
		return AnthropicMessageResponse{}, fmt.Errorf("anthropic API status %d: %s", resp.StatusCode, string(body))
	}

	var response AnthropicMessageResponse
	if err = json.Unmarshal(body, &response); err != nil {
		log.Debug().Err(err).Msg("Could not unmarshal data into struct")
		return AnthropicMessageResponse{}, err
	}
	return response, nil
}

View File

@ -1,146 +1,49 @@
package ai
import (
"context"
"encoding/json"
"errors"
"os"
"strings"
// func GetTosReviews(text string) ([]database.TosReview, error) {
// if len(text) < 10 {
// return nil, errors.New("empty text")
// }
"github.com/rs/zerolog/log"
openai "github.com/sashabaranov/go-openai"
// promptPrice := 0.01
// completionPrice := 0.03
// model := openai.GPT4Turbo1106
"pluja.dev/kycnot.me/database"
)
// // Call the OpenAI API
// var highlights []database.TosReview
// client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
// resp, err := client.CreateChatCompletion(
// context.Background(),
// openai.ChatCompletionRequest{
// Model: model,
// ResponseFormat: &openai.ChatCompletionResponseFormat{Type: openai.ChatCompletionResponseFormatTypeJSONObject},
// Messages: []openai.ChatCompletionMessage{
// {
// Role: openai.ChatMessageRoleSystem,
// Content: sysPrompt,
// },
// {
// Role: openai.ChatMessageRoleUser,
// Content: text,
// },
// },
// },
// )
// if err != nil {
// log.Debug().Err(err).Msg("Could not get response from OpenAI")
// return highlights, err
// }
// sysPrompt is the system prompt sent to the model. It instructs the
// analyzer to review ToS/privacy-policy documents and reply with a fixed
// JSON object (an "analysis" array) that jsonToHighlights can decode.
// Fix: removed diff-residue lines that had been spliced into the raw
// string, a stray "ç" character, and the "criptocurrency" typo.
var sysPrompt = `As a specialized analyzer of Terms and Conditions and Privacy Policies of cryptocurrency exchanges and services, your task is to review the provided legal documents in HTML format. Adhere to the following guidelines:

1. Condense the information, avoid redundancies, and preserve meaning.
2. Translate complex legal jargon into simpler terms, ensuring both accuracy and brevity.
3. You are a privacy advocate, and you think privacy is a human right.
4. Never make your own assumptions.
5. If the document does not explicitly mention something, the 'warning' field MUST be 'false'.
6. Take a deep breath and work on this problem step-by-step.

Use this JSON structure for your analysis and do not add any additional items:
{
	"analysis": [
		{
			"title": "Transaction Monitoring",
			"task": "Determine if the service monitors user transactions or activities, in regards to cryptocurrency. If not, warning is false.",
			"warning": boolean,
			"details": string, // A description in regard to the title and task. Provide citations when possible. Never make assumptions.
			"section": string // The section(s) where you got this from.
		},
		{
			"title": "User Identification",
			"task": "Determine if users are required to verify their identity. If not, warning is false.",
			// ...
		},
		{
			"title": "3rd Party Data Sharing",
			"task": "Determine if the service shares user data with third parties. If not, warning is false."
			// ...
		},
		{
			"title": "Data sharing with authorities",
			"task": "Determine if user data is shared with authorities, law enforcement, or government agencies. If not, warning is false."
			// ...
		},
		{
			"title": "Logging",
			"task": "Determine if the service logs user data, including IP addresses and/or transactions. If not, warning is false."
			// ...
		},
		{
			"title": "Blocking of funds",
			"task": "Determine if the service can block or freeze the user funds, in relation to money or cryptocurrency. If not, warning is false."
			// ...
		},
		{
			"title": "Account termination/blocking",
			"task": "Determine if user accounts could be terminated or blocked, in cases such as suspicious source of funds or transactions. If not, warning is false."
			// ...
		},
		{
			"title": "Transaction flagging",
			"task": "Determine if the service has a system for flagging suspicious money/cryptocurrency transactions. If not, warning is false."
			// ...
		}
	]
}
`
// GetTosReviews sends the given ToS text to the OpenAI chat-completions
// API (GPT-4 Turbo, JSON response mode) and parses the returned analysis
// into a slice of TosReview highlights. Returns an error for too-short
// input, a failed API call, or an unparsable response.
func GetTosReviews(text string) ([]database.TosReview, error) {
	if len(text) < 10 {
		return nil, errors.New("empty text")
	}

	// Per-1K-token prices, used only for the cost log line below.
	promptPrice := 0.01
	completionPrice := 0.03
	model := openai.GPT4Turbo1106

	// Call the OpenAI API
	var highlights []database.TosReview
	client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
	resp, err := client.CreateChatCompletion(
		context.Background(),
		openai.ChatCompletionRequest{
			Model:          model,
			ResponseFormat: &openai.ChatCompletionResponseFormat{Type: openai.ChatCompletionResponseFormatTypeJSONObject},
			Messages: []openai.ChatCompletionMessage{
				{
					Role:    openai.ChatMessageRoleSystem,
					Content: sysPrompt,
				},
				{
					Role:    openai.ChatMessageRoleUser,
					Content: text,
				},
			},
		},
	)
	if err != nil {
		log.Debug().Err(err).Msg("Could not get response from OpenAI")
		return highlights, err
	}

	log.Printf("Total Tokens: %v", resp.Usage.TotalTokens)
	// Fix: convert to float64 BEFORE dividing by 1000. The previous
	// integer division (PromptTokens/1000) truncated counts below 1000 to
	// zero, systematically under-reporting the price.
	log.Printf("Price for this request: $%v",
		float64(resp.Usage.PromptTokens)/1000*promptPrice+float64(resp.Usage.CompletionTokens)/1000*completionPrice)

	if len(resp.Choices) == 0 {
		return highlights, nil
	}
	highlights, err = jsonToHighlights(resp.Choices[0].Message.Content)
	if err != nil {
		return highlights, err
	}
	return highlights, nil
}
// jsonToHighlights decodes a model response into TosReview highlights.
// The response may arrive wrapped in a markdown ```json fence, which is
// stripped first; the reviews are then read from the top-level "analysis"
// key. A missing "analysis" key is reported as an error.
func jsonToHighlights(jsonStr string) ([]database.TosReview, error) {
	cleaned := strings.TrimSuffix(strings.TrimPrefix(jsonStr, "```json\n"), "\n```")

	var highlights []database.TosReview

	// Decode lazily so only the "analysis" member has to match the
	// TosReview schema.
	var raw map[string]json.RawMessage
	if err := json.Unmarshal([]byte(cleaned), &raw); err != nil {
		return highlights, err
	}

	analysis, found := raw["analysis"]
	if !found {
		return highlights, errors.New("key 'analysis' not found in JSON")
	}
	if err := json.Unmarshal(analysis, &highlights); err != nil {
		return highlights, err
	}
	return highlights, nil
}
// highlights, err = jsonToHighlights(resp.Choices[0].Message.Content)
// if err != nil {
// return highlights, err
// }
// return highlights, nil
// }

34
src/utils/ai/utils.go Normal file
View File

@ -0,0 +1,34 @@
package ai
import (
"encoding/json"
"errors"
"strings"
"pluja.dev/kycnot.me/database"
)
// jsonToHighlights decodes a model response into TosReview highlights.
// The response may arrive wrapped in a markdown ```json fence, which is
// stripped first; the reviews are then read from the top-level "analysis"
// key. A missing "analysis" key is reported as an error.
func jsonToHighlights(jsonStr string) ([]database.TosReview, error) {
	cleaned := strings.TrimSuffix(strings.TrimPrefix(jsonStr, "```json\n"), "\n```")

	var highlights []database.TosReview

	// Decode lazily so only the "analysis" member has to match the
	// TosReview schema.
	var raw map[string]json.RawMessage
	if err := json.Unmarshal([]byte(cleaned), &raw); err != nil {
		return highlights, err
	}

	analysis, found := raw["analysis"]
	if !found {
		return highlights, errors.New("key 'analysis' not found in JSON")
	}
	if err := json.Unmarshal(analysis, &highlights); err != nil {
		return highlights, err
	}
	return highlights, nil
}

View File

@ -1,116 +0,0 @@
package tos_scraper
import (
"strings"
"time"
"github.com/rs/zerolog/log"
"pluja.dev/kycnot.me/config"
"pluja.dev/kycnot.me/database"
"pluja.dev/kycnot.me/utils/ai"
)
// InitTosScraperDaemon starts a background goroutine that runs the ToS
// scraper on the first day of every month. It is a no-op when the scraper
// is disabled in the configuration. In dev mode one scrape runs
// immediately and the scheduled monthly runs are skipped.
func InitTosScraperDaemon() {
	if !config.Conf.Scraper {
		log.Warn().Msg("Scraper is disabled")
		return
	}

	go func() {
		// Duration until the first day of the next month.
		nextMonth := time.Now().AddDate(0, 1, 0)
		firstDayNextMonth := time.Date(nextMonth.Year(), nextMonth.Month(), 1, 0, 0, 0, 0, nextMonth.Location())
		duration := time.Until(firstDayNextMonth)

		if config.Conf.Dev {
			log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Running scraper.")
			trigerScraping()
		}

		// Fix: reuse a single timer instead of allocating a fresh ticker on
		// every iteration. The old code created a new ticker each loop, so
		// its trailing ticker.Reset(duration) was dead code.
		timer := time.NewTimer(duration)
		defer timer.Stop()
		for {
			log.Info().Msgf("Next scraping in %v", duration)
			<-timer.C

			if config.Conf.Dev {
				log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Will not run scraper again.")
			} else {
				trigerScraping()
			}

			// Schedule the next run for the first day of the following month.
			firstDayNextMonth = firstDayNextMonth.AddDate(0, 1, 0)
			duration = time.Until(firstDayNextMonth)
			log.Info().Msgf("Next scraping in %v, at %v", duration, firstDayNextMonth)
			timer.Reset(duration)
		}
	}()
}
// trigerScraping runs one full ToS analysis pass: for every listed,
// non-pending service it fetches the first ToS URL, skips services
// reviewed within the last month, asks the AI for highlights, and
// persists them back to the database.
// NOTE(review): name looks like a typo of "triggerScraping"; kept as-is
// because callers reference it.
func trigerScraping() {
	log.Info().Msg("Starting ToS analysis...")
	// Get all the Services from the DB
	services, err := database.Pb.GetServices("pending=false && listed=true", "")
	if err != nil {
		log.Error().Err(err).Msg("Could not get services from DB")
		return
	}
	log.Debug().Msgf("Found %v services", len(services))
	// For each service, run the scraper
	for _, service := range services {
		log.Debug().Str("Name", service.Name).Msgf("Scraping ToS")
		if len(service.TosUrls) == 0 {
			log.Error().Msgf("Service %v has no ToS URL", service.Name)
			continue
		}
		// Only the first ToS URL is analyzed.
		html, err := GetBody(service.TosUrls[0])
		if err != nil {
			log.Error().Err(err).Msgf("Could not get HTML from %v", service.TosUrls[0])
			continue
		}
		// Check if service.LastTosReview is not older than one month, skip
		if service.LastTosReview != "" {
			log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
			// Parse error deliberately ignored: an unparsable timestamp
			// yields the zero time, which always reads as "older than one
			// month" and therefore forces a re-review.
			t, _ := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
			if time.Since(t) < 30*24*time.Hour {
				log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
				continue
			}
		}
		// Flatten all whitespace so the document fits compactly in the
		// model prompt.
		html = strings.ReplaceAll(html, "\n", " ")
		html = strings.ReplaceAll(html, "\t", " ")
		html = strings.ReplaceAll(html, "\r", " ")
		html = strings.TrimSpace(html)
		highlights, err := ai.GetTosReviews(html)
		if err != nil {
			log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
			continue
		}
		log.Debug().Msgf("Found %v highlights", len(highlights))
		service.TosReviews = highlights
		service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
		err = database.Pb.UpdateService(service.ID, service)
		if err != nil {
			log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
			continue
		}
		// Sleep 1 second
		time.Sleep(1 * time.Second)
	}
	log.Info().Msg("Finished scraping ToS")
}

View File

@ -0,0 +1,63 @@
package tos_scraper
import (
"time"
"github.com/rs/zerolog/log"
"pluja.dev/kycnot.me/database"
"pluja.dev/kycnot.me/utils/ai"
)
// TrigerScraping runs one full ToS analysis pass: for every listed,
// non-pending service it skips those reviewed within the last month,
// fetches the first ToS URL through the Jina reader proxy, asks Claude
// for highlights, and persists them back to the database.
// NOTE(review): name looks like a typo of "TriggerScraping"; kept as-is
// because callers (main init) reference it.
func TrigerScraping() {
	log.Info().Msg("Starting ToS analysis...")

	services, err := database.Pb.GetServices("pending=false && listed=true", "")
	if err != nil {
		log.Error().Err(err).Msg("Could not get services from DB")
		return
	}
	log.Debug().Msgf("Found %v services", len(services))

	// For each service, run the scraper
	for _, service := range services {
		log.Debug().Str("Name", service.Name).Msg("Scraping ToS")
		if len(service.TosUrls) == 0 {
			log.Error().Msgf("Service %v has no ToS URL", service.Name)
			continue
		}

		// Fix: check the review age BEFORE fetching the document, so we
		// don't hit the Jina proxy for services we are about to skip.
		if service.LastTosReview != "" {
			log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
			t, err := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
			if err != nil {
				// Fix: the parse error was silently discarded. An
				// unparsable timestamp now logs a warning and falls
				// through to a re-review — the same effective outcome the
				// zero time used to produce, but visible.
				log.Warn().Err(err).Msgf("Could not parse last review time for %v; re-reviewing", service.Name)
			} else if time.Since(t) < 30*24*time.Hour {
				log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
				continue
			}
		}

		// Only the first ToS URL is analyzed.
		body, err := GetWithJina(service.TosUrls[0])
		if err != nil {
			log.Error().Err(err).Msgf("Could not get body from %v", service.TosUrls[0])
			continue
		}

		highlights, err := ai.GetTosReviewsClaude(body)
		if err != nil {
			log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
			continue
		}
		log.Debug().Msgf("Found %v highlights", len(highlights))

		service.TosReviews = highlights
		service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
		if err := database.Pb.UpdateService(service.ID, service); err != nil {
			log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
			continue
		}
	}
	log.Info().Msg("Finished scraping ToS")
}

View File

@ -1,19 +1,9 @@
package tos_scraper
import (
"encoding/json"
"fmt"
"io"
"math/rand"
"net/http"
"net/http/cookiejar"
"regexp"
"strings"
"time"
"golang.org/x/net/html"
"pluja.dev/kycnot.me/utils"
)
type ScraperResponse struct {
@ -21,120 +11,22 @@ type ScraperResponse struct {
Length int `json:"length"`
}
// GetWithJina fetches a plain-text, reader-friendly rendering of url by
// proxying it through the Jina Reader service (https://r.jina.ai/).
// It replaces the old crawler-API GetBody helper.
func GetWithJina(url string) (string, error) {
	resp, err := http.Get("https://r.jina.ai/" + url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Fix: surface proxy/upstream failures instead of returning an error
	// page body as if it were the document.
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
	}

	// Read the response body as plain text.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// GetBodyHtml fetches the body content from a given URL, removes unwanted elements, and returns plain text.
// It spoofs browser-like request headers (random desktop User-Agent,
// Accept/Referrer/etc.) and keeps cookies in a jar, then parses the HTML
// and returns its whitespace-compressed text content.
func GetBodyHtml(url string) (string, error) {
	// Set user agent to avoid being blocked by Cloudflare
	jar, _ := cookiejar.New(nil)
	client := &http.Client{
		Jar: jar,
	}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", err
	}
	// Pool of plausible desktop browser User-Agent strings; one is picked
	// at random per request.
	userAgents := []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
	}
	s := rand.NewSource(time.Now().UnixNano())
	r := rand.New(s)
	req.Header.Set(
		"User-Agent",
		userAgents[r.Intn(len(userAgents))],
	)
	req.Header.Set(
		"Accept",
		"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
	)
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")
	req.Header.Set("Cache-Control", "no-cache")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("DNT", "1")
	// Pretend the visit came from a Google search for the URL itself.
	req.Header.Set("Referrer", fmt.Sprintf("https://www.google.com/search?q=%v", url))
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		return "", err
	}
	// Extract and return the text content
	textContent := extractTextContent(doc)
	// Compress whitespace in the resulting text
	compressedText := compressWhitespace(textContent)
	return compressedText, nil
}
// extractTextContent traverses the HTML DOM and returns the concatenated
// text of all text nodes, skipping non-content elements (script, style,
// nav, footer, header, head).
// Fix: accumulates into a single strings.Builder instead of recursive
// string concatenation, which rebuilt ever-larger strings at every level
// of the tree (quadratic in document size).
func extractTextContent(n *html.Node) string {
	var b strings.Builder
	collectText(n, &b)
	return b.String()
}

// collectText appends the text content of n and its descendants to b.
func collectText(n *html.Node, b *strings.Builder) {
	if n.Type == html.TextNode {
		b.WriteString(n.Data)
		return
	}
	if n.Type == html.ElementNode {
		// Skip script and style elements (and other non-content chrome).
		switch n.Data {
		case "script", "style", "nav", "footer", "header", "head":
			return
		}
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		collectText(c, b)
	}
}
// whitespaceRE matches any run of whitespace characters.
// Fix: compiled once at package level instead of on every call.
var whitespaceRE = regexp.MustCompile(`\s+`)

// compressWhitespace trims the input and collapses every run of
// whitespace into a single space.
// Fix: a stray diff-residue line ("return responseText, nil") that made
// this function unparsable has been removed.
func compressWhitespace(input string) string {
	return whitespaceRE.ReplaceAllString(strings.TrimSpace(input), " ")
}