mirror of
https://codeberg.org/pluja/kycnot.me
synced 2025-02-05 18:05:22 -05:00
update scraper to run with jina and claude
This commit is contained in:
parent
69b6e8bf3b
commit
717d56d747
@ -29,7 +29,6 @@ func init() {
|
||||
config.Conf = config.Config{
|
||||
Dev: *dev,
|
||||
Cache: *nocache,
|
||||
Scraper: *scraper,
|
||||
ListenAddr: *listen,
|
||||
}
|
||||
|
||||
@ -68,10 +67,9 @@ func init() {
|
||||
log.Info().Msg("Initializing database.")
|
||||
database.InitPocketbase()
|
||||
|
||||
// AI ToS Scraper init
|
||||
if config.Conf.Scraper {
|
||||
log.Info().Msg("Initializing AI ToS scraper.")
|
||||
tos_scraper.InitTosScraperDaemon()
|
||||
if *scraper {
|
||||
tos_scraper.TrigerScraping()
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
// Maintenance init
|
||||
|
126
src/utils/ai/claude.go
Normal file
126
src/utils/ai/claude.go
Normal file
@ -0,0 +1,126 @@
|
||||
package ai
|
||||
|
||||
import (
	"bytes"
	"encoding/json"
	"errors"
	"io"
	"net/http"
	"os"
	"time"

	"github.com/rs/zerolog/log"

	"pluja.dev/kycnot.me/database"
	"pluja.dev/kycnot.me/utils"
)
|
||||
|
||||
type AnthropicMessage struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
type AnthropicMessagePartResponse struct {
|
||||
Type string `json:"type"`
|
||||
Text string `json:"text"`
|
||||
|
||||
// Optional fields, only present for tools responses
|
||||
ID string `json:"id,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
Input map[string]interface{} `json:"input,omitempty"`
|
||||
}
|
||||
|
||||
type AnthropicMessageResponse struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
Role string `json:"role"`
|
||||
Content []AnthropicMessagePartResponse `json:"content"`
|
||||
StopReason string `json:"stop_reason"`
|
||||
Stop string `json:"stop"`
|
||||
StopSequence string `json:"stop_sequence"`
|
||||
}
|
||||
|
||||
func GetTosReviewsClaude(text string) ([]database.TosReview, error) {
|
||||
if len(text) < 10 {
|
||||
return nil, errors.New("empty text")
|
||||
}
|
||||
|
||||
var highlights []database.TosReview
|
||||
|
||||
messages := []AnthropicMessage{
|
||||
{
|
||||
Role: "user",
|
||||
Content: text,
|
||||
},
|
||||
}
|
||||
|
||||
resp, err := getResponse(messages)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Could not get response from OpenAI")
|
||||
return highlights, err
|
||||
}
|
||||
|
||||
if len(resp.Content) == 0 {
|
||||
return highlights, nil
|
||||
}
|
||||
|
||||
log.Debug().Msgf("TEXT RESP: \n %s", resp.Content[0].Text)
|
||||
highlights, err = jsonToHighlights(resp.Content[0].Text)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Json error")
|
||||
return highlights, err
|
||||
}
|
||||
return highlights, nil
|
||||
}
|
||||
|
||||
func getResponse(messages []AnthropicMessage) (AnthropicMessageResponse, error) {
|
||||
apiKey := os.Getenv("ANTHROPIC_API_KEY")
|
||||
model := utils.Getenv("ANTHROPIC_MODEL", "claude-3-sonnet-20240229")
|
||||
|
||||
url := "https://api.anthropic.com/v1/messages"
|
||||
|
||||
data := map[string]interface{}{
|
||||
"model": model,
|
||||
"system": sysPrompt,
|
||||
"max_tokens": 4096,
|
||||
"messages": messages,
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(data)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Could not marshal data into json")
|
||||
return AnthropicMessageResponse{}, err
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Request error")
|
||||
return AnthropicMessageResponse{}, err
|
||||
}
|
||||
|
||||
req.Header.Set("x-api-key", apiKey)
|
||||
req.Header.Set("anthropic-version", "2023-06-01")
|
||||
req.Header.Set("content-type", "application/json")
|
||||
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Request error")
|
||||
return AnthropicMessageResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Body read error")
|
||||
return AnthropicMessageResponse{}, err
|
||||
}
|
||||
|
||||
var response AnthropicMessageResponse
|
||||
err = json.Unmarshal(body, &response)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msgf("Could not unmarshal data into struct")
|
||||
return AnthropicMessageResponse{}, err
|
||||
}
|
||||
|
||||
return response, nil
|
||||
}
|
@ -1,146 +1,49 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"strings"
|
||||
// func GetTosReviews(text string) ([]database.TosReview, error) {
|
||||
// if len(text) < 10 {
|
||||
// return nil, errors.New("empty text")
|
||||
// }
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
openai "github.com/sashabaranov/go-openai"
|
||||
// promptPrice := 0.01
|
||||
// completionPrice := 0.03
|
||||
// model := openai.GPT4Turbo1106
|
||||
|
||||
"pluja.dev/kycnot.me/database"
|
||||
)
|
||||
// // Call the OpenAI API
|
||||
// var highlights []database.TosReview
|
||||
// client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
|
||||
// resp, err := client.CreateChatCompletion(
|
||||
// context.Background(),
|
||||
// openai.ChatCompletionRequest{
|
||||
// Model: model,
|
||||
// ResponseFormat: &openai.ChatCompletionResponseFormat{Type: openai.ChatCompletionResponseFormatTypeJSONObject},
|
||||
// Messages: []openai.ChatCompletionMessage{
|
||||
// {
|
||||
// Role: openai.ChatMessageRoleSystem,
|
||||
// Content: sysPrompt,
|
||||
// },
|
||||
// {
|
||||
// Role: openai.ChatMessageRoleUser,
|
||||
// Content: text,
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// )
|
||||
// if err != nil {
|
||||
// log.Debug().Err(err).Msg("Could not get response from OpenAI")
|
||||
// return highlights, err
|
||||
// }
|
||||
|
||||
// sysPrompt is the system prompt sent with every ToS-analysis request. It
// instructs the model to answer with a fixed JSON structure whose "analysis"
// array is parsed by jsonToHighlights.
var sysPrompt = `As a specialized analyzer of Terms and Conditions and Privacy Policies of cryptocurrency exchanges and services, your task is to review the provided legal documents in HTML format. Adhere to the following guidelines:

1. Condense the information, avoid redundancies, and preserve meaning.
2. Translate complex legal jargon into simpler terms, ensuring both accuracy and brevity.
3. You are a privacy advocate, and you think privacy is a human right.
4. Never make your own assumptions.
5. If the document does not explicitly mention something, the 'warning' field MUST be 'false'.
6. Take a deep breath and work on this problem step-by-step.

Use this JSON structure for your analysis and do not add any additional items:

{
	"analysis": [
		{
			"title": "Transaction Monitoring",
			"task": "Determine if the service monitors user transactions or activities, in regards to cryptocurrency. If not, warning is false.",
			"warning": boolean,
			"details": string, // A description in regard to the title and task. Provide citations when possible. Never make assumptions.
			"section": string // The section(s) where you got this from.
		},
		{
			"title": "User Identification",
			"task": "Determine if users are required to verify their identity. If not, warning is false.",
			// ...
		},
		{
			"title": "3rd Party Data Sharing",
			"task": "Determine if the service shares user data with third parties. If not, warning is false."
			// ...
		},
		{
			"title": "Data sharing with authorities",
			"task": "Determine if user data is shared with authorities, law enforcement, or government agencies. If not, warning is false."
			// ...
		},
		{
			"title": "Logging",
			"task": "Determine if the service logs user data, including IP addresses and/or transactions. If not, warning is false."
			// ...
		},
		{
			"title": "Blocking of funds",
			"task": "Determine if the service can block or freeze the user funds, in relation to money or cryptocurrency. If not, warning is false."
			// ...
		},
		{
			"title": "Account termination/blocking",
			"task": "Determine if user accounts could be terminated or blocked, in cases such as suspicious source of funds or transactions. If not, warning is false."
			// ...
		},
		{
			"title": "Transaction flagging",
			"task": "Determine if the service has a system for flagging suspicious money/cryptocurrency transactions. If not, warning is false."
			// ...
		}
	]
}
`
|
||||
|
||||
func GetTosReviews(text string) ([]database.TosReview, error) {
|
||||
if len(text) < 10 {
|
||||
return nil, errors.New("empty text")
|
||||
}
|
||||
|
||||
promptPrice := 0.01
|
||||
completionPrice := 0.03
|
||||
model := openai.GPT4Turbo1106
|
||||
|
||||
// Call the OpenAI API
|
||||
var highlights []database.TosReview
|
||||
client := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
|
||||
resp, err := client.CreateChatCompletion(
|
||||
context.Background(),
|
||||
openai.ChatCompletionRequest{
|
||||
Model: model,
|
||||
ResponseFormat: &openai.ChatCompletionResponseFormat{Type: openai.ChatCompletionResponseFormatTypeJSONObject},
|
||||
Messages: []openai.ChatCompletionMessage{
|
||||
{
|
||||
Role: openai.ChatMessageRoleSystem,
|
||||
Content: sysPrompt,
|
||||
},
|
||||
{
|
||||
Role: openai.ChatMessageRoleUser,
|
||||
Content: text,
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
log.Debug().Err(err).Msg("Could not get response from OpenAI")
|
||||
return highlights, err
|
||||
}
|
||||
|
||||
log.Printf("Total Tokens: %v", resp.Usage.TotalTokens)
|
||||
log.Printf("Price for this request: $%v", float64(resp.Usage.PromptTokens/1000)*promptPrice+float64(resp.Usage.CompletionTokens/1000)*completionPrice)
|
||||
|
||||
if len(resp.Choices) == 0 {
|
||||
return highlights, nil
|
||||
}
|
||||
|
||||
highlights, err = jsonToHighlights(resp.Choices[0].Message.Content)
|
||||
if err != nil {
|
||||
return highlights, err
|
||||
}
|
||||
return highlights, nil
|
||||
}
|
||||
|
||||
func jsonToHighlights(jsonStr string) ([]database.TosReview, error) {
|
||||
jsonStr = strings.TrimPrefix(jsonStr, "```json\n")
|
||||
jsonStr = strings.TrimSuffix(jsonStr, "\n```")
|
||||
var highlights []database.TosReview
|
||||
// Extract the `analysis` array from the JSON string and unmarshal it into the highlights slice.
|
||||
|
||||
var m map[string]json.RawMessage // use RawMessage for delayed decoding
|
||||
err := json.Unmarshal([]byte(jsonStr), &m)
|
||||
if err != nil {
|
||||
return highlights, err
|
||||
}
|
||||
|
||||
tosAnalysis, ok := m["analysis"]
|
||||
if !ok {
|
||||
return highlights, errors.New("key 'analysis' not found in JSON")
|
||||
}
|
||||
|
||||
err = json.Unmarshal(tosAnalysis, &highlights)
|
||||
if err != nil {
|
||||
return highlights, err
|
||||
}
|
||||
|
||||
return highlights, nil
|
||||
}
|
||||
// highlights, err = jsonToHighlights(resp.Choices[0].Message.Content)
|
||||
// if err != nil {
|
||||
// return highlights, err
|
||||
// }
|
||||
// return highlights, nil
|
||||
// }
|
||||
|
34
src/utils/ai/utils.go
Normal file
34
src/utils/ai/utils.go
Normal file
@ -0,0 +1,34 @@
|
||||
package ai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"strings"
|
||||
|
||||
"pluja.dev/kycnot.me/database"
|
||||
)
|
||||
|
||||
// jsonToHighlights parses a (possibly markdown-fenced) JSON document and
// extracts its "analysis" array into a slice of ToS reviews.
//
// Returns the (possibly empty) slice and an error when the input is not
// valid JSON or has no top-level "analysis" key.
func jsonToHighlights(jsonStr string) ([]database.TosReview, error) {
	// Models often wrap their JSON answer in a ```json fence; strip it.
	jsonStr = strings.TrimPrefix(jsonStr, "```json\n")
	jsonStr = strings.TrimSuffix(jsonStr, "\n```")
	var highlights []database.TosReview
	// Extract the `analysis` array from the JSON string and unmarshal it into the highlights slice.

	var m map[string]json.RawMessage // use RawMessage for delayed decoding
	err := json.Unmarshal([]byte(jsonStr), &m)
	if err != nil {
		return highlights, err
	}

	tosAnalysis, ok := m["analysis"]
	if !ok {
		return highlights, errors.New("key 'analysis' not found in JSON")
	}

	err = json.Unmarshal(tosAnalysis, &highlights)
	if err != nil {
		return highlights, err
	}

	return highlights, nil
}
|
@ -1,116 +0,0 @@
|
||||
package tos_scraper
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"pluja.dev/kycnot.me/config"
|
||||
"pluja.dev/kycnot.me/database"
|
||||
"pluja.dev/kycnot.me/utils/ai"
|
||||
)
|
||||
|
||||
func InitTosScraperDaemon() {
|
||||
if !config.Conf.Scraper {
|
||||
log.Warn().Msg("Scraper is disabled")
|
||||
return
|
||||
}
|
||||
go func() {
|
||||
// Calculate the duration until the first day of the next month
|
||||
nextMonth := time.Now().AddDate(0, 1, 0)
|
||||
firstDayNextMonth := time.Date(nextMonth.Year(), nextMonth.Month(), 1, 0, 0, 0, 0, nextMonth.Location())
|
||||
duration := time.Until(firstDayNextMonth)
|
||||
if config.Conf.Dev {
|
||||
log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Running scraper.")
|
||||
trigerScraping()
|
||||
}
|
||||
for {
|
||||
log.Info().Msgf("Next scraping in %v", duration)
|
||||
|
||||
// Set the ticker for that duration
|
||||
ticker := time.NewTicker(duration)
|
||||
|
||||
// Wait for the ticker to tick
|
||||
<-ticker.C
|
||||
|
||||
// Stop the ticker before resetting it
|
||||
ticker.Stop()
|
||||
|
||||
// Add your code here to run on the first day of every month
|
||||
if config.Conf.Dev {
|
||||
log.Debug().Bool("DevMode", config.Conf.Dev).Msg("Will not run scraper again.")
|
||||
} else {
|
||||
trigerScraping()
|
||||
}
|
||||
|
||||
// Reset ticker for next month
|
||||
firstDayNextMonth = firstDayNextMonth.AddDate(0, 1, 0)
|
||||
duration = time.Until(firstDayNextMonth)
|
||||
log.Info().Msgf("Next scraping in %v, at %v", duration, firstDayNextMonth)
|
||||
ticker.Reset(duration)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func trigerScraping() {
|
||||
log.Info().Msg("Starting ToS analysis...")
|
||||
// Get all the Services from the DB
|
||||
services, err := database.Pb.GetServices("pending=false && listed=true", "")
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Could not get services from DB")
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Found %v services", len(services))
|
||||
|
||||
// For each service, run the scraper
|
||||
for _, service := range services {
|
||||
log.Debug().Str("Name", service.Name).Msgf("Scraping ToS")
|
||||
if len(service.TosUrls) == 0 {
|
||||
log.Error().Msgf("Service %v has no ToS URL", service.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
html, err := GetBody(service.TosUrls[0])
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not get HTML from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if service.LastTosReview is not older than one month, skip
|
||||
if service.LastTosReview != "" {
|
||||
log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
|
||||
t, _ := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
|
||||
if time.Since(t) < 30*24*time.Hour {
|
||||
log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
html = strings.ReplaceAll(html, "\n", " ")
|
||||
html = strings.ReplaceAll(html, "\t", " ")
|
||||
html = strings.ReplaceAll(html, "\r", " ")
|
||||
html = strings.TrimSpace(html)
|
||||
|
||||
highlights, err := ai.GetTosReviews(html)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Found %v highlights", len(highlights))
|
||||
service.TosReviews = highlights
|
||||
service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
|
||||
err = database.Pb.UpdateService(service.ID, service)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
|
||||
// Sleep 1 second
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
log.Info().Msg("Finished scraping ToS")
|
||||
}
|
63
src/utils/tos_scraper/main.go
Normal file
63
src/utils/tos_scraper/main.go
Normal file
@ -0,0 +1,63 @@
|
||||
package tos_scraper
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"pluja.dev/kycnot.me/database"
|
||||
"pluja.dev/kycnot.me/utils/ai"
|
||||
)
|
||||
|
||||
func TrigerScraping() {
|
||||
log.Info().Msg("Starting ToS analysis...")
|
||||
services, err := database.Pb.GetServices("pending=false && listed=true", "")
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Could not get services from DB")
|
||||
return
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Found %v services", len(services))
|
||||
|
||||
// For each service, run the scraper
|
||||
for _, service := range services {
|
||||
log.Debug().Str("Name", service.Name).Msgf("Scraping ToS")
|
||||
if len(service.TosUrls) == 0 {
|
||||
log.Error().Msgf("Service %v has no ToS URL", service.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
body, err := GetWithJina(service.TosUrls[0])
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not get body from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if service.LastTosReview is not older than one month, skip
|
||||
if service.LastTosReview != "" {
|
||||
log.Debug().Msgf("Last review for %s: %s", service.Name, service.LastTosReview)
|
||||
t, _ := time.Parse("2006-01-02 15:04:05.000Z", service.LastTosReview)
|
||||
if time.Since(t) < 30*24*time.Hour {
|
||||
log.Debug().Msgf("Service %v ToS review is not older than one month, skipping", service.Name)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
highlights, err := ai.GetTosReviewsClaude(body)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not parse ToS from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Found %v highlights", len(highlights))
|
||||
service.TosReviews = highlights
|
||||
service.LastTosReview = time.Now().Format("2006-01-02 15:04:05.000Z")
|
||||
err = database.Pb.UpdateService(service.ID, service)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msgf("Could not save ToS highlights from %v", service.TosUrls[0])
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
log.Info().Msg("Finished scraping ToS")
|
||||
}
|
@ -1,19 +1,9 @@
|
||||
package tos_scraper
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/http/cookiejar"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
|
||||
"pluja.dev/kycnot.me/utils"
|
||||
)
|
||||
|
||||
type ScraperResponse struct {
|
||||
@ -21,120 +11,22 @@ type ScraperResponse struct {
|
||||
Length int `json:"length"`
|
||||
}
|
||||
|
||||
// Using the crawler API, get the HTML content of a given URL.
|
||||
func GetBody(url string) (string, error) {
|
||||
// Send a GET request to os.Getenv("SCRAPER_API_URL")/scrap?url=url
|
||||
// Unmarshal the response and return the HTML content
|
||||
scraper_api := utils.Getenv("CRAWLER_API_URL", "http://localhost:3011")
|
||||
|
||||
// Request the HTML content
|
||||
resp, err := http.Get(fmt.Sprintf("%v/scrap?url=%v", scraper_api, url))
|
||||
func GetWithJina(url string) (string, error) {
|
||||
resp, err := http.Get("https://r.jina.ai/" + url)
|
||||
if err != nil {
|
||||
fmt.Println("Error making request:", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Read the response body
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Read the response body as plain text
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
fmt.Println("Error reading response body:", err)
|
||||
return "", err
|
||||
}
|
||||
// Store the response body in a variable
|
||||
responseText := string(body)
|
||||
|
||||
// Unmarshal the response
|
||||
var scraperResponse ScraperResponse
|
||||
err = json.Unmarshal(body, &scraperResponse)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Return the HTML content
|
||||
return scraperResponse.Content, nil
|
||||
}
|
||||
|
||||
// GetBodyHtml fetches the body content from a given URL, removes unwanted elements, and returns plain text.
|
||||
func GetBodyHtml(url string) (string, error) {
|
||||
// Set user agent to avoid being blocked by Cloudflare
|
||||
jar, _ := cookiejar.New(nil)
|
||||
client := &http.Client{
|
||||
Jar: jar,
|
||||
}
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
userAgents := []string{
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
|
||||
}
|
||||
|
||||
s := rand.NewSource(time.Now().UnixNano())
|
||||
r := rand.New(s)
|
||||
|
||||
req.Header.Set(
|
||||
"User-Agent",
|
||||
userAgents[r.Intn(len(userAgents))],
|
||||
)
|
||||
req.Header.Set(
|
||||
"Accept",
|
||||
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
||||
)
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
req.Header.Set("Cache-Control", "no-cache")
|
||||
req.Header.Set("Connection", "keep-alive")
|
||||
req.Header.Set("DNT", "1")
|
||||
req.Header.Set("Referrer", fmt.Sprintf("https://www.google.com/search?q=%v", url))
|
||||
req.Header.Set("Upgrade-Insecure-Requests", "1")
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 200 {
|
||||
return "", fmt.Errorf("status code error: %d %s", resp.StatusCode, resp.Status)
|
||||
}
|
||||
|
||||
doc, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Extract and return the text content
|
||||
textContent := extractTextContent(doc)
|
||||
|
||||
// Compress whitespace in the resulting text
|
||||
compressedText := compressWhitespace(textContent)
|
||||
|
||||
return compressedText, nil
|
||||
}
|
||||
|
||||
// extractTextContent traverses the HTML DOM and extracts concatenated text content.
|
||||
func extractTextContent(n *html.Node) string {
|
||||
if n.Type == html.TextNode {
|
||||
return n.Data
|
||||
}
|
||||
if n.Type == html.ElementNode {
|
||||
// Skip script and style elements
|
||||
if n.Data == "script" || n.Data == "style" || n.Data == "nav" || n.Data == "footer" || n.Data == "header" || n.Data == "head" {
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
var textContent string
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
textContent += extractTextContent(c)
|
||||
}
|
||||
|
||||
return textContent
|
||||
}
|
||||
|
||||
// whitespaceRE matches any run of whitespace characters.
// Fix: compiled once at package level instead of on every call.
var whitespaceRE = regexp.MustCompile(`\s+`)

// compressWhitespace replaces sequences of whitespace with a single space
// and trims leading/trailing whitespace.
func compressWhitespace(input string) string {
	return whitespaceRE.ReplaceAllString(strings.TrimSpace(input), " ")
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user