const express = require('express');
const { PlaywrightCrawler } = require('crawlee');
const cheerio = require('cheerio');

const app = express();
const port = 3011;

// Elements that are stripped from the scraped page before it is returned.
const REMOVED_SELECTORS = 'header, footer, script, style, svg, img, video';

// Body HTML captured by the crawler for the crawl currently in progress.
// Crawls are serialized (see enqueueCrawl), so a single slot is sufficient;
// it is reset to null before each run so a failed crawl can never hand back
// the content of a previous request.
let capturedBody = null;

// Monotonic counter used to build a distinct uniqueKey for every crawl.
let crawlSequence = 0;

// Tail of the promise chain used to run crawls one at a time.
let crawlQueue = Promise.resolve();

const crawler = new PlaywrightCrawler({
  async requestHandler({ page }) {
    capturedBody = await page.innerHTML('body');
  },
});

/**
 * Runs `task` after every previously enqueued task has settled.
 *
 * The single shared crawler must not be started while it is already running,
 * so overlapping HTTP requests are queued instead of racing each other.
 *
 * @param {() => Promise<T>} task - Async work to run exclusively.
 * @returns {Promise<T>} Settles with the outcome of `task`.
 * @template T
 */
function enqueueCrawl(task) {
  const outcome = crawlQueue.then(task);
  // Keep the chain alive even when a task rejects; the caller still sees the
  // rejection through `outcome`.
  crawlQueue = outcome.catch(() => {});
  return outcome;
}

/**
 * Validates the `url` query parameter.
 *
 * @param {unknown} rawUrl - Value taken from `req.query.url`.
 * @returns {string | null} The normalized http(s) URL, or null when the value
 *   is not a string, cannot be parsed, or uses another scheme.
 */
function parseTargetUrl(rawUrl) {
  if (typeof rawUrl !== 'string') {
    return null;
  }
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch (error) {
    return null;
  }
  // NOTE(review): only the scheme is restricted here. Hosts on internal
  // networks are still reachable (SSRF); add an allow/deny list if this
  // service is exposed to untrusted callers.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return null;
  }
  return parsed.href;
}

/**
 * Crawls `url` with the shared Playwright crawler and returns the inner HTML
 * of its `<body>`.
 *
 * @param {string} url - Absolute http(s) URL to load.
 * @returns {Promise<string>} The page body HTML.
 * @throws {Error} When the crawl finishes without capturing any content.
 */
function fetchBodyHtml(url) {
  return enqueueCrawl(async () => {
    capturedBody = null;
    crawlSequence += 1;
    // Crawlee deduplicates requests by uniqueKey (derived from the URL by
    // default); a fresh key makes sure a repeated URL is crawled again.
    const uniqueKey = `${url}#${Date.now()}-${crawlSequence}`;
    await crawler.run([{ url, uniqueKey }]);

    // run() may resolve even though the handler never succeeded, so the
    // captured slot is checked explicitly.
    if (capturedBody === null) {
      throw new Error(`Failed to fetch content from ${url}`);
    }
    return capturedBody;
  });
}

/**
 * Removes layout/media/script elements and every attribute from the markup.
 *
 * @param {string} html - Raw HTML to clean.
 * @returns {string} The serialized, cleaned HTML document.
 */
function cleanHtml(html) {
  const $ = cheerio.load(html);
  $(REMOVED_SELECTORS).remove();
  $('*').each((_, element) => {
    // Snapshot the names first: removeAttr mutates `attribs` while we iterate.
    for (const name of Object.keys(element.attribs)) {
      $(element).removeAttr(name);
    }
  });
  return $.html();
}

// GET /scrap?url=<http(s) url>
// Responds with { content, length } on success, 400 on a missing/invalid URL
// and 500 when crawling or cleaning fails.
app.get('/scrap', async (req, res) => {
  const rawUrl = req.query.url;
  if (!rawUrl) {
    return res.status(400).json({ error: "No URL provided" });
  }

  const targetUrl = parseTargetUrl(rawUrl);
  if (targetUrl === null) {
    return res.status(400).json({ error: "Invalid URL: expected an absolute http(s) URL" });
  }

  try {
    const bodyHtml = await fetchBodyHtml(targetUrl);
    const cleanedContent = cleanHtml(bodyHtml);
    return res.json({ content: cleanedContent, length: cleanedContent.length });
  } catch (error) {
    return res.status(500).json({ error: error.message });
  }
});

app.listen(port, () => {
  console.log(`Server running on http://localhost:${port}`);
});