kycnot.me/crawler/index.js

52 lines
1.4 KiB
JavaScript
Raw Normal View History

2024-01-27 06:42:12 +01:00
const express = require('express');
const { PlaywrightCrawler } = require('crawlee');
const cheerio = require('cheerio');
// Express application on which the HTTP routes of this service are registered.
const app = express();
// TCP port the HTTP server is bound to when `app.listen` is called.
const port = 3011;
// Body HTML of finished page loads, keyed by the unique key of the crawl
// request that produced it. Storing results per request (instead of in one
// shared variable) keeps one HTTP request from ever seeing another's page.
const crawlResults = new Map();

// Monotonic counter used to build a distinct key for every crawl request.
let crawlCounter = 0;

// Tail of the chain of crawl jobs; a new job starts only after the previous
// one has settled, because a crawler instance cannot be run while it is
// already running.
let crawlChain = Promise.resolve();

const crawler = new PlaywrightCrawler({
  // Stores the rendered <body> markup under the request's unique key.
  async requestHandler({ page, request }) {
    crawlResults.set(request.uniqueKey, await page.innerHTML('body'));
  },
});

/**
 * Tells whether a query-string value is a usable absolute http(s) URL.
 *
 * NOTE: this only checks the syntax and the scheme. It does not stop requests
 * to internal/private hosts; restrict that at the network level if this
 * service is reachable by untrusted clients.
 *
 * @param {unknown} value - Raw value taken from the query string.
 * @returns {boolean} `true` for a string that parses as an http/https URL.
 */
function isHttpUrl(value) {
  // Repeated or bracketed query parameters are parsed into arrays/objects.
  if (typeof value !== 'string') {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === 'http:' || protocol === 'https:';
  } catch {
    return false;
  }
}

/**
 * Loads `url` in the crawler and returns the HTML of its <body>.
 *
 * Jobs are executed one after another. Every job gets its own unique key so
 * that a URL which was requested before is crawled again instead of being
 * treated as already handled by the request queue.
 *
 * @param {string} url - Absolute URL of the page to load.
 * @returns {Promise<string|undefined>} The body HTML, or `undefined` when the
 *   crawler finished without the page ever reaching the request handler.
 */
function crawlPage(url) {
  const job = crawlChain.then(async () => {
    crawlCounter += 1;
    const uniqueKey = `${Date.now()}-${crawlCounter}`;
    try {
      await crawler.run([{ url, uniqueKey }]);
      return crawlResults.get(uniqueKey);
    } finally {
      // Never keep page contents around after the job is over.
      crawlResults.delete(uniqueKey);
    }
  });
  // A failed job must not block the following ones; the failure itself is
  // still delivered to the caller through `job`.
  crawlChain = job.catch(() => {});
  return job;
}

/**
 * Reduces an HTML fragment to its bare structure and text: drops layout and
 * media elements and removes every attribute from the remaining elements.
 *
 * @param {string} html - HTML to clean.
 * @returns {string} Serialized cleaned document.
 */
function cleanHtml(html) {
  const $ = cheerio.load(html);
  $('header, footer, script, style, svg, img, video').remove();
  $('*').each((_, element) => {
    // Iterate over a snapshot of the names because removal mutates `attribs`.
    for (const attr of Object.keys(element.attribs)) {
      $(element).removeAttr(attr);
    }
  });
  return $.html();
}

/**
 * GET /scrap?url=<page> - renders the page in a browser and responds with
 * `{ content, length }`, where `content` is the cleaned HTML.
 *
 * Responds 400 when `url` is missing or is not an http(s) URL, 502 when the
 * page could not be loaded, and 500 on any other error.
 */
app.get('/scrap', async (req, res) => {
  const { url } = req.query;
  if (!url) {
    return res.status(400).json({ error: "No URL provided" });
  }
  if (!isHttpUrl(url)) {
    return res.status(400).json({ error: "Invalid URL provided" });
  }
  try {
    const html = await crawlPage(url);
    if (html === undefined) {
      // The crawler run resolves even if the page never loaded; report that
      // instead of answering with empty or unrelated content.
      return res.status(502).json({ error: "Failed to fetch the requested URL" });
    }
    const cleanedContent = cleanHtml(html);
    res.json({
      content: cleanedContent,
      length: cleanedContent.length,
    });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
/** Logs the local address once the HTTP server accepts connections. */
function announceServerReady() {
  console.log(`Server running on http://localhost:${port}`);
}

// Bind the Express app to the configured port.
app.listen(port, announceServerReady);