mirror of
https://codeberg.org/pluja/kycnot.me
synced 2024-10-01 01:05:59 -04:00
52 lines
1.4 KiB
JavaScript
52 lines
1.4 KiB
JavaScript
const express = require('express');
|
|
const { PlaywrightCrawler } = require('crawlee');
|
|
const cheerio = require('cheerio');
|
|
|
|
// TCP port the HTTP server listens on.
const port = 3011;

// Express application that exposes the scraping endpoint.
const app = express();

// Hand-off buffer between the crawler and the HTTP layer: the crawler's
// request handler writes the scraped <body> HTML here, and the /scrap route
// reads it back once the crawl has finished.
let globalContent = "";
|
|
|
|
// Shared Playwright-backed crawler. For every page it handles, it captures
// the inner HTML of <body> and publishes it through `globalContent`, which is
// where the /scrap route picks it up after `crawler.run()` resolves.
const crawler = new PlaywrightCrawler({
    requestHandler: async (context) => {
        const bodyHtml = await context.page.innerHTML('body');
        globalContent = bodyHtml;
        return bodyHtml;
    },
});
|
|
|
|
// Tail of the chain of pending crawls. The crawler instance and
// `globalContent` are shared by every request, so crawls are run one at a
// time to keep one request from overwriting another request's result.
let pendingCrawl = Promise.resolve();

/**
 * Parse a user-supplied value and accept it only as an absolute http(s) URL.
 *
 * @param {unknown} rawUrl - Value taken from the query string; may be an
 *   array or object when the parameter is repeated or nested.
 * @returns {URL|null} The parsed URL, or null when the value is not a string,
 *   cannot be parsed, or uses a scheme other than http/https.
 */
function parseHttpUrl(rawUrl) {
    if (typeof rawUrl !== 'string') {
        return null;
    }

    let parsed;
    try {
        parsed = new URL(rawUrl);
    } catch {
        return null;
    }

    // Refuse schemes such as file: so the browser is only sent to web pages.
    return parsed.protocol === 'http:' || parsed.protocol === 'https:' ? parsed : null;
}

/**
 * Crawl a single URL and return the <body> HTML captured by the crawler.
 *
 * @param {string} url - Absolute URL to crawl.
 * @returns {Promise<string|null>} The captured HTML, or null when the crawl
 *   finished without the request handler storing any content.
 */
function crawlBody(url) {
    const task = pendingCrawl.then(async () => {
        // Clear the previous result first; otherwise a crawl that never
        // reaches the request handler would hand back the last page's HTML.
        globalContent = null;
        await crawler.run([url]);
        return globalContent;
    });

    // A failed crawl must not block later ones. The rejection is still
    // delivered to the caller through `task`; only the queue ignores it.
    pendingCrawl = task.catch(() => {});

    return task;
}

/**
 * Reduce an HTML fragment to bare structure and text.
 *
 * @param {string} html - HTML to clean.
 * @returns {string} The HTML with header, footer, script, style, svg, img and
 *   video elements removed and every attribute stripped from what remains.
 */
function stripMarkup(html) {
    const $ = cheerio.load(html);

    $('header, footer, script, style, svg, img, video').remove();

    $('*').each((_index, element) => {
        // Snapshot the names first: removeAttr mutates `attribs` as it goes.
        for (const attr of Object.keys(element.attribs)) {
            $(element).removeAttr(attr);
        }
    });

    return $.html();
}

/**
 * GET /scrap?url=<absolute http(s) URL>
 *
 * Crawls the given page and responds with `{ content, length }`, where
 * `content` is the cleaned HTML and `length` its length in characters.
 *
 * Error responses (all `{ error }`):
 *   400 - `url` is missing or is not an absolute http(s) URL.
 *   502 - the crawl completed without capturing any page content.
 *   500 - the crawl or the cleaning step threw.
 */
app.get('/scrap', async (req, res) => {
    const url = req.query.url;

    if (!url) {
        return res.status(400).json({ error: "No URL provided" });
    }

    const target = parseHttpUrl(url);
    if (target === null) {
        return res.status(400).json({ error: "Invalid URL: expected an absolute http(s) URL" });
    }

    try {
        const rawHtml = await crawlBody(target.href);

        if (rawHtml === null) {
            return res.status(502).json({ error: "Failed to retrieve content from the provided URL" });
        }

        const cleanedContent = stripMarkup(rawHtml);

        res.json({
            content: cleanedContent,
            length: cleanedContent.length,
        });
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});
|
|
|
|
|
|
// Start accepting HTTP connections on `port` and log the local address once
// the server is listening.
const announceReady = () => {
    console.log(`Server running on http://localhost:${port}`);
};

app.listen(port, announceReady);
|