mirror of
https://codeberg.org/pluja/kycnot.me
synced 2025-08-12 08:20:35 -04:00
massive update
This commit is contained in:
parent
6decdcb4fb
commit
c64ea21904
46 changed files with 5663 additions and 834 deletions
51
crawler/index.js
Normal file
51
crawler/index.js
Normal file
|
@ -0,0 +1,51 @@
|
|||
const express = require('express');
|
||||
const { PlaywrightCrawler } = require('crawlee');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
// Express application that exposes the crawler over HTTP.
const app = express();

// TCP port the HTTP server listens on.
const port = 3011;

// Body HTML captured by the most recent crawl. It is written by the crawler's
// request handler and read back by the /scrap route.
let globalContent = "";
|
||||
|
||||
// Shared Playwright-backed crawler. For every page it visits, it stores the
// inner HTML of <body> in the module-level `globalContent` and also returns it.
const crawler = new PlaywrightCrawler({
  requestHandler: async (context) => {
    const bodyHtml = await context.page.innerHTML('body');
    globalContent = bodyHtml;
    return bodyHtml;
  },
});
|
||||
|
||||
// Tail of the chain of crawl jobs. `crawler` and `globalContent` are shared by
// every HTTP request, so crawls are chained here to run strictly one at a time.
let crawlQueue = Promise.resolve();

// Monotonic counter mixed into each crawl request's uniqueKey.
let crawlCount = 0;

/**
 * Parse a raw query value as an absolute http(s) URL.
 *
 * @param {unknown} value - Raw `url` query parameter (may be a string, an
 *   array or an object depending on how the query string was written).
 * @returns {URL|null} The parsed URL, or null when `value` is not a string,
 *   is not a valid absolute URL, or uses a scheme other than http/https.
 */
function parseHttpUrl(value) {
  if (typeof value !== 'string') {
    return null;
  }

  let parsed;
  try {
    parsed = new URL(value);
  } catch {
    return null;
  }

  // The URL is handed to a real browser, so schemes such as file: must not
  // be reachable from an untrusted query parameter.
  const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
  return isHttp ? parsed : null;
}

/**
 * Crawl `url` with the shared crawler and return the captured body HTML.
 *
 * @param {string} url - Absolute http(s) URL to visit.
 * @returns {Promise<string|null>} The HTML stored by the crawler's request
 *   handler, or null when the handler never stored anything for this crawl.
 *   Rejects if `crawler.run` rejects.
 */
function crawlBody(url) {
  const job = crawlQueue.then(async () => {
    // Clear the shared buffer first so a crawl whose handler never succeeds
    // cannot leak the HTML captured for a previous request.
    globalContent = null;
    crawlCount += 1;

    // Crawlee de-duplicates requests by uniqueKey; a fresh key per crawl
    // keeps a repeated URL from being treated as already handled.
    await crawler.run([
      { url, uniqueKey: `${crawlCount}:${Date.now()}:${url}` },
    ]);

    return globalContent;
  });

  // Keep the chain usable after a failed job. The failure itself is still
  // delivered to the caller through `job`.
  crawlQueue = job.catch(() => {});
  return job;
}

/**
 * Reduce an HTML string to bare structural markup: drops header, footer,
 * script, style, svg, img and video elements, then removes every attribute
 * from the elements that remain.
 *
 * @param {string} html - HTML to clean.
 * @returns {string} Serialized cleaned document.
 */
function stripMarkup(html) {
  const $ = cheerio.load(html);
  $('header, footer, script, style, svg, img, video').remove();

  $('*').each((_, element) => {
    for (const name of Object.keys(element.attribs)) {
      $(element).removeAttr(name);
    }
  });

  return $.html();
}

/**
 * GET /scrap?url=<http(s) URL>
 *
 * Responds with `{ content, length }` holding the cleaned HTML of the page.
 * Status codes: 400 for a missing or unsupported URL, 502 when the crawl
 * finished without capturing any content, 500 for unexpected errors.
 */
app.get('/scrap', async (req, res) => {
  const { url } = req.query;

  if (!url) {
    return res.status(400).json({ error: "No URL provided" });
  }

  const target = parseHttpUrl(url);
  if (target === null) {
    return res
      .status(400)
      .json({ error: "Invalid URL: only http and https URLs are supported" });
  }

  try {
    const html = await crawlBody(target.href);

    if (html === null) {
      return res
        .status(502)
        .json({ error: "Failed to retrieve content from the URL" });
    }

    const cleanedContent = stripMarkup(html);

    res.json({
      content: cleanedContent,
      length: cleanedContent.length,
    });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
|
||||
|
||||
|
||||
// Start the HTTP server and log the local address once it is listening.
app.listen(port, function onListening() {
  const address = `http://localhost:${port}`;
  console.log(`Server running on ${address}`);
});
|
Loading…
Add table
Add a link
Reference in a new issue