mirror of
https://codeberg.org/pluja/kycnot.me
synced 2025-01-07 13:17:57 -05:00
delete crawler in favor of jina-ai reader
This commit is contained in:
parent
c510868e38
commit
483ba8b415
8
crawler/.gitignore
vendored
8
crawler/.gitignore
vendored
@ -1,8 +0,0 @@
|
||||
# This file tells Git which files shouldn't be added to source control
|
||||
|
||||
.idea
|
||||
dist
|
||||
node_modules
|
||||
apify_storage
|
||||
crawlee_storage
|
||||
storage
|
@ -1,29 +0,0 @@
|
||||
# Specify the base Docker image. You can read more about
|
||||
# the available images at https://crawlee.dev/docs/guides/docker-images
|
||||
# You can also use any other image from Docker Hub.
|
||||
FROM apify/actor-node-playwright-chrome:18
|
||||
|
||||
# Copy just package.json and package-lock.json
|
||||
# to speed up the build using Docker layer cache.
|
||||
COPY --chown=myuser package*.json ./
|
||||
|
||||
# Install NPM packages, skip optional and development dependencies to
|
||||
# keep the image small. Avoid logging too much and print the dependency
|
||||
# tree for debugging
|
||||
RUN npm --quiet set progress=false \
|
||||
&& npm install --omit=dev --omit=optional \
|
||||
&& echo "Installed NPM packages:" \
|
||||
&& (npm list --omit=dev --all || true) \
|
||||
&& echo "Node.js version:" \
|
||||
&& node --version \
|
||||
&& echo "NPM version:" \
|
||||
&& npm --version
|
||||
|
||||
# Next, copy the remaining files and directories with the source code.
|
||||
# Since we do this after NPM install, quick build will be really fast
|
||||
# for most source file changes.
|
||||
COPY --chown=myuser . ./
|
||||
|
||||
|
||||
# Run the image.
|
||||
CMD node index.js
|
@ -1,24 +0,0 @@
|
||||
# Crawlee + PlaywrightCrawler + JavaScript
|
||||
|
||||
Crawler API for kycnot.me
|
||||
|
||||
## `GET /scrap`
|
||||
|
||||
### URL Query Parameters:
|
||||
|
||||
- `url`: URL of the page to scrape (required; the endpoint responds with HTTP 400 if it is missing)
|
||||
|
||||
### Response:
|
||||
|
||||
```json
|
||||
{
|
||||
"content": string,
|
||||
"length": number,
|
||||
}
|
||||
```
|
||||
|
||||
### Example:
|
||||
|
||||
```bash
|
||||
curl -X GET "http://localhost:3011/scrap?url=https://localmonero.co/nojs/faq"
|
||||
```
|
@ -1,51 +0,0 @@
|
||||
const express = require('express');
|
||||
const { PlaywrightCrawler } = require('crawlee');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const app = express();
const port = 3011;

// Elements that carry no readable text; they are dropped from the response.
const STRIPPED_SELECTOR = 'header, footer, script, style, svg, img, video';

// Body HTML captured by the crawler, keyed by the uniqueKey of the request
// that produced it. Keying by request (instead of one shared variable) means
// a scrape can never be answered with the content of a different scrape.
const capturedBodies = new Map();

const crawler = new PlaywrightCrawler({
    async requestHandler({ page, request }) {
        const content = await page.innerHTML('body');
        capturedBodies.set(request.uniqueKey, content);
        return content;
    }
});

// Tail of the queue of scheduled crawls. The single crawler instance is
// shared, so runs are executed strictly one after another.
let crawlChain = Promise.resolve();

// Monotonic counter used to build a distinct uniqueKey for every crawl.
let crawlSequence = 0;

/**
 * Validates the `url` query parameter.
 *
 * @param {unknown} rawUrl - Value taken from the query string.
 * @returns {string|null} The normalized absolute URL, or `null` when the value
 *   is not a single string holding a valid http(s) URL.
 */
function parseTargetUrl(rawUrl) {
    // Repeated `url` parameters are parsed into an array/object; reject those.
    if (typeof rawUrl !== 'string') {
        return null;
    }

    let parsed;
    try {
        parsed = new URL(rawUrl);
    } catch (error) {
        return null;
    }

    // The URL is untrusted and is opened in a real browser: refuse schemes such
    // as file: or chrome:. NOTE(review): hosts on the internal network are still
    // reachable through http(s); restrict them here if this API is exposed.
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        return null;
    }
    return parsed.href;
}

/**
 * Crawls a single URL and returns the inner HTML of its `<body>`.
 *
 * @param {string} url - Absolute http(s) URL to load.
 * @returns {Promise<string|undefined>} The body HTML, or `undefined` when the
 *   request handler never produced content for this crawl.
 */
function crawlBody(url) {
    crawlSequence += 1;
    // A fresh uniqueKey per crawl keeps a repeated URL from being treated as
    // already handled, and identifies this crawl's entry in capturedBodies.
    const uniqueKey = `${Date.now()}-${crawlSequence}-${url}`;

    const job = crawlChain.then(async () => {
        try {
            await crawler.run([{ url, uniqueKey }]);
            return capturedBodies.get(uniqueKey);
        } finally {
            capturedBodies.delete(uniqueKey);
        }
    });

    // Keep the queue usable after a failed crawl. The failure itself is still
    // reported to the caller through `job`.
    crawlChain = job.catch(() => {});
    return job;
}

/**
 * Removes non-content elements and every attribute from an HTML fragment.
 *
 * @param {string} html - Raw HTML to clean.
 * @returns {string} The cleaned HTML document as serialized by Cheerio.
 */
function cleanHtml(html) {
    const $ = cheerio.load(html);
    $(STRIPPED_SELECTOR).remove();
    $('*').each(function () {
        // Snapshot the attribute names first: removing while iterating the
        // live `attribs` object would skip entries.
        for (const attr of Object.keys(this.attribs)) {
            $(this).removeAttr(attr);
        }
    });
    return $.html();
}

// GET /scrap?url=<http(s) URL>
// Responds with { content, length } holding the cleaned body HTML of the page.
app.get('/scrap', async (req, res) => {
    const rawUrl = req.query.url;

    if (!rawUrl) {
        return res.status(400).json({ error: "No URL provided" });
    }

    const targetUrl = parseTargetUrl(rawUrl);
    if (targetUrl === null) {
        return res.status(400).json({ error: "Invalid URL: only http(s) URLs are supported" });
    }

    try {
        const body = await crawlBody(targetUrl);

        // No captured body means the page was never scraped. Report that
        // instead of answering with empty or unrelated content.
        if (body === undefined) {
            return res.status(502).json({ error: "Failed to fetch content from the provided URL" });
        }

        const cleanedContent = cleanHtml(body);
        res.json({
            content: cleanedContent,
            length: cleanedContent.length
        });
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});

app.listen(port, () => {
    console.log(`Server running on http://localhost:${port}`);
});
|
3675
crawler/package-lock.json
generated
3675
crawler/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -1,18 +0,0 @@
|
||||
{
|
||||
"name": "crawler",
|
||||
"version": "1.0.0",
|
||||
"description": "This template is a production ready boilerplate for developing with `PlaywrightCrawler`. Use this to bootstrap your projects using the most up-to-date code.",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"crawlee": "^3.7.2",
|
||||
"express": "^4.18.2",
|
||||
"playwright": "^1.41.1"
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user