mirror of
https://codeberg.org/pluja/kycnot.me
synced 2025-01-07 13:17:57 -05:00
delete crawler in favor of jina-ai reader
This commit is contained in:
parent
c510868e38
commit
483ba8b415
8
crawler/.gitignore
vendored
8
crawler/.gitignore
vendored
@@ -1,8 +0,0 @@
# This file tells Git which files shouldn't be added to source control

.idea
dist
node_modules
apify_storage
crawlee_storage
storage
@@ -1,29 +0,0 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
    && npm install --omit=dev --omit=optional \
    && echo "Installed NPM packages:" \
    && (npm list --omit=dev --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image.
CMD node index.js
@@ -1,24 +0,0 @@
# Crawlee + PlaywrightCrawler + JavaScript

Crawler API for kycnot.me

## `GET /scrap`

### URL Query Parameters:

- `url`: URL to scrap

### Response:

```json
{
    "content": string,
    "length": number,
}
```

### Example:

```bash
curl -X GET "http://localhost:3011/scrap?url=https://localmonero.co/nojs/faq"
```
@@ -1,51 +0,0 @@
'use strict';

const express = require('express');
const { PlaywrightCrawler } = require('crawlee');
const cheerio = require('cheerio');

const app = express();
const port = 3011;

/**
 * Strip raw HTML down to its readable skeleton.
 *
 * Removes page chrome and non-text elements (header, footer, script,
 * style, svg, img, video) and erases every attribute from the remaining
 * elements, so only bare markup and text survive.
 *
 * @param {string} html - raw markup (e.g. a page's <body> inner HTML)
 * @returns {string} cleaned HTML document string
 */
function cleanHtml(html) {
  const $ = cheerio.load(html);
  $('header, footer, script, style, svg, img, video').remove();
  $('*').each(function () {
    // Drop every attribute; copy the key list first because
    // removeAttr mutates this.attribs while we iterate.
    for (const attr of Object.keys(this.attribs)) {
      $(this).removeAttr(attr);
    }
  });
  return $.html();
}

/**
 * Crawl a single URL with Playwright and return the inner HTML of its
 * <body>.
 *
 * A fresh PlaywrightCrawler is created per call. The previous version
 * funneled the result through a shared module-level variable, which
 * raced under concurrent requests, and reusing one crawler instance
 * meant repeat requests for an already-crawled URL were deduplicated
 * by Crawlee and silently served stale content.
 *
 * @param {string} url - absolute URL to crawl
 * @returns {Promise<string>} the page body's inner HTML
 */
async function fetchBodyHtml(url) {
  let body = '';
  const crawler = new PlaywrightCrawler({
    async requestHandler({ page }) {
      body = await page.innerHTML('body');
    },
  });
  await crawler.run([url]);
  return body;
}

// GET /scrap?url=<target>  ->  { content: string, length: number }
app.get('/scrap', async (req, res) => {
  const url = req.query.url;

  if (!url) {
    return res.status(400).json({ error: "No URL provided" });
  }

  // Reject malformed URLs up front with a 400 instead of letting the
  // crawler fail and surface a misleading 500.
  try {
    new URL(url);
  } catch {
    return res.status(400).json({ error: 'Invalid URL' });
  }

  try {
    const content = cleanHtml(await fetchBodyHtml(url));
    res.json({
      content,
      length: content.length,
    });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

app.listen(port, () => {
  console.log(`Server running on http://localhost:${port}`);
});
|
|
3675
crawler/package-lock.json
generated
3675
crawler/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -1,18 +0,0 @@
{
    "name": "crawler",
    "version": "1.0.0",
    "description": "This template is a production ready boilerplate for developing with `PlaywrightCrawler`. Use this to bootstrap your projects using the most up-to-date code.",
    "main": "index.js",
    "scripts": {
        "test": "echo \"Error: no test specified\" && exit 1"
    },
    "keywords": [],
    "author": "",
    "license": "ISC",
    "dependencies": {
        "cheerio": "^1.0.0-rc.12",
        "crawlee": "^3.7.2",
        "express": "^4.18.2",
        "playwright": "^1.41.1"
    }
}
Loading…
Reference in New Issue
Block a user