delete crawler in favor of jina-ai reader

This commit is contained in:
pluja 2024-04-22 12:08:54 +02:00
parent c510868e38
commit 483ba8b415
6 changed files with 0 additions and 3805 deletions

8
crawler/.gitignore vendored
View File

@ -1,8 +0,0 @@
# This file tells Git which files shouldn't be added to source control
.idea
dist
node_modules
apify_storage
crawlee_storage
storage

View File

@ -1,29 +0,0 @@
# Specify the base Docker image. You can read more about
# the available images at https://crawlee.dev/docs/guides/docker-images
# You can also use any other image from Docker Hub.
FROM apify/actor-node-playwright-chrome:18

# Copy just package.json and package-lock.json
# to speed up the build using Docker layer cache.
COPY --chown=myuser package*.json ./

# Install NPM packages, skip optional and development dependencies to
# keep the image small. Avoid logging too much and print the dependency
# tree for debugging
RUN npm --quiet set progress=false \
 && npm install --omit=dev --omit=optional \
 && echo "Installed NPM packages:" \
 && (npm list --omit=dev --all || true) \
 && echo "Node.js version:" \
 && node --version \
 && echo "NPM version:" \
 && npm --version

# Next, copy the remaining files and directories with the source code.
# Since we do this after NPM install, quick build will be really fast
# for most source file changes.
COPY --chown=myuser . ./

# Run the image. Use the exec form (JSON array) so node is PID 1 and
# receives SIGTERM/SIGINT directly; the previous shell form wrapped the
# process in /bin/sh, which does not forward signals, so the container
# could not shut down gracefully.
CMD ["node", "index.js"]

View File

@ -1,24 +0,0 @@
# Crawlee + PlaywrightCrawler + JavaScript
Crawler API for kycnot.me
## `GET /scrap`
### URL Query Parameters:
- `url`: URL to scrape
### Response:
```json
{
"content": string,
"length": number,
}
```
### Example:
```bash
curl -X GET "http://localhost:3011/scrap?url=https://localmonero.co/nojs/faq"
```

View File

@ -1,51 +0,0 @@
const express = require('express');
const { PlaywrightCrawler } = require('crawlee');
const cheerio = require('cheerio');

const app = express();
const port = 3011;

/**
 * GET /scrap — render a page in a headless browser and return its cleaned HTML.
 *
 * Query parameters:
 *   url {string} - the page to fetch (required; 400 if missing).
 *
 * Response: { content: string, length: number } on success,
 *           { error: string } with status 400/500 on failure.
 */
app.get('/scrap', async (req, res) => {
    const url = req.query.url;
    if (!url) {
        return res.status(400).json({ error: "No URL provided" });
    }
    try {
        // Capture the rendered <body> in a per-request variable. The previous
        // implementation wrote into a module-level `globalContent`, so two
        // concurrent requests could clobber each other's results.
        let pageContent = "";
        const crawler = new PlaywrightCrawler({
            async requestHandler({ page }) {
                pageContent = await page.innerHTML('body');
            }
        });
        // Give every run a unique request key: Crawlee deduplicates requests
        // by URL, so re-scraping the same URL on a long-lived crawler was
        // silently skipped and returned stale content from the first run.
        await crawler.run([{ url, uniqueKey: `${url}#${Date.now()}-${Math.random()}` }]);

        // Strip chrome, scripts, media, and all attributes so the caller gets
        // bare text-bearing markup.
        const $ = cheerio.load(pageContent);
        $('header, footer, script, style, svg, img, video').remove();
        $('*').each(function () {
            for (const attr of Object.keys(this.attribs)) {
                $(this).removeAttr(attr);
            }
        });
        const cleanedContent = $.html();
        res.json({
            content: cleanedContent,
            length: cleanedContent.length
        });
    } catch (error) {
        res.status(500).json({ error: error.message });
    }
});

app.listen(port, () => {
    console.log(`Server running on http://localhost:${port}`);
});

3675
crawler/package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,18 +0,0 @@
{
"name": "crawler",
"version": "1.0.0",
"description": "This template is a production ready boilerplate for developing with `PlaywrightCrawler`. Use this to bootstrap your projects using the most up-to-date code.",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"crawlee": "^3.7.2",
"express": "^4.18.2",
"playwright": "^1.41.1"
}
}