mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
resolve merge conflicts
This commit is contained in:
commit
f9f68caaba
36
.github/workflows/publish-artifacts.yml
vendored
Normal file
36
.github/workflows/publish-artifacts.yml
vendored
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
name: Publish Artifacts
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
- master
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
- master
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
name: Build distribution 📦
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Set up Python 3.8
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.8"
|
||||||
|
- name: Install pypa/build
|
||||||
|
run: >-
|
||||||
|
python3 -m
|
||||||
|
pip install
|
||||||
|
build
|
||||||
|
--user
|
||||||
|
- name: Build a binary wheel and a source tarball
|
||||||
|
run: python3 -m build
|
||||||
|
- name: Store the distribution packages
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: python-package-distributions
|
||||||
|
path: dist/
|
@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome
|
Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome
|
||||||
or Chromium) to fetch pages and embedded URLs and to extract links. It employs
|
or Chromium) to fetch pages and embedded URLs and to extract links. It employs
|
||||||
`youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media capture
|
`yt-dlp <https://github.com/yt-dlp/yt-dlp>`_ (formerly youtube-dl) to enhance media capture
|
||||||
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
|
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
|
||||||
manage crawl state.
|
manage crawl state.
|
||||||
|
|
||||||
@ -190,7 +190,7 @@ this has not yet been extensively tested.
|
|||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
Copyright 2015-2018 Internet Archive
|
Copyright 2015-2024 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
not use this software except in compliance with the License. You may
|
not use this software except in compliance with the License. You may
|
||||||
|
0
__init.py__
Normal file
0
__init.py__
Normal file
@ -47,6 +47,10 @@ class ProxyError(Exception):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class PageConnectionError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ReachedTimeLimit(Exception):
|
class ReachedTimeLimit(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -199,6 +199,7 @@ class Chrome:
|
|||||||
"--disable-first-run-ui",
|
"--disable-first-run-ui",
|
||||||
"--no-first-run",
|
"--no-first-run",
|
||||||
"--homepage=about:blank",
|
"--homepage=about:blank",
|
||||||
|
"--disable-features=HttpsUpgrades",
|
||||||
"--disable-direct-npapi-requests",
|
"--disable-direct-npapi-requests",
|
||||||
"--disable-web-security",
|
"--disable-web-security",
|
||||||
"--disable-notifications",
|
"--disable-notifications",
|
||||||
|
@ -138,7 +138,14 @@ class RethinkDbFrontier:
|
|||||||
emit=lambda acc, site, new_acc: r.branch(
|
emit=lambda acc, site, new_acc: r.branch(
|
||||||
r.and_(
|
r.and_(
|
||||||
r.or_(
|
r.or_(
|
||||||
site["claimed"].not_(),
|
# Avoid tight loop when unclaimed site was recently disclaimed
|
||||||
|
r.and_(
|
||||||
|
site["claimed"].not_(),
|
||||||
|
r.or_(
|
||||||
|
site.has_fields("last_disclaimed").not_(),
|
||||||
|
site["last_disclaimed"].lt(r.now().sub(20)),
|
||||||
|
),
|
||||||
|
),
|
||||||
site["last_claimed"].lt(r.now().sub(60 * 60)),
|
site["last_claimed"].lt(r.now().sub(60 * 60)),
|
||||||
),
|
),
|
||||||
r.or_(
|
r.or_(
|
||||||
@ -218,6 +225,11 @@ class RethinkDbFrontier:
|
|||||||
index="priority_by_site",
|
index="priority_by_site",
|
||||||
)
|
)
|
||||||
.order_by(index=r.desc("priority_by_site"))
|
.order_by(index=r.desc("priority_by_site"))
|
||||||
|
.filter(
|
||||||
|
lambda page: r.or_(
|
||||||
|
page.has_fields("retry_after").not_(), r.now() > page["retry_after"]
|
||||||
|
)
|
||||||
|
)
|
||||||
.limit(1)
|
.limit(1)
|
||||||
.update(
|
.update(
|
||||||
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
|
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always"
|
||||||
|
@ -411,6 +411,10 @@ class Page(doublethink.Document):
|
|||||||
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def populate_defaults(self):
|
def populate_defaults(self):
|
||||||
|
if not "retry_after" in self:
|
||||||
|
self.retry_after = None
|
||||||
|
if not "failed_attempts" in self:
|
||||||
|
self.failed_attempts = 0
|
||||||
if not "hops_from_seed" in self:
|
if not "hops_from_seed" in self:
|
||||||
self.hops_from_seed = 0
|
self.hops_from_seed = 0
|
||||||
if not "hop_path" in self:
|
if not "hop_path" in self:
|
||||||
|
@ -22,6 +22,7 @@ import logging
|
|||||||
import brozzler
|
import brozzler
|
||||||
import brozzler.browser
|
import brozzler.browser
|
||||||
from brozzler.model import VideoCaptureOptions
|
from brozzler.model import VideoCaptureOptions
|
||||||
|
import datetime
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import urllib.request
|
import urllib.request
|
||||||
@ -287,11 +288,14 @@ class BrozzlerWorker:
|
|||||||
browser, site, page, on_screenshot, on_request
|
browser, site, page, on_screenshot, on_request
|
||||||
)
|
)
|
||||||
outlinks.update(browser_outlinks)
|
outlinks.update(browser_outlinks)
|
||||||
|
status_code = browser.websock_thread.page_status
|
||||||
|
if status_code in [502, 504]:
|
||||||
|
raise brozzler.PageConnectionError()
|
||||||
except brozzler.PageInterstitialShown:
|
except brozzler.PageInterstitialShown:
|
||||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||||
|
|
||||||
if enable_youtube_dl and ydl.should_ytdlp(
|
if enable_youtube_dl and ydl.should_ytdlp(
|
||||||
site, page, browser.websock_thread.page_status
|
site, page, status_code
|
||||||
):
|
):
|
||||||
try:
|
try:
|
||||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
@ -561,11 +565,25 @@ class BrozzlerWorker:
|
|||||||
# using brozzler-worker --proxy, nothing to do but try the
|
# using brozzler-worker --proxy, nothing to do but try the
|
||||||
# same proxy again next time
|
# same proxy again next time
|
||||||
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
|
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
|
||||||
except:
|
except (brozzler.PageConnectionError, Exception) as e:
|
||||||
self.logger.error(
|
if isinstance(e, brozzler.PageConnectionError):
|
||||||
"unexpected exception site=%r page=%r", site, page, exc_info=True
|
self.logger.error(
|
||||||
)
|
"Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
|
||||||
|
site,
|
||||||
|
page,
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.logger.error(
|
||||||
|
"unexpected exception site=%r page=%r", site, page, exc_info=True
|
||||||
|
)
|
||||||
if page:
|
if page:
|
||||||
|
# Calculate backoff in seconds based on number of failed attempts.
|
||||||
|
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
|
||||||
|
retry_delay = min(135, 60 * (1.5**page.failed_attempts))
|
||||||
|
page.retry_after = doublethink.utcnow() + datetime.timedelta(
|
||||||
|
seconds=retry_delay
|
||||||
|
)
|
||||||
page.failed_attempts = (page.failed_attempts or 0) + 1
|
page.failed_attempts = (page.failed_attempts or 0) + 1
|
||||||
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
@ -575,7 +593,8 @@ class BrozzlerWorker:
|
|||||||
page,
|
page,
|
||||||
)
|
)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
page = None
|
else:
|
||||||
|
page.save()
|
||||||
finally:
|
finally:
|
||||||
if start:
|
if start:
|
||||||
site.active_brozzling_time = (
|
site.active_brozzling_time = (
|
||||||
|
27
pyproject.toml
Normal file
27
pyproject.toml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
[project]
|
||||||
|
name = "brozzler"
|
||||||
|
authors = [
|
||||||
|
{ name="Noah Levitt", email="nlevitt@archive.org" },
|
||||||
|
]
|
||||||
|
maintainers = [
|
||||||
|
{ name="Vangelis Banos", email="vangelis@archive.org" },
|
||||||
|
{ name="Adam Miller", email="adam@archive.org" },
|
||||||
|
{ name="Barbara Miller", email="barbara@archive.org" },
|
||||||
|
{ name="Alex Dempsey", email="avdempsey@archive.org" },
|
||||||
|
]
|
||||||
|
description = "Distributed web crawling with browsers"
|
||||||
|
readme = "README.rst"
|
||||||
|
requires-python = ">=3.8"
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
]
|
||||||
|
dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/internetarchive/brozzler"
|
||||||
|
Issues = "https://github.com/internetarchive/brozzler/issues"
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=61.0"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
10
setup.py
10
setup.py
@ -34,7 +34,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="brozzler",
|
name="brozzler",
|
||||||
version="1.5.59b0",
|
version="1.6.4",
|
||||||
description="Distributed web crawling with browsers",
|
description="Distributed web crawling with browsers",
|
||||||
url="https://github.com/internetarchive/brozzler",
|
url="https://github.com/internetarchive/brozzler",
|
||||||
author="Noah Levitt",
|
author="Noah Levitt",
|
||||||
@ -72,8 +72,6 @@ setuptools.setup(
|
|||||||
"websocket-client==1.8.0",
|
"websocket-client==1.8.0",
|
||||||
"pillow>=5.2.0",
|
"pillow>=5.2.0",
|
||||||
"urlcanon>=0.1.dev23",
|
"urlcanon>=0.1.dev23",
|
||||||
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
|
|
||||||
"rethinkdb==2.4.9",
|
|
||||||
"cerberus>=1.0.1",
|
"cerberus>=1.0.1",
|
||||||
"jinja2>=2.10",
|
"jinja2>=2.10",
|
||||||
"cryptography>=2.3",
|
"cryptography>=2.3",
|
||||||
@ -88,6 +86,8 @@ setuptools.setup(
|
|||||||
"pywb>=0.33.2,<2",
|
"pywb>=0.33.2,<2",
|
||||||
"flask>=1.0",
|
"flask>=1.0",
|
||||||
"gunicorn>=19.8.1",
|
"gunicorn>=19.8.1",
|
||||||
|
"rethinkdb==2.4.9",
|
||||||
|
"doublethink==0.4.9",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
@ -95,9 +95,7 @@ setuptools.setup(
|
|||||||
"Development Status :: 5 - Production/Stable",
|
"Development Status :: 5 - Production/Stable",
|
||||||
"Environment :: Console",
|
"Environment :: Console",
|
||||||
"License :: OSI Approved :: Apache Software License",
|
"License :: OSI Approved :: Apache Software License",
|
||||||
"Programming Language :: Python :: 3.5",
|
"Programming Language :: Python :: 3.8",
|
||||||
"Programming Language :: Python :: 3.6",
|
|
||||||
"Programming Language :: Python :: 3.7",
|
|
||||||
"Topic :: Internet :: WWW/HTTP",
|
"Topic :: Internet :: WWW/HTTP",
|
||||||
"Topic :: System :: Archiving",
|
"Topic :: System :: Archiving",
|
||||||
],
|
],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user