diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml new file mode 100644 index 0000000..47c96a3 --- /dev/null +++ b/.github/workflows/publish-artifacts.yml @@ -0,0 +1,36 @@ +name: Publish Artifacts + +on: + push: + branches: + - main + - master + pull_request: + branches: + - main + - master + +jobs: + build: + name: Build distribution 📦 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: "3.8" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ diff --git a/README.rst b/README.rst index b2fe7f2..13ab3d4 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome or Chromium) to fetch pages and embedded URLs and to extract links. It employs -`youtube-dl `_ to enhance media capture +`yt-dlp `_ (formerly youtube-dl) to enhance media capture capabilities and `rethinkdb `_ to manage crawl state. @@ -190,7 +190,7 @@ this has not yet been extensively tested. License ------- -Copyright 2015-2018 Internet Archive +Copyright 2015-2024 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this software except in compliance with the License. 
You may diff --git a/__init.py__ b/__init.py__ new file mode 100644 index 0000000..e69de29 diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 7dd284d..9309116 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -47,6 +47,10 @@ class ProxyError(Exception): pass +class PageConnectionError(Exception): + pass + + class ReachedTimeLimit(Exception): pass diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 927d566..d47b0a5 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -199,6 +199,7 @@ class Chrome: "--disable-first-run-ui", "--no-first-run", "--homepage=about:blank", + "--disable-features=HttpsUpgrades", "--disable-direct-npapi-requests", "--disable-web-security", "--disable-notifications", diff --git a/brozzler/frontier.py b/brozzler/frontier.py index afb2a57..c6ac971 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -138,7 +138,14 @@ class RethinkDbFrontier: emit=lambda acc, site, new_acc: r.branch( r.and_( r.or_( - site["claimed"].not_(), + # Avoid tight loop when unclaimed site was recently disclaimed + r.and_( + site["claimed"].not_(), + r.or_( + site.has_fields("last_disclaimed").not_(), + site["last_disclaimed"].lt(r.now().sub(20)), + ), + ), site["last_claimed"].lt(r.now().sub(60 * 60)), ), r.or_( @@ -218,6 +225,11 @@ class RethinkDbFrontier: index="priority_by_site", ) .order_by(index=r.desc("priority_by_site")) + .filter( + lambda page: r.or_( + page.has_fields("retry_after").not_(), r.now() > page["retry_after"] + ) + ) .limit(1) .update( {"claimed": True, "last_claimed_by": worker_id}, return_changes="always" diff --git a/brozzler/model.py b/brozzler/model.py index fbdd6c7..608847c 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -411,6 +411,10 @@ class Page(doublethink.Document): return hashlib.sha1(digest_this.encode("utf-8")).hexdigest() def populate_defaults(self): + if not "retry_after" in self: + self.retry_after = None + if not "failed_attempts" in self: + self.failed_attempts = 0 if 
not "hops_from_seed" in self: self.hops_from_seed = 0 if not "hop_path" in self: diff --git a/brozzler/worker.py b/brozzler/worker.py index 511cbd4..0c9b9bb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -22,6 +22,7 @@ import logging import brozzler import brozzler.browser from brozzler.model import VideoCaptureOptions +import datetime import threading import time import urllib.request @@ -287,11 +288,14 @@ class BrozzlerWorker: browser, site, page, on_screenshot, on_request ) outlinks.update(browser_outlinks) + status_code = browser.websock_thread.page_status + if status_code in [502, 504]: + raise brozzler.PageConnectionError() except brozzler.PageInterstitialShown: self.logger.info("page interstitial shown (http auth): %s", page) if enable_youtube_dl and ydl.should_ytdlp( site, page, browser.websock_thread.page_status ): try: ydl_outlinks = ydl.do_youtube_dl(self, site, page) @@ -561,11 +565,25 @@ class BrozzlerWorker: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1) - except: - self.logger.error( - "unexpected exception site=%r page=%r", site, page, exc_info=True - ) + except Exception as e: + if isinstance(e, brozzler.PageConnectionError): + self.logger.error( + "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r", + site, + page, + exc_info=True, + ) + else: + self.logger.error( + "unexpected exception site=%r page=%r", site, page, exc_info=True + ) if page: + # Calculate backoff in seconds based on number of failed attempts. + # Minimum of 60, max of 135 giving delays of 60, 90, 135, 135... 
+ retry_delay = min(135, 60 * (1.5 ** (page.failed_attempts or 0))) + page.retry_after = doublethink.utcnow() + datetime.timedelta( + seconds=retry_delay + ) page.failed_attempts = (page.failed_attempts or 0) + 1 if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: self.logger.info( @@ -575,7 +593,8 @@ class BrozzlerWorker: page, ) self._frontier.completed_page(site, page) - page = None + else: + page.save() finally: if start: site.active_brozzling_time = ( diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9880da2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "brozzler" +authors = [ + { name="Noah Levitt", email="nlevitt@archive.org" }, +] +maintainers = [ + { name="Vangelis Banos", email="vangelis@archive.org" }, + { name="Adam Miller", email="adam@archive.org" }, + { name="Barbara Miller", email="barbara@archive.org" }, + { name="Alex Dempsey", email="avdempsey@archive.org" }, +] +description = "Distributed web crawling with browsers" +readme = "README.rst" +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", +] +dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ] + +[project.urls] +Homepage = "https://github.com/internetarchive/brozzler" +Issues = "https://github.com/internetarchive/brozzler/issues" +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 34d036c..fc84694 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def find_package_data(package): setuptools.setup( name="brozzler", - version="1.5.59b0", + version="1.6.4", description="Distributed web crawling with browsers", url="https://github.com/internetarchive/brozzler", author="Noah Levitt", @@ -72,8 +72,6 @@ setuptools.setup( "websocket-client==1.8.0", "pillow>=5.2.0", "urlcanon>=0.1.dev23", - "doublethink @ 
git+https://github.com/internetarchive/doublethink.git@Py311", - "rethinkdb==2.4.9", "cerberus>=1.0.1", "jinja2>=2.10", "cryptography>=2.3", @@ -88,6 +86,8 @@ setuptools.setup( "pywb>=0.33.2,<2", "flask>=1.0", "gunicorn>=19.8.1", + "rethinkdb==2.4.9", + "doublethink==0.4.9", ], }, zip_safe=False, @@ -95,9 +95,7 @@ setuptools.setup( "Development Status :: 5 - Production/Stable", "Environment :: Console", "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Internet :: WWW/HTTP", "Topic :: System :: Archiving", ],