resolve merge conflicts

This commit is contained in:
Adam Miller 2024-11-26 10:52:06 -08:00
commit f9f68caaba
10 changed files with 116 additions and 15 deletions

36
.github/workflows/publish-artifacts.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Publish Artifacts
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
build:
name: Build distribution 📦
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/

View File

@@ -10,7 +10,7 @@
Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome
or Chromium) to fetch pages and embedded URLs and to extract links. It employs or Chromium) to fetch pages and embedded URLs and to extract links. It employs
`youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media capture `yt-dlp <https://github.com/yt-dlp/yt-dlp>`_ (formerly youtube-dl) to enhance media capture
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
manage crawl state. manage crawl state.
@@ -190,7 +190,7 @@ this has not yet been extensively tested.
License License
------- -------
Copyright 2015-2018 Internet Archive Copyright 2015-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); you may Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this software except in compliance with the License. You may not use this software except in compliance with the License. You may

0
__init.py__ Normal file
View File

View File

@@ -47,6 +47,10 @@ class ProxyError(Exception):
pass pass
class PageConnectionError(Exception):
pass
class ReachedTimeLimit(Exception): class ReachedTimeLimit(Exception):
pass pass

View File

@@ -199,6 +199,7 @@ class Chrome:
"--disable-first-run-ui", "--disable-first-run-ui",
"--no-first-run", "--no-first-run",
"--homepage=about:blank", "--homepage=about:blank",
"--disable-features=HttpsUpgrades",
"--disable-direct-npapi-requests", "--disable-direct-npapi-requests",
"--disable-web-security", "--disable-web-security",
"--disable-notifications", "--disable-notifications",

View File

@@ -138,7 +138,14 @@ class RethinkDbFrontier:
emit=lambda acc, site, new_acc: r.branch( emit=lambda acc, site, new_acc: r.branch(
r.and_( r.and_(
r.or_( r.or_(
site["claimed"].not_(), # Avoid tight loop when unclaimed site was recently disclaimed
r.and_(
site["claimed"].not_(),
r.or_(
site.has_fields("last_disclaimed").not_(),
site["last_disclaimed"].lt(r.now().sub(20)),
),
),
site["last_claimed"].lt(r.now().sub(60 * 60)), site["last_claimed"].lt(r.now().sub(60 * 60)),
), ),
r.or_( r.or_(
@@ -218,6 +225,11 @@ class RethinkDbFrontier:
index="priority_by_site", index="priority_by_site",
) )
.order_by(index=r.desc("priority_by_site")) .order_by(index=r.desc("priority_by_site"))
.filter(
lambda page: r.or_(
page.has_fields("retry_after").not_(), r.now() > page["retry_after"]
)
)
.limit(1) .limit(1)
.update( .update(
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always" {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"

View File

@@ -411,6 +411,10 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest() return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def populate_defaults(self): def populate_defaults(self):
if not "retry_after" in self:
self.retry_after = None
if not "failed_attempts" in self:
self.failed_attempts = 0
if not "hops_from_seed" in self: if not "hops_from_seed" in self:
self.hops_from_seed = 0 self.hops_from_seed = 0
if not "hop_path" in self: if not "hop_path" in self:

View File

@@ -22,6 +22,7 @@ import logging
import brozzler import brozzler
import brozzler.browser import brozzler.browser
from brozzler.model import VideoCaptureOptions from brozzler.model import VideoCaptureOptions
import datetime
import threading import threading
import time import time
import urllib.request import urllib.request
@@ -287,11 +288,14 @@ class BrozzlerWorker:
browser, site, page, on_screenshot, on_request browser, site, page, on_screenshot, on_request
) )
outlinks.update(browser_outlinks) outlinks.update(browser_outlinks)
status_code = browser.websock_thread.page_status
if status_code in [502, 504]:
raise brozzler.PageConnectionError()
except brozzler.PageInterstitialShown: except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page) self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp( if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status site, page, status_code
): ):
try: try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page) ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@@ -561,11 +565,25 @@ class BrozzlerWorker:
# using brozzler-worker --proxy, nothing to do but try the # using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time # same proxy again next time
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1) logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
except: except (brozzler.PageConnectionError, Exception) as e:
self.logger.error( if isinstance(e, brozzler.PageConnectionError):
"unexpected exception site=%r page=%r", site, page, exc_info=True self.logger.error(
) "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
site,
page,
exc_info=True,
)
else:
self.logger.error(
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
if page: if page:
# Calculate backoff in seconds based on number of failed attempts.
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
retry_delay = min(135, 60 * (1.5**page.failed_attempts))
page.retry_after = doublethink.utcnow() + datetime.timedelta(
seconds=retry_delay
)
page.failed_attempts = (page.failed_attempts or 0) + 1 page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info( self.logger.info(
@@ -575,7 +593,8 @@ class BrozzlerWorker:
page, page,
) )
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
page = None else:
page.save()
finally: finally:
if start: if start:
site.active_brozzling_time = ( site.active_brozzling_time = (

27
pyproject.toml Normal file
View File

@@ -0,0 +1,27 @@
[project]
name = "brozzler"
authors = [
{ name="Noah Levitt", email="nlevitt@archive.org" },
]
maintainers = [
{ name="Vangelis Banos", email="vangelis@archive.org" },
{ name="Adam Miller", email="adam@archive.org" },
{ name="Barbara Miller", email="barbara@archive.org" },
{ name="Alex Dempsey", email="avdempsey@archive.org" },
]
description = "Distributed web crawling with browsers"
readme = "README.rst"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
]
dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ]
[project.urls]
Homepage = "https://github.com/internetarchive/brozzler"
Issues = "https://github.com/internetarchive/brozzler/issues"
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

View File

@@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name="brozzler", name="brozzler",
version="1.5.59b0", version="1.6.4",
description="Distributed web crawling with browsers", description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler", url="https://github.com/internetarchive/brozzler",
author="Noah Levitt", author="Noah Levitt",
@@ -72,8 +72,6 @@ setuptools.setup(
"websocket-client==1.8.0", "websocket-client==1.8.0",
"pillow>=5.2.0", "pillow>=5.2.0",
"urlcanon>=0.1.dev23", "urlcanon>=0.1.dev23",
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
"rethinkdb==2.4.9",
"cerberus>=1.0.1", "cerberus>=1.0.1",
"jinja2>=2.10", "jinja2>=2.10",
"cryptography>=2.3", "cryptography>=2.3",
@@ -88,6 +86,8 @@ setuptools.setup(
"pywb>=0.33.2,<2", "pywb>=0.33.2,<2",
"flask>=1.0", "flask>=1.0",
"gunicorn>=19.8.1", "gunicorn>=19.8.1",
"rethinkdb==2.4.9",
"doublethink==0.4.9",
], ],
}, },
zip_safe=False, zip_safe=False,
@@ -95,9 +95,7 @@ setuptools.setup(
"Development Status :: 5 - Production/Stable", "Development Status :: 5 - Production/Stable",
"Environment :: Console", "Environment :: Console",
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP", "Topic :: Internet :: WWW/HTTP",
"Topic :: System :: Archiving", "Topic :: System :: Archiving",
], ],