resolve merge conflicts

This commit is contained in:
Adam Miller 2024-11-26 10:52:06 -08:00
commit f9f68caaba
10 changed files with 116 additions and 15 deletions

36
.github/workflows/publish-artifacts.yml vendored Normal file
View File

@@ -0,0 +1,36 @@
name: Publish Artifacts
on:
push:
branches:
- main
- master
pull_request:
branches:
- main
- master
jobs:
build:
name: Build distribution 📦
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
python-version: "3.8"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/

View File

@@ -10,7 +10,7 @@
Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome Brozzler is a distributed web crawler (爬虫) that uses a real browser (Chrome
or Chromium) to fetch pages and embedded URLs and to extract links. It employs or Chromium) to fetch pages and embedded URLs and to extract links. It employs
`youtube-dl <https://github.com/rg3/youtube-dl>`_ to enhance media capture `yt-dlp <https://github.com/yt-dlp/yt-dlp>`_ (formerly youtube-dl) to enhance media capture
capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to capabilities and `rethinkdb <https://github.com/rethinkdb/rethinkdb>`_ to
manage crawl state. manage crawl state.
@@ -190,7 +190,7 @@ this has not yet been extensively tested.
License License
------- -------
Copyright 2015-2018 Internet Archive Copyright 2015-2024 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); you may Licensed under the Apache License, Version 2.0 (the "License"); you may
not use this software except in compliance with the License. You may not use this software except in compliance with the License. You may

0
__init.py__ Normal file
View File

View File

@@ -47,6 +47,10 @@ class ProxyError(Exception):
pass pass
class PageConnectionError(Exception):
pass
class ReachedTimeLimit(Exception): class ReachedTimeLimit(Exception):
pass pass

View File

@@ -199,6 +199,7 @@ class Chrome:
"--disable-first-run-ui", "--disable-first-run-ui",
"--no-first-run", "--no-first-run",
"--homepage=about:blank", "--homepage=about:blank",
"--disable-features=HttpsUpgrades",
"--disable-direct-npapi-requests", "--disable-direct-npapi-requests",
"--disable-web-security", "--disable-web-security",
"--disable-notifications", "--disable-notifications",

View File

@@ -138,7 +138,14 @@ class RethinkDbFrontier:
emit=lambda acc, site, new_acc: r.branch( emit=lambda acc, site, new_acc: r.branch(
r.and_( r.and_(
r.or_( r.or_(
site["claimed"].not_(), # Avoid tight loop when unclaimed site was recently disclaimed
r.and_(
site["claimed"].not_(),
r.or_(
site.has_fields("last_disclaimed").not_(),
site["last_disclaimed"].lt(r.now().sub(20)),
),
),
site["last_claimed"].lt(r.now().sub(60 * 60)), site["last_claimed"].lt(r.now().sub(60 * 60)),
), ),
r.or_( r.or_(
@@ -218,6 +225,11 @@ class RethinkDbFrontier:
index="priority_by_site", index="priority_by_site",
) )
.order_by(index=r.desc("priority_by_site")) .order_by(index=r.desc("priority_by_site"))
.filter(
lambda page: r.or_(
page.has_fields("retry_after").not_(), r.now() > page["retry_after"]
)
)
.limit(1) .limit(1)
.update( .update(
{"claimed": True, "last_claimed_by": worker_id}, return_changes="always" {"claimed": True, "last_claimed_by": worker_id}, return_changes="always"

View File

@@ -411,6 +411,10 @@ class Page(doublethink.Document):
return hashlib.sha1(digest_this.encode("utf-8")).hexdigest() return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def populate_defaults(self): def populate_defaults(self):
if not "retry_after" in self:
self.retry_after = None
if not "failed_attempts" in self:
self.failed_attempts = 0
if not "hops_from_seed" in self: if not "hops_from_seed" in self:
self.hops_from_seed = 0 self.hops_from_seed = 0
if not "hop_path" in self: if not "hop_path" in self:

View File

@@ -22,6 +22,7 @@ import logging
import brozzler import brozzler
import brozzler.browser import brozzler.browser
from brozzler.model import VideoCaptureOptions from brozzler.model import VideoCaptureOptions
import datetime
import threading import threading
import time import time
import urllib.request import urllib.request
@@ -287,11 +288,14 @@ class BrozzlerWorker:
browser, site, page, on_screenshot, on_request browser, site, page, on_screenshot, on_request
) )
outlinks.update(browser_outlinks) outlinks.update(browser_outlinks)
status_code = browser.websock_thread.page_status
if status_code in [502, 504]:
raise brozzler.PageConnectionError()
except brozzler.PageInterstitialShown: except brozzler.PageInterstitialShown:
self.logger.info("page interstitial shown (http auth): %s", page) self.logger.info("page interstitial shown (http auth): %s", page)
if enable_youtube_dl and ydl.should_ytdlp( if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status site, page, status_code
): ):
try: try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page) ydl_outlinks = ydl.do_youtube_dl(self, site, page)
@@ -561,11 +565,25 @@ class BrozzlerWorker:
# using brozzler-worker --proxy, nothing to do but try the # using brozzler-worker --proxy, nothing to do but try the
# same proxy again next time # same proxy again next time
logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1) logging.error("proxy error (self._proxy=%r)", self._proxy, exc_info=1)
except: except (brozzler.PageConnectionError, Exception) as e:
self.logger.error( if isinstance(e, brozzler.PageConnectionError):
"unexpected exception site=%r page=%r", site, page, exc_info=True self.logger.error(
) "Page status code possibly indicates connection failure between host and warcprox: site=%r page=%r",
site,
page,
exc_info=True,
)
else:
self.logger.error(
"unexpected exception site=%r page=%r", site, page, exc_info=True
)
if page: if page:
# Calculate backoff in seconds based on number of failed attempts.
# Minimum of 60, max of 135 giving delays of 60, 90, 135, 135...
retry_delay = min(135, 60 * (1.5**page.failed_attempts))
page.retry_after = doublethink.utcnow() + datetime.timedelta(
seconds=retry_delay
)
page.failed_attempts = (page.failed_attempts or 0) + 1 page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info( self.logger.info(
@@ -575,7 +593,8 @@ class BrozzlerWorker:
page, page,
) )
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
page = None else:
page.save()
finally: finally:
if start: if start:
site.active_brozzling_time = ( site.active_brozzling_time = (

27
pyproject.toml Normal file
View File

@@ -0,0 +1,27 @@
[project]
name = "brozzler"
authors = [
{ name="Noah Levitt", email="nlevitt@archive.org" },
]
maintainers = [
{ name="Vangelis Banos", email="vangelis@archive.org" },
{ name="Adam Miller", email="adam@archive.org" },
{ name="Barbara Miller", email="barbara@archive.org" },
{ name="Alex Dempsey", email="avdempsey@archive.org" },
]
description = "Distributed web crawling with browsers"
readme = "README.rst"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
]
dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ]
[project.urls]
Homepage = "https://github.com/internetarchive/brozzler"
Issues = "https://github.com/internetarchive/brozzler/issues"
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

View File

@@ -34,7 +34,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name="brozzler", name="brozzler",
version="1.5.59b0", version="1.6.4",
description="Distributed web crawling with browsers", description="Distributed web crawling with browsers",
url="https://github.com/internetarchive/brozzler", url="https://github.com/internetarchive/brozzler",
author="Noah Levitt", author="Noah Levitt",
@@ -72,8 +72,6 @@ setuptools.setup(
"websocket-client==1.8.0", "websocket-client==1.8.0",
"pillow>=5.2.0", "pillow>=5.2.0",
"urlcanon>=0.1.dev23", "urlcanon>=0.1.dev23",
"doublethink @ git+https://github.com/internetarchive/doublethink.git@Py311",
"rethinkdb==2.4.9",
"cerberus>=1.0.1", "cerberus>=1.0.1",
"jinja2>=2.10", "jinja2>=2.10",
"cryptography>=2.3", "cryptography>=2.3",
@@ -88,6 +86,8 @@ setuptools.setup(
"pywb>=0.33.2,<2", "pywb>=0.33.2,<2",
"flask>=1.0", "flask>=1.0",
"gunicorn>=19.8.1", "gunicorn>=19.8.1",
"rethinkdb==2.4.9",
"doublethink==0.4.9",
], ],
}, },
zip_safe=False, zip_safe=False,
@@ -95,9 +95,7 @@ setuptools.setup(
"Development Status :: 5 - Production/Stable", "Development Status :: 5 - Production/Stable",
"Environment :: Console", "Environment :: Console",
"License :: OSI Approved :: Apache Software License", "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Topic :: Internet :: WWW/HTTP", "Topic :: Internet :: WWW/HTTP",
"Topic :: System :: Archiving", "Topic :: System :: Archiving",
], ],