Stopgap fix for a problem where an attempt to save a screenshot of a URL whose hash tag (fragment) contains spaces or non-ASCII characters would fail, causing the whole brozzle of the page to fail and end up in a retry loop. Better handling of hash tags is planned, which will obviate this change.

This commit is contained in:
Noah Levitt 2017-02-01 22:39:12 +00:00
parent 5c684779e5
commit ed2d58d87d
3 changed files with 13 additions and 8 deletions

View File

@@ -64,12 +64,14 @@ class BaseDictable:
def __repr__(self):
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
def fixup(url):
def fixup(url, hash_strip=False):
'''
Does rudimentary canonicalization, such as converting IDN to punycode.
'''
import surt
hurl = surt.handyurl.parse(url)
if hash_strip:
hurl.hash = None
# handyurl.parse() already lowercases the scheme via urlsplit
if hurl.host:
hurl.host = hurl.host.encode('idna').decode('ascii').lower()

View File

@@ -237,17 +237,20 @@ class BrozzlerWorker:
if on_screenshot:
on_screenshot(screenshot_png)
elif self._proxy(site) and self._enable_warcprox_features(site):
self.logger.info("sending WARCPROX_WRITE_RECORD request "
"to warcprox with screenshot for %s", page)
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy(site), page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png)
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="screenshot:%s" % brozzler.fixup(page.url),
self._warcprox_write_record(
warcprox_address=self._proxy(site),
url="screenshot:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg",
payload=screenshot_jpeg,
extra_headers=site.extra_headers())
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="thumbnail:%s" % brozzler.fixup(page.url),
self._warcprox_write_record(
warcprox_address=self._proxy(site),
url="thumbnail:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg",
payload=thumbnail_jpeg,
extra_headers=site.extra_headers())

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev178',
version='1.1b9.dev179',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',