Stopgap fix for a problem where an attempt to save a screenshot of a URL whose hash fragment (the part after "#") contains spaces or non-ASCII characters would fail, causing the whole brozzle of the page to fail and the page to end up in a retry loop. (Better handling of hash fragments is planned, which will obviate this change.)

This commit is contained in:
Noah Levitt 2017-02-01 22:39:12 +00:00
parent 5c684779e5
commit ed2d58d87d
3 changed files with 13 additions and 8 deletions

View File

@ -64,12 +64,14 @@ class BaseDictable:
def __repr__(self): def __repr__(self):
return "{}(**{})".format(self.__class__.__name__, self.to_dict()) return "{}(**{})".format(self.__class__.__name__, self.to_dict())
def fixup(url): def fixup(url, hash_strip=False):
''' '''
Does rudimentary canonicalization, such as converting IDN to punycode. Does rudimentary canonicalization, such as converting IDN to punycode.
''' '''
import surt import surt
hurl = surt.handyurl.parse(url) hurl = surt.handyurl.parse(url)
if hash_strip:
hurl.hash = None
# handyurl.parse() already lowercases the scheme via urlsplit # handyurl.parse() already lowercases the scheme via urlsplit
if hurl.host: if hurl.host:
hurl.host = hurl.host.encode('idna').decode('ascii').lower() hurl.host = hurl.host.encode('idna').decode('ascii').lower()

View File

@ -237,17 +237,20 @@ class BrozzlerWorker:
if on_screenshot: if on_screenshot:
on_screenshot(screenshot_png) on_screenshot(screenshot_png)
elif self._proxy(site) and self._enable_warcprox_features(site): elif self._proxy(site) and self._enable_warcprox_features(site):
self.logger.info("sending WARCPROX_WRITE_RECORD request " self.logger.info(
"to warcprox with screenshot for %s", page) "sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy(site), page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png) screenshot_png)
self._warcprox_write_record(warcprox_address=self._proxy(site), self._warcprox_write_record(
url="screenshot:%s" % brozzler.fixup(page.url), warcprox_address=self._proxy(site),
url="screenshot:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg", warc_type="resource", content_type="image/jpeg",
payload=screenshot_jpeg, payload=screenshot_jpeg,
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())
self._warcprox_write_record(warcprox_address=self._proxy(site), self._warcprox_write_record(
url="thumbnail:%s" % brozzler.fixup(page.url), warcprox_address=self._proxy(site),
url="thumbnail:%s" % brozzler.fixup(page.url, True),
warc_type="resource", content_type="image/jpeg", warc_type="resource", content_type="image/jpeg",
payload=thumbnail_jpeg, payload=thumbnail_jpeg,
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev178', version='1.1b9.dev179',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',