diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 9e5d5fc..ea15dfd 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -64,12 +64,14 @@ class BaseDictable: def __repr__(self): return "{}(**{})".format(self.__class__.__name__, self.to_dict()) -def fixup(url): +def fixup(url, hash_strip=False): ''' Does rudimentary canonicalization, such as converting IDN to punycode. ''' import surt hurl = surt.handyurl.parse(url) + if hash_strip: + hurl.hash = None # handyurl.parse() already lowercases the scheme via urlsplit if hurl.host: hurl.host = hurl.host.encode('idna').decode('ascii').lower() diff --git a/brozzler/worker.py b/brozzler/worker.py index 58a9079..f20ed86 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -237,17 +237,20 @@ class BrozzlerWorker: if on_screenshot: on_screenshot(screenshot_png) elif self._proxy(site) and self._enable_warcprox_features(site): - self.logger.info("sending WARCPROX_WRITE_RECORD request " - "to warcprox with screenshot for %s", page) + self.logger.info( + "sending WARCPROX_WRITE_RECORD request to %s with " + "screenshot for %s", self._proxy(site), page) screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_png) - self._warcprox_write_record(warcprox_address=self._proxy(site), - url="screenshot:%s" % brozzler.fixup(page.url), + self._warcprox_write_record( + warcprox_address=self._proxy(site), + url="screenshot:%s" % brozzler.fixup(page.url, True), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) - self._warcprox_write_record(warcprox_address=self._proxy(site), - url="thumbnail:%s" % brozzler.fixup(page.url), + self._warcprox_write_record( + warcprox_address=self._proxy(site), + url="thumbnail:%s" % brozzler.fixup(page.url, True), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, extra_headers=site.extra_headers()) diff --git a/setup.py b/setup.py index 00b4713..249d026 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev178', + version='1.1b9.dev179', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',