mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Stopgap fix for a problem where an attempt to save a screenshot of a URL with a hash tag (URL fragment) containing spaces or non-ASCII characters would fail, causing the whole brozzle of the page to fail and end up in a retry loop. (Better handling of hash tags is planned, which will obviate this change.)
This commit is contained in:
parent
5c684779e5
commit
ed2d58d87d
@ -64,12 +64,14 @@ class BaseDictable:
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||||
|
|
||||||
def fixup(url):
|
def fixup(url, hash_strip=False):
|
||||||
'''
|
'''
|
||||||
Does rudimentary canonicalization, such as converting IDN to punycode.
|
Does rudimentary canonicalization, such as converting IDN to punycode.
|
||||||
'''
|
'''
|
||||||
import surt
|
import surt
|
||||||
hurl = surt.handyurl.parse(url)
|
hurl = surt.handyurl.parse(url)
|
||||||
|
if hash_strip:
|
||||||
|
hurl.hash = None
|
||||||
# handyurl.parse() already lowercases the scheme via urlsplit
|
# handyurl.parse() already lowercases the scheme via urlsplit
|
||||||
if hurl.host:
|
if hurl.host:
|
||||||
hurl.host = hurl.host.encode('idna').decode('ascii').lower()
|
hurl.host = hurl.host.encode('idna').decode('ascii').lower()
|
||||||
|
@ -237,17 +237,20 @@ class BrozzlerWorker:
|
|||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_png)
|
||||||
elif self._proxy(site) and self._enable_warcprox_features(site):
|
elif self._proxy(site) and self._enable_warcprox_features(site):
|
||||||
self.logger.info("sending WARCPROX_WRITE_RECORD request "
|
self.logger.info(
|
||||||
"to warcprox with screenshot for %s", page)
|
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||||
|
"screenshot for %s", self._proxy(site), page)
|
||||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||||
screenshot_png)
|
screenshot_png)
|
||||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
self._warcprox_write_record(
|
||||||
url="screenshot:%s" % brozzler.fixup(page.url),
|
warcprox_address=self._proxy(site),
|
||||||
|
url="screenshot:%s" % brozzler.fixup(page.url, True),
|
||||||
warc_type="resource", content_type="image/jpeg",
|
warc_type="resource", content_type="image/jpeg",
|
||||||
payload=screenshot_jpeg,
|
payload=screenshot_jpeg,
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
self._warcprox_write_record(
|
||||||
url="thumbnail:%s" % brozzler.fixup(page.url),
|
warcprox_address=self._proxy(site),
|
||||||
|
url="thumbnail:%s" % brozzler.fixup(page.url, True),
|
||||||
warc_type="resource", content_type="image/jpeg",
|
warc_type="resource", content_type="image/jpeg",
|
||||||
payload=thumbnail_jpeg,
|
payload=thumbnail_jpeg,
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev178',
|
version='1.1b9.dev179',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user