mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
Punycode the host part of the URL to avoid errors when doing WARCPROX_WRITE_RECORD
This commit is contained in:
parent
f30c143c66
commit
5bd4908e1d
3 changed files with 16 additions and 4 deletions
|
@@ -19,6 +19,7 @@ limitations under the License.
|
||||||
|
|
||||||
import json as _json
|
import json as _json
|
||||||
import logging as _logging
|
import logging as _logging
|
||||||
|
import surt as _surt
|
||||||
from pkg_resources import get_distribution as _get_distribution
|
from pkg_resources import get_distribution as _get_distribution
|
||||||
|
|
||||||
__version__ = _get_distribution('brozzler').version
|
__version__ = _get_distribution('brozzler').version
|
||||||
|
@@ -64,6 +65,16 @@ class BaseDictable:
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||||
|
|
||||||
|
def fixup(url):
|
||||||
|
'''
|
||||||
|
Does rudimentary canonicalization, such as converting IDN to punycode.
|
||||||
|
'''
|
||||||
|
hurl = _surt.handyurl.parse(url)
|
||||||
|
# handyurl.parse() already lowercases the scheme via urlsplit
|
||||||
|
if hurl.host:
|
||||||
|
hurl.host = hurl.host.encode('idna').decode('ascii').lower()
|
||||||
|
return hurl.getURLString()
|
||||||
|
|
||||||
# logging level more fine-grained than logging.DEBUG==10
|
# logging level more fine-grained than logging.DEBUG==10
|
||||||
TRACE = 5
|
TRACE = 5
|
||||||
|
|
||||||
|
|
|
@@ -197,7 +197,8 @@ class BrozzlerWorker:
|
||||||
"with youtube-dl json for %s", page)
|
"with youtube-dl json for %s", page)
|
||||||
self._warcprox_write_record(
|
self._warcprox_write_record(
|
||||||
warcprox_address=self._proxy(site),
|
warcprox_address=self._proxy(site),
|
||||||
url="youtube-dl:%s" % page.url, warc_type="metadata",
|
url="youtube-dl:%s" % brozzler.fixup(page.url),
|
||||||
|
warc_type="metadata",
|
||||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||||
payload=info_json.encode("utf-8"),
|
payload=info_json.encode("utf-8"),
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
|
@@ -237,12 +238,12 @@ class BrozzlerWorker:
|
||||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||||
screenshot_png)
|
screenshot_png)
|
||||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||||
url="screenshot:{}".format(page.url),
|
url="screenshot:%s" % brozzler.fixup(page.url),
|
||||||
warc_type="resource", content_type="image/jpeg",
|
warc_type="resource", content_type="image/jpeg",
|
||||||
payload=screenshot_jpeg,
|
payload=screenshot_jpeg,
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||||
url="thumbnail:{}".format(page.url),
|
url="thumbnail:%s" % brozzler.fixup(page.url),
|
||||||
warc_type="resource", content_type="image/jpeg",
|
warc_type="resource", content_type="image/jpeg",
|
||||||
payload=thumbnail_jpeg,
|
payload=thumbnail_jpeg,
|
||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b7.dev108',
|
version='1.1b7.dev109',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue