From 32097a8f8bdc7eadfbeb2ff460121a6162940665 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 9 Jan 2017 15:18:19 -0800
Subject: [PATCH] catch exceptions parsing funky urls when scoping and extracting outlinks

---
 brozzler/site.py | 19 ++++++++++++-------
 setup.py         |  2 +-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/brozzler/site.py b/brozzler/site.py
index d0c0f48..fe86f0b 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -39,12 +39,15 @@ class Url:
     @property
     def surt(self):
         if not self._surt:
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            # XXX chop off path after last slash??
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+            try:
+                hurl = surt.handyurl.parse(self.url)
+                surt.GoogleURLCanonicalizer.canonicalize(hurl)
+                hurl.query = None
+                hurl.hash = None
+                # XXX chop off path after last slash??
+                self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+            except Exception as e:
+                logging.warn('problem surting %s - %s', repr(self.url), e)
         return self._surt
 
     @property
@@ -149,7 +152,9 @@ class Site(brozzler.BaseDictable):
             u = url
 
         might_accept = False
-        if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
+        if not u.surt:
+            return False
+        elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
diff --git a/setup.py b/setup.py
index cebd8e6..308d90b 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev160',
+        version='1.1b9.dev161',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',