catch exceptions parsing funky urls when scoping and extracting outlinks

This commit is contained in:
Noah Levitt 2017-01-09 15:18:19 -08:00
parent 2486768830
commit 32097a8f8b
2 changed files with 13 additions and 8 deletions

View File

@@ -39,12 +39,15 @@ class Url:
@property
def surt(self):
if not self._surt:
hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
# XXX chop off path after last slash??
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
try:
hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
# XXX chop off path after last slash??
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
except Exception as e:
logging.warn('problem surting %s - %s', repr(self.url), e)
return self._surt
@property
@@ -149,7 +152,9 @@ class Site(brozzler.BaseDictable):
u = url
might_accept = False
if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
if not u.surt:
return False
elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev160',
version='1.1b9.dev161',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',