Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-08-10 15:30:18 -04:00)
catch exceptions parsing funky urls when scoping and extracting outlinks
This commit is contained in:
parent 2486768830
commit 32097a8f8b
2 changed files with 13 additions and 8 deletions
|
@@ -39,12 +39,15 @@ class Url:
|
|||
@property
|
||||
def surt(self):
|
||||
if not self._surt:
|
||||
hurl = surt.handyurl.parse(self.url)
|
||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||
hurl.query = None
|
||||
hurl.hash = None
|
||||
# XXX chop off path after last slash??
|
||||
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||
try:
|
||||
hurl = surt.handyurl.parse(self.url)
|
||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||
hurl.query = None
|
||||
hurl.hash = None
|
||||
# XXX chop off path after last slash??
|
||||
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||
except Exception as e:
|
||||
logging.warn('problem surting %s - %s', repr(self.url), e)
|
||||
return self._surt
|
||||
|
||||
@property
|
||||
|
@@ -149,7 +152,9 @@ class Site(brozzler.BaseDictable):
|
|||
u = url
|
||||
|
||||
might_accept = False
|
||||
if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||
if not u.surt:
|
||||
return False
|
||||
elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
|||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev160',
|
||||
version='1.1b9.dev161',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue