mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
catch exceptions parsing funky urls when scoping and extracting outlinks
This commit is contained in:
parent
2486768830
commit
32097a8f8b
@@ -39,12 +39,15 @@ class Url:
     @property
     def surt(self):
         if not self._surt:
-            hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(hurl)
-            hurl.query = None
-            hurl.hash = None
-            # XXX chop off path after last slash??
-            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+            try:
+                hurl = surt.handyurl.parse(self.url)
+                surt.GoogleURLCanonicalizer.canonicalize(hurl)
+                hurl.query = None
+                hurl.hash = None
+                # XXX chop off path after last slash??
+                self._surt = hurl.getURLString(surt=True, trailing_comma=True)
+            except Exception as e:
+                logging.warn('problem surting %s - %s', repr(self.url), e)
         return self._surt
 
     @property
@@ -149,7 +152,9 @@ class Site(brozzler.BaseDictable):
             u = url
 
         might_accept = False
-        if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
+        if not u.surt:
+            return False
+        elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
2
setup.py
2
setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev160',
+        version='1.1b9.dev161',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
Loading…
x
Reference in New Issue
Block a user