mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
catch exceptions parsing funky urls when scoping and extracting outlinks
This commit is contained in:
parent
2486768830
commit
32097a8f8b
@@ -39,12 +39,15 @@ class Url:
|
||||
@property
|
||||
def surt(self):
|
||||
if not self._surt:
|
||||
hurl = surt.handyurl.parse(self.url)
|
||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||
hurl.query = None
|
||||
hurl.hash = None
|
||||
# XXX chop off path after last slash??
|
||||
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||
try:
|
||||
hurl = surt.handyurl.parse(self.url)
|
||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||
hurl.query = None
|
||||
hurl.hash = None
|
||||
# XXX chop off path after last slash??
|
||||
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||
except Exception as e:
|
||||
logging.warn('problem surting %s - %s', repr(self.url), e)
|
||||
return self._surt
|
||||
|
||||
@property
|
||||
@@ -149,7 +152,9 @@ class Site(brozzler.BaseDictable):
|
||||
u = url
|
||||
|
||||
might_accept = False
|
||||
if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||
if not u.surt:
|
||||
return False
|
||||
elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
|
Loading…
x
Reference in New Issue
Block a user