use urlcanon library for canonicalization, surtification, scope match rules

Noah Levitt 2017-03-15 14:59:51 -07:00
parent 479f0f7e09
commit 12fb9eaa15
11 changed files with 78 additions and 232 deletions
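
In short, every call into the old surt library is replaced with its urlcanon equivalent. A rough sketch of the pattern repeated throughout the hunks below (`url` stands in for any URL string; this is illustrative, not a line from the patch):

    # before, with the surt library:
    import surt
    key = surt.surt(
            url, trailing_comma=True, host_massage=False, with_scheme=True)

    # after, with urlcanon: canonicalize, then surtify; surt() returns bytes
    import urlcanon
    key = urlcanon.semantic(url).surt().decode('ascii')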

View File

@@ -9,7 +9,7 @@ install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
 script:
-- DISPLAY=:1 py.test -v -s tests
+- DISPLAY=:1 py.test -v tests
 after_failure:
 - sudo cat /var/log/upstart/warcprox.log
 - sudo cat /var/log/upstart/brozzler-worker.log

View File

@@ -50,31 +50,18 @@ class ReachedLimit(Exception):
     def __str__(self):
         return self.__repr__()
 
-def fixup(url, hash_strip=False):
-    '''
-    Does rudimentary canonicalization, such as converting IDN to punycode.
-    '''
-    import surt
-    hurl = surt.handyurl.parse(url)
-    if hash_strip:
-        hurl.hash = None
-    # handyurl.parse() already lowercases the scheme via urlsplit
-    if hurl.host:
-        hurl.host = hurl.host.encode('idna').decode('ascii').lower()
-    return hurl.getURLString()
-
 # monkey-patch log level TRACE
 TRACE = 5
-import logging as _logging
+import logging
 def _logging_trace(msg, *args, **kwargs):
-    _logging.root.trace(msg, *args, **kwargs)
+    logging.root.trace(msg, *args, **kwargs)
 def _logger_trace(self, msg, *args, **kwargs):
     if self.isEnabledFor(TRACE):
         self._log(TRACE, msg, args, **kwargs)
-_logging.trace = _logging_trace
-_logging.Logger.trace = _logger_trace
-_logging._levelToName[TRACE] = 'TRACE'
-_logging._nameToLevel['TRACE'] = TRACE
+logging.trace = _logging_trace
+logging.Logger.trace = _logger_trace
+logging._levelToName[TRACE] = 'TRACE'
+logging._nameToLevel['TRACE'] = TRACE
 
 _behaviors = None
 def behaviors():
@@ -158,6 +145,14 @@ def jinja2_environment():
     _jinja2_env.filters['json'] = json.dumps
     return _jinja2_env
 
+import urlcanon
+
+def _remove_query(url):
+    url.question_mark = b''
+    url.query = b''
+# XXX chop off path after last slash??
+site_surt_canon = urlcanon.Canonicalizer(
+        urlcanon.semantic.steps + [_remove_query])
+
 from brozzler.site import Page, Site
 from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
@@ -166,3 +161,6 @@ from brozzler.browser import Browser, BrowserPool, BrowsingException
 from brozzler.job import new_job, new_site, Job
 from brozzler.cli import suggest_default_chrome_exe
+
+__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
+        'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
+        'new_job', 'new_site', 'Job']
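
The new site_surt_canon chains urlcanon's stock semantic steps with a custom step that blanks the query string, so a site's scope surt ignores query parameters. A minimal sketch of how it would be used (the seed URL and printed output are assumptions for illustration, not from the patch):

    import urlcanon

    def _remove_query(url):
        url.question_mark = b''
        url.query = b''

    site_surt_canon = urlcanon.Canonicalizer(
            urlcanon.semantic.steps + [_remove_query])

    seed = 'http://Example.COM/forum/?sort=new'
    print(site_surt_canon(seed).surt().decode('ascii'))
    # assumption: prints something like http://(com,example,)/forum/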

View File

@@ -29,7 +29,6 @@ from requests.structures import CaseInsensitiveDict
 import datetime
 import base64
 from brozzler.chrome import Chrome
-import surt
 import socket
 
 class BrowsingException(Exception):

View File

@@ -541,7 +541,7 @@ def brozzler_list_captures():
     Handy utility for looking up entries in the rethinkdb "captures" table by
     url or sha1.
     '''
-    import surt
+    import urlcanon
 
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
@@ -579,9 +579,7 @@
         logging.debug('querying rethinkdb: %s', reql)
         results = reql.run()
     else:
-        key = surt.surt(
-                args.url_or_sha1, trailing_comma=True, host_massage=False,
-                with_scheme=True)
+        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
         abbr_start_key = key[:150]
         if args.prefix:
             # surt is necessarily ascii and \x7f is the last ascii character
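
The three-argument surt.surt() call collapses into one chained expression; urlcanon's surt() returns bytes, hence the decode. A hedged sketch of the capture-table key it produces (the printed key is an assumption):

    import urlcanon

    key = urlcanon.semantic(
            'http://example.com/a?b=2&a=1').surt().decode('ascii')
    # assumption: something like 'http://(com,example,)/a?a=1&b=2'
    # (semantic canonicalization is expected to alpha-sort the query,
    # among other steps)
    abbr_start_key = key[:150]  # truncated index key, as in the hunk above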

View File

@@ -23,6 +23,7 @@ import time
 import datetime
 import rethinkdb as r
 import doublethink
+import urlcanon
 
 class UnexpectedDbResult(Exception):
     pass
@@ -261,18 +262,21 @@ class RethinkDbFrontier:
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         if site.remember_outlinks:
-            parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
+            decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
-            u = brozzler.site.Url(url)
-            if site.is_in_scope(u, parent_page=parent_page):
+            url_for_scoping = urlcanon.semantic(url)
+            url_for_crawling = urlcanon.whatwg(url)
+            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
-                    if not u.surt.startswith(site.scope["surt"]):
+                    if not url_for_scoping.surt().startswith(
+                            site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
                     else:
                         hops_off_surt = 0
                     new_child_page = brozzler.Page(self.rr, {
-                        'url': url, 'site_id': site.id, 'job_id': site.job_id,
+                        'url': str(url_for_crawling),
+                        'site_id': site.id, 'job_id': site.job_id,
                         'hops_from_seed': parent_page.hops_from_seed+1,
                         'via_page_id': parent_page.id,
                         'hops_off_surt': hops_off_surt})
@@ -286,17 +290,20 @@ class RethinkDbFrontier:
                     new_child_page.save()
                     counts["added"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["accepted"].append(url)
+                        decisions["accepted"].add(str(url_for_crawling))
                 else:
                     counts["blocked"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["blocked"].append(url)
+                        decisions["blocked"].add(str(url_for_crawling))
             else:
                 counts["rejected"] += 1
                 if site.remember_outlinks:
-                    parent_page.outlinks["rejected"].append(url)
+                    decisions["rejected"].add(str(url_for_crawling))
         if site.remember_outlinks:
+            parent_page.outlinks = {}
+            for k in decisions:
+                parent_page.outlinks[k] = list(decisions[k])
             parent_page.save()
 
         self.logger.info(
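
Note the two canonicalizations per outlink: urlcanon.semantic() for the scope test (lossier, match-friendly) and urlcanon.whatwg() for the URL that actually gets stored and crawled (closer to what a browser would fetch). A minimal sketch of the split (the example link and printed outputs are illustrative assumptions):

    import urlcanon

    link = 'HTTP://EXAMPLE.com/a/./b/?ref=x'
    url_for_scoping = urlcanon.semantic(link)   # for is_in_scope()
    url_for_crawling = urlcanon.whatwg(link)    # for the Page record

    print(url_for_scoping.surt())  # bytes, e.g. b'http://(com,example,)/a/b/?ref=x'
    print(str(url_for_crawling))   # e.g. 'http://example.com/a/b/?ref=x'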

View File

@@ -36,7 +36,7 @@ except ImportError as e:
     sys.exit(1)
 
 import doublethink
 import rethinkdb as r
-import surt
+import urlcanon
 import json
 import brozzler
 import argparse
@@ -116,9 +116,7 @@ class TheGoodUrlCanonicalizer(object):
     def __call__(self, url):
         try:
-            key = surt.surt(
-                    url, trailing_comma=True, host_massage=False,
-                    with_scheme=True)
+            key = urlcanon.semantic(url).surt().decode('ascii')
             # logging.debug('%s -> %s', url, key)
             return key
         except Exception as e:

View File

@@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 
-import surt
+import urlcanon
 import json
 import logging
 import brozzler
@@ -25,69 +25,10 @@ import time
 import doublethink
 import datetime
 import re
-import ipaddress
 
 _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
         tzinfo=doublethink.UTC)
 
-class Url:
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            try:
-                hurl = surt.handyurl.parse(self.url)
-                surt.GoogleURLCanonicalizer.canonicalize(hurl)
-                hurl.query = None
-                hurl.hash = None
-                # XXX chop off path after last slash??
-                self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-            except Exception as e:
-                logging.warn('problem surting %s - %s', repr(self.url), e)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        """
-        Returns true if
-        - ip_or_domain is an ip address and self.host is the same ip address
-        - ip_or_domain is a domain and self.host is the same domain
-        - ip_or_domain is a domain and self.host is a subdomain of it
-        """
-        if not self.host:
-            return False
-        if ip_or_domain == self.host:
-            return True
-        # if either ip_or_domain or self.host are ip addresses, and they're
-        # not identical (previous check), not a match
-        try:
-            ipaddress.ip_address(ip_or_domain)
-            return False
-        except:
-            pass
-        try:
-            ipaddress.ip_address(self.host)
-            return False
-        except:
-            pass
-        # if we get here, we're looking at two hostnames
-        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
-        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
-        return host_parts[-len(domain_parts):] == domain_parts
-
 class Site(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = 'sites'
@@ -105,8 +46,9 @@ class Site(doublethink.Document):
             self.last_claimed = _EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope:
-            self.scope["surt"] = Url(self.seed).surt
+        if not "surt" in self.scope and self.seed:
+            self.scope["surt"] = brozzler.site_surt_canon(
+                    self.seed).surt().decode('ascii')
 
         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
@@ -135,7 +77,7 @@
         return dt
 
     def note_seed_redirect(self, url):
-        new_scope_surt = Url(url).surt
+        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
         if not new_scope_surt.startswith(self.scope["surt"]):
             self.logger.info("changing site scope surt from {} to {}".format(
                 self.scope["surt"], new_scope_surt))
@@ -149,149 +91,50 @@
         return hdrs
 
     def is_in_scope(self, url, parent_page=None):
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
+        if not isinstance(url, urlcanon.ParsedUrl):
+            url = urlcanon.semantic(url)
+        parent_url = None
+        if parent_page:
+            parent_url = urlcanon.semantic(parent_page.url)
 
         might_accept = False
-        if not u.surt:
-            return False
-        elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
+        if not url.scheme in (b'http', b'https'):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
         elif (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
             pass
-        elif u.surt.startswith(self.scope["surt"]):
+        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
             might_accept = True
         elif parent_page and parent_page.hops_off_surt < self.scope.get(
                 "max_hops_off_surt", 0):
             might_accept = True
         elif "accepts" in self.scope:
-            for rule in self.scope["accepts"]:
-                if self._scope_rule_applies(rule, u, parent_page):
+            for accept_rule in self.scope["accepts"]:
+                rule = urlcanon.MatchRule(**accept_rule)
+                if rule.applies(url, parent_url):
                     might_accept = True
                     break
 
         if might_accept:
             if "blocks" in self.scope:
-                for rule in self.scope["blocks"]:
-                    if self._scope_rule_applies(rule, u, parent_page):
+                for block_rule in self.scope["blocks"]:
+                    rule = urlcanon.MatchRule(**block_rule)
+                    if rule.applies(url, parent_url):
                         return False
             return True
         else:
             return False
-    def _normalize_rule(self, rule):
-        """
-        Normalizes a scope rule.
-
-        A scope rule is considered deprecated if it contains a `url_match`
-        and `value`. This method converts such scope rules to the preferred
-        style and returns the new rule. If `rule` is not a deprecated-style
-        rule, returns it unchanged.
-        """
-        if "url_match" in rule and "value" in rule:
-            new_rule = dict(rule)
-            url_match = new_rule.pop("url_match")
-            if url_match == "REGEX_MATCH":
-                new_rule["regex"] = new_rule.pop("value")
-            elif url_match == "SURT_MATCH":
-                new_rule["surt"] = new_rule.pop("value")
-            elif url_match == "STRING_MATCH":
-                new_rule["substring"] = new_rule.pop("value")
-            else:
-                raise Exception("invalid scope rule")
-            return new_rule
-        else:
-            return rule
-
-    def _scope_rule_applies(self, rule, url, parent_page=None):
-        """
-        Examples of valid rules expressed as yaml.
-
-        - domain: bad.domain.com
-
-        # preferred:
-        - domain: monkey.org
-          substring: bar
-
-        # deprecated version of the same:
-        - domain: monkey.org
-          url_match: STRING_MATCH
-          value: bar
-
-        # preferred:
-        - surt: http://(com,woop,)/fuh/
-
-        # deprecated version of the same:
-        - url_match: SURT_MATCH
-          value: http://(com,woop,)/fuh/
-
-        # preferred:
-        - regex: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-
-        # deprecated version of the same:
-        - url_match: REGEX_MATCH
-          value: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-        """
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
-
-        try:
-            rewl = self._normalize_rule(rule)
-        except Exception as e:
-            self.logger.error(
-                    "problem normalizing scope rule %s - %s", rule, e)
-            return False
-
-        invalid_keys = rewl.keys() - {
-                "domain", "surt", "substring", "regex", "parent_url_regex"}
-        if invalid_keys:
-            self.logger.error(
-                    "invalid keys %s in scope rule %s", invalid_keys, rule)
-            return False
-
-        if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
-            return False
-        if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
-            return False
-        if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0:
-            return False
-        if "regex" in rewl:
-            try:
-                if not re.fullmatch(rewl["regex"], u.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rewl["regex"], e)
-                return False
-        if "parent_url_regex" in rewl:
-            if not parent_page:
-                return False
-            pu = Url(parent_page.url)
-            try:
-                if not re.fullmatch(rule["parent_url_regex"], pu.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rule["parent_url_regex"], e)
-                return False
-
-        return True
-
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "pages"
 
+    @staticmethod
+    def compute_id(site_id, url):
+        digest_this = "site_id:%s,url:%s" % (site_id, url)
+        return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+
     def populate_defaults(self):
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
@@ -306,8 +149,7 @@ class Page(doublethink.Document):
         if not "priority" in self:
             self.priority = self._calc_priority()
         if not "id" in self:
-            digest_this = "site_id:%s,url:%s" % (self.site_id, self.url)
-            self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+            self.id = self.compute_id(self.site_id, self.url)
 
     def __str__(self):
         return 'Page({"id":"%s","url":"%s",...})' % (self.id, self.url)
@@ -327,7 +169,6 @@ class Page(doublethink.Document):
         if not self.url:
             return None
         if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
+            self._canon_hurl = urlcanon.semantic(self.url)
+        return str(self._canon_hurl)
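
The hand-rolled Url class, _normalize_rule(), and _scope_rule_applies() are gone; urlcanon.MatchRule covers the same rule keys (domain, surt, substring, regex, parent_url_regex). A sketch using one of the rules from the removed docstring (the results are assumptions based on the behavior that docstring describes):

    import urlcanon

    # preferred-style rule from the old _scope_rule_applies docstring:
    #   - domain: monkey.org
    #     substring: bar
    rule = urlcanon.MatchRule(domain='monkey.org', substring='bar')

    print(rule.applies(urlcanon.semantic('http://home.monkey.org/bar/baz')))
    # assumption: True (subdomain of monkey.org, contains 'bar')
    print(rule.applies(urlcanon.semantic('http://monkey.org/')))
    # assumption: False (missing the 'bar' substring)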

View File

@@ -33,6 +33,7 @@ import collections
 import requests
 import doublethink
 import tempfile
+import urlcanon
 
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -208,7 +209,7 @@ class BrozzlerWorker:
                     "with youtube-dl json for %s", page)
             self._warcprox_write_record(
                     warcprox_address=self._proxy(site),
-                    url="youtube-dl:%s" % brozzler.fixup(page.url),
+                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                     warc_type="metadata",
                     content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                     payload=info_json.encode("utf-8"),
@@ -245,7 +246,7 @@ class BrozzlerWorker:
         def _on_screenshot(screenshot_png):
             if on_screenshot:
                 on_screenshot(screenshot_png)
-            elif self._proxy(site) and self._enable_warcprox_features(site):
+            if self._proxy(site) and self._enable_warcprox_features(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
                         "screenshot for %s", self._proxy(site), page)
@@ -253,13 +254,13 @@ class BrozzlerWorker:
                         screenshot_png)
                 self._warcprox_write_record(
                         warcprox_address=self._proxy(site),
-                        url="screenshot:%s" % brozzler.fixup(page.url, True),
+                        url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                        payload=screenshot_jpeg,
                        extra_headers=site.extra_headers())
                 self._warcprox_write_record(
                        warcprox_address=self._proxy(site),
-                        url="thumbnail:%s" % brozzler.fixup(page.url, True),
+                        url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                        warc_type="resource", content_type="image/jpeg",
                        payload=thumbnail_jpeg,
                        extra_headers=site.extra_headers())
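
With brozzler.fixup() removed, the synthesized record URIs (youtube-dl:..., screenshot:..., thumbnail:...) embed the semantic-canonical form of the page URL, which covers what fixup() did, such as punycoding IDN hosts. A tiny sketch (the example URL and the output are assumptions):

    import urlcanon

    page_url = 'http://Bücher.example/Straße'
    print('screenshot:%s' % str(urlcanon.semantic(page_url)))
    # assumption: screenshot:http://xn--bcher-kva.example/Stra%C3%9Fe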

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev202',
+        version='1.1b9.dev203',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -68,7 +68,7 @@ setuptools.setup(
             'requests',
             'websocket-client!=0.39.0',
             'pillow==3.3.0',
-            'surt>=0.3.0',
+            'urlcanon>=0.1.dev16',
             'doublethink>=0.2.0.dev71',
             'rethinkdb>=2.3,<2.4',
             'cerberus==1.0.1',
@@ -76,7 +76,12 @@ setuptools.setup(
         ],
         extras_require={
             'dashboard': ['flask>=0.11', 'gunicorn'],
-            'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
+            'easy': [
+                'warcprox>=2.1b1.dev57',
+                'pywb',
+                'flask>=0.11',
+                'gunicorn'
+            ],
         },
         zip_safe=False,
         classifiers=[

View File

@@ -75,7 +75,6 @@ blocks:
     - domain: twitter.com
       url_match: REGEX_MATCH
       value: ^.*lang=(?!en).*$
-    - bad_thing: bad rule should be ignored
     ''')
 
     site = brozzler.Site(None, {

View File

@@ -16,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v -s /brozzler/tests $@"
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@"