use urlcanon library for canonicalization, surtification, scope match rules

Noah Levitt 2017-03-15 14:59:51 -07:00
parent 479f0f7e09
commit 12fb9eaa15
11 changed files with 78 additions and 232 deletions
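The gist of the change: everywhere brozzler previously called the surt library, it now calls urlcanon. A minimal sketch of the calls the diff below standardizes on (hypothetical URL; exact canonical output depends on urlcanon's "semantic" ruleset):

    import urlcanon

    url = 'http://www.Example.COM/foo'        # hypothetical URL
    parsed = urlcanon.semantic(url)           # parse + canonicalize, returns urlcanon.ParsedUrl
    print(str(parsed))                        # canonicalized URL string
    print(parsed.surt().decode('ascii'))      # SURT key (bytes), something like
                                              # 'http://(com,example,)/foo'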

View File

@@ -9,7 +9,7 @@ install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
 script:
-- DISPLAY=:1 py.test -v -s tests
+- DISPLAY=:1 py.test -v tests
 after_failure:
 - sudo cat /var/log/upstart/warcprox.log
 - sudo cat /var/log/upstart/brozzler-worker.log

View File

@@ -50,31 +50,18 @@ class ReachedLimit(Exception):
     def __str__(self):
         return self.__repr__()

-def fixup(url, hash_strip=False):
-    '''
-    Does rudimentary canonicalization, such as converting IDN to punycode.
-    '''
-    import surt
-    hurl = surt.handyurl.parse(url)
-    if hash_strip:
-        hurl.hash = None
-    # handyurl.parse() already lowercases the scheme via urlsplit
-    if hurl.host:
-        hurl.host = hurl.host.encode('idna').decode('ascii').lower()
-    return hurl.getURLString()
-
 # monkey-patch log level TRACE
 TRACE = 5
-import logging as _logging
+import logging
 def _logging_trace(msg, *args, **kwargs):
-    _logging.root.trace(msg, *args, **kwargs)
+    logging.root.trace(msg, *args, **kwargs)
 def _logger_trace(self, msg, *args, **kwargs):
     if self.isEnabledFor(TRACE):
         self._log(TRACE, msg, args, **kwargs)
-_logging.trace = _logging_trace
-_logging.Logger.trace = _logger_trace
-_logging._levelToName[TRACE] = 'TRACE'
-_logging._nameToLevel['TRACE'] = TRACE
+logging.trace = _logging_trace
+logging.Logger.trace = _logger_trace
+logging._levelToName[TRACE] = 'TRACE'
+logging._nameToLevel['TRACE'] = TRACE
_behaviors = None
def behaviors():
@@ -158,6 +145,14 @@ def jinja2_environment():
         _jinja2_env.filters['json'] = json.dumps
     return _jinja2_env

+import urlcanon
+def _remove_query(url):
+    url.question_mark = b''
+    url.query = b''
+# XXX chop off path after last slash??
+site_surt_canon = urlcanon.Canonicalizer(
+        urlcanon.semantic.steps + [_remove_query])
+
 from brozzler.site import Page, Site
 from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
@@ -166,3 +161,6 @@ from brozzler.browser import Browser, BrowserPool, BrowsingException
 from brozzler.job import new_job, new_site, Job
 from brozzler.cli import suggest_default_chrome_exe
+
+__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
+        'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
+        'new_job', 'new_site', 'Job']
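The new site_surt_canon runs urlcanon's semantic steps plus _remove_query, so a seed's query string never ends up in the site's scope surt. A rough illustration (hypothetical seed; output shape not guaranteed verbatim):

    import brozzler

    seed = 'http://example.com/path/page?session=abc'   # hypothetical
    parsed = brozzler.site_surt_canon(seed)   # semantic steps + _remove_query
    parsed.surt()                             # query dropped, so something like
                                              # b'http://(com,example,)/path/page'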

View File

@@ -29,7 +29,6 @@ from requests.structures import CaseInsensitiveDict
 import datetime
 import base64
 from brozzler.chrome import Chrome
-import surt
 import socket

 class BrowsingException(Exception):

View File

@@ -541,7 +541,7 @@ def brozzler_list_captures():
     Handy utility for looking up entries in the rethinkdb "captures" table by
     url or sha1.
     '''
-    import surt
+    import urlcanon
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
@@ -579,9 +579,7 @@ def brozzler_list_captures():
         logging.debug('querying rethinkdb: %s', reql)
         results = reql.run()
     else:
-        key = surt.surt(
-                args.url_or_sha1, trailing_comma=True, host_massage=False,
-                with_scheme=True)
+        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
         abbr_start_key = key[:150]
         if args.prefix:
             # surt is necessarily ascii and \x7f is the last ascii character
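Both key computations target the same SURT format; the urlcanon version just routes through the semantic canonicalizer first. A sketch of the intended equivalence (hypothetical URL; outputs expected, not verified):

    import surt, urlcanon

    u = 'http://example.com/img/logo.png'   # hypothetical
    old_key = surt.surt(
            u, trailing_comma=True, host_massage=False, with_scheme=True)
    new_key = urlcanon.semantic(u).surt().decode('ascii')
    # both should look like 'http://(com,example,)/img/logo.png'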

View File

@@ -23,6 +23,7 @@ import time
 import datetime
 import rethinkdb as r
 import doublethink
+import urlcanon

 class UnexpectedDbResult(Exception):
     pass
@@ -261,18 +262,21 @@ class RethinkDbFrontier:
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         if site.remember_outlinks:
-            parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
+            decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
-            u = brozzler.site.Url(url)
-            if site.is_in_scope(u, parent_page=parent_page):
+            url_for_scoping = urlcanon.semantic(url)
+            url_for_crawling = urlcanon.whatwg(url)
+            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
-                    if not u.surt.startswith(site.scope["surt"]):
+                    if not url_for_scoping.surt().startswith(
+                            site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
                     else:
                         hops_off_surt = 0
                     new_child_page = brozzler.Page(self.rr, {
-                        'url': url, 'site_id': site.id, 'job_id': site.job_id,
+                        'url': str(url_for_crawling),
+                        'site_id': site.id, 'job_id': site.job_id,
                         'hops_from_seed': parent_page.hops_from_seed+1,
                         'via_page_id': parent_page.id,
                         'hops_off_surt': hops_off_surt})
@@ -286,17 +290,20 @@
                         new_child_page.save()
                         counts["added"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["accepted"].append(url)
+                        decisions["accepted"].add(str(url_for_crawling))
                 else:
                     counts["blocked"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["blocked"].append(url)
+                        decisions["blocked"].add(str(url_for_crawling))
             else:
                 counts["rejected"] += 1
                 if site.remember_outlinks:
-                    parent_page.outlinks["rejected"].append(url)
+                    decisions["rejected"].add(str(url_for_crawling))

         if site.remember_outlinks:
+            parent_page.outlinks = {}
+            for k in decisions:
+                parent_page.outlinks[k] = list(decisions[k])
             parent_page.save()

         self.logger.info(
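Each outlink is now canonicalized twice: the aggressive "semantic" form is used for scope matching, while the browser-faithful "whatwg" form is what actually gets queued. A sketch of the split (hypothetical outlink and scope surt):

    import urlcanon

    outlink = 'HTTP://example.com/a/../b'             # hypothetical
    url_for_scoping = urlcanon.semantic(outlink)      # for surt prefix tests
    url_for_crawling = urlcanon.whatwg(outlink)       # what a browser would fetch
    in_scope = url_for_scoping.surt().startswith(b'http://(com,example,)/')
    queued_url = str(url_for_crawling)                # stored on the new Page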

View File

@@ -36,7 +36,7 @@ except ImportError as e:
     sys.exit(1)
 import doublethink
 import rethinkdb as r
-import surt
+import urlcanon
 import json
 import brozzler
 import argparse
@@ -116,9 +116,7 @@ class TheGoodUrlCanonicalizer(object):
     def __call__(self, url):
         try:
-            key = surt.surt(
-                    url, trailing_comma=True, host_massage=False,
-                    with_scheme=True)
+            key = urlcanon.semantic(url).surt().decode('ascii')
             # logging.debug('%s -> %s', url, key)
             return key
         except Exception as e:

View File

@@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''

-import surt
+import urlcanon
 import json
 import logging
 import brozzler
@@ -25,69 +25,10 @@ import time
 import doublethink
 import datetime
 import re
-import ipaddress

 _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
         tzinfo=doublethink.UTC)

-class Url:
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            try:
-                hurl = surt.handyurl.parse(self.url)
-                surt.GoogleURLCanonicalizer.canonicalize(hurl)
-                hurl.query = None
-                hurl.hash = None
-                # XXX chop off path after last slash??
-                self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-            except Exception as e:
-                logging.warn('problem surting %s - %s', repr(self.url), e)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        """
-        Returns true if
-        - ip_or_domain is an ip address and self.host is the same ip address
-        - ip_or_domain is a domain and self.host is the same domain
-        - ip_or_domain is a domain and self.host is a subdomain of it
-        """
-        if not self.host:
-            return False
-
-        if ip_or_domain == self.host:
-            return True
-
-        # if either ip_or_domain or self.host are ip addresses, and they're not
-        # identical (previous check), not a match
-        try:
-            ipaddress.ip_address(ip_or_domain)
-            return False
-        except:
-            pass
-        try:
-            ipaddress.ip_address(self.host)
-            return False
-        except:
-            pass
-
-        # if we get here, we're looking at two hostnames
-        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
-        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
-        return host_parts[-len(domain_parts):] == domain_parts
-
 class Site(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = 'sites'
@@ -105,8 +46,9 @@ class Site(doublethink.Document):
         self.last_claimed = _EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope:
-            self.scope["surt"] = Url(self.seed).surt
+        if not "surt" in self.scope and self.seed:
+            self.scope["surt"] = brozzler.site_surt_canon(
+                    self.seed).surt().decode('ascii')

         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
@@ -135,7 +77,7 @@ class Site(doublethink.Document):
         return dt

     def note_seed_redirect(self, url):
-        new_scope_surt = Url(url).surt
+        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
         if not new_scope_surt.startswith(self.scope["surt"]):
             self.logger.info("changing site scope surt from {} to {}".format(
                 self.scope["surt"], new_scope_surt))
@@ -149,149 +91,50 @@ class Site(doublethink.Document):
         return hdrs

     def is_in_scope(self, url, parent_page=None):
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
+        if not isinstance(url, urlcanon.ParsedUrl):
+            url = urlcanon.semantic(url)
+        if parent_page:
+            parent_url = urlcanon.semantic(parent_page.url)

         might_accept = False
-        if not u.surt:
-            return False
-        elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
+        if not url.scheme in (b'http', b'https'):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
         elif (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
             pass
-        elif u.surt.startswith(self.scope["surt"]):
+        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
             might_accept = True
         elif parent_page and parent_page.hops_off_surt < self.scope.get(
                 "max_hops_off_surt", 0):
             might_accept = True
         elif "accepts" in self.scope:
-            for rule in self.scope["accepts"]:
-                if self._scope_rule_applies(rule, u, parent_page):
-                    might_accept = True
-                    break
+            for accept_rule in self.scope["accepts"]:
+                rule = urlcanon.MatchRule(**accept_rule)
+                if rule.applies(url, parent_url):
+                    might_accept = True
+                    break

         if might_accept:
             if "blocks" in self.scope:
-                for rule in self.scope["blocks"]:
-                    if self._scope_rule_applies(rule, u, parent_page):
-                        return False
+                for block_rule in self.scope["blocks"]:
+                    rule = urlcanon.MatchRule(**block_rule)
+                    if rule.applies(url, parent_url):
+                        return False
             return True
         else:
             return False

-    def _normalize_rule(self, rule):
-        """
-        Normalizes a scope rule.
-
-        A scope rule is considered deprecated if it contains a `url_match` and
-        `value`. This method converts such scope rules to the preferred style
-        and returns the new rule. If `rule` is not a deprecated-style rule,
-        returns it unchanged.
-        """
-        if "url_match" in rule and "value" in rule:
-            new_rule = dict(rule)
-            url_match = new_rule.pop("url_match")
-            if url_match == "REGEX_MATCH":
-                new_rule["regex"] = new_rule.pop("value")
-            elif url_match == "SURT_MATCH":
-                new_rule["surt"] = new_rule.pop("value")
-            elif url_match == "STRING_MATCH":
-                new_rule["substring"] = new_rule.pop("value")
-            else:
-                raise Exception("invalid scope rule")
-            return new_rule
-        else:
-            return rule
-
-    def _scope_rule_applies(self, rule, url, parent_page=None):
-        """
-        Examples of valid rules expressed as yaml.
-
-        - domain: bad.domain.com
-
-        # preferred:
-        - domain: monkey.org
-          substring: bar
-
-        # deprecated version of the same:
-        - domain: monkey.org
-          url_match: STRING_MATCH
-          value: bar
-
-        # preferred:
-        - surt: http://(com,woop,)/fuh/
-
-        # deprecated version of the same:
-        - url_match: SURT_MATCH
-          value: http://(com,woop,)/fuh/
-
-        # preferred:
-        - regex: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-
-        # deprecated version of the same:
-        - url_match: REGEX_MATCH
-          value: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-        """
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
-
-        try:
-            rewl = self._normalize_rule(rule)
-        except Exception as e:
-            self.logger.error(
-                    "problem normalizing scope rule %s - %s", rule, e)
-            return False
-
-        invalid_keys = rewl.keys() - {
-                "domain", "surt", "substring", "regex", "parent_url_regex"}
-        if invalid_keys:
-            self.logger.error(
-                    "invalid keys %s in scope rule %s", invalid_keys, rule)
-            return False
-
-        if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
-            return False
-        if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
-            return False
-        if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0:
-            return False
-        if "regex" in rewl:
-            try:
-                if not re.fullmatch(rewl["regex"], u.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rewl["regex"], e)
-                return False
-        if "parent_url_regex" in rewl:
-            if not parent_page:
-                return False
-            pu = Url(parent_page.url)
-            try:
-                if not re.fullmatch(rule["parent_url_regex"], pu.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rule["parent_url_regex"], e)
-                return False
-
-        return True
-
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "pages"

+    @staticmethod
+    def compute_id(site_id, url):
+        digest_this = "site_id:%s,url:%s" % (site_id, url)
+        return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+
     def populate_defaults(self):
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
@@ -306,8 +149,7 @@
         if not "priority" in self:
             self.priority = self._calc_priority()
         if not "id" in self:
-            digest_this = "site_id:%s,url:%s" % (self.site_id, self.url)
-            self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+            self.id = self.compute_id(self.site_id, self.url)

     def __str__(self):
         return 'Page({"id":"%s","url":"%s",...})' % (self.id, self.url)
@@ -327,7 +169,6 @@
         if not self.url:
             return None
         if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
+            self._canon_hurl = urlcanon.semantic(self.url)
+        return str(self._canon_hurl)
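Scope rules are now plain urlcanon.MatchRule objects, so the hand-rolled _normalize_rule/_scope_rule_applies machinery goes away. Judging by the retained test yaml (below), MatchRule accepts both the preferred keys (surt, domain, substring, regex, parent_url_regex) and the deprecated url_match/value style, while an unknown key like bad_thing would presumably now raise rather than be logged and ignored. A sketch (hypothetical URLs; assumes MatchRule takes parent_url as the second argument to applies(), as the code above calls it):

    import urlcanon

    rule = urlcanon.MatchRule(surt='http://(com,woop,)/fuh/')
    url = urlcanon.semantic('http://woop.com/fuh/thing.html')  # hypothetical
    rule.applies(url, None)   # True: url's surt starts with the rule's surt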

View File

@@ -33,6 +33,7 @@ import collections
 import requests
 import doublethink
 import tempfile
+import urlcanon

 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -208,7 +209,7 @@
                     "with youtube-dl json for %s", page)
             self._warcprox_write_record(
                     warcprox_address=self._proxy(site),
-                    url="youtube-dl:%s" % brozzler.fixup(page.url),
+                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                     warc_type="metadata",
                     content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                     payload=info_json.encode("utf-8"),
@@ -245,7 +246,7 @@
         def _on_screenshot(screenshot_png):
             if on_screenshot:
                 on_screenshot(screenshot_png)
-            elif self._proxy(site) and self._enable_warcprox_features(site):
+            if self._proxy(site) and self._enable_warcprox_features(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
                         "screenshot for %s", self._proxy(site), page)
@@ -253,13 +254,13 @@
                         screenshot_png)
                 self._warcprox_write_record(
                         warcprox_address=self._proxy(site),
-                        url="screenshot:%s" % brozzler.fixup(page.url, True),
+                        url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=screenshot_jpeg,
                         extra_headers=site.extra_headers())
                 self._warcprox_write_record(
                         warcprox_address=self._proxy(site),
-                        url="thumbnail:%s" % brozzler.fixup(page.url, True),
+                        url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=thumbnail_jpeg,
                         extra_headers=site.extra_headers())
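With brozzler.fixup deleted from __init__.py, the synthetic record URLs sent to warcprox are built with urlcanon.semantic instead. Roughly (hypothetical page URL):

    import urlcanon

    page_url = 'http://example.com/page'   # hypothetical
    record_url = 'screenshot:%s' % str(urlcanon.semantic(page_url))
    # -> something like 'screenshot:http://example.com/page'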

View File

@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
     name='brozzler',
-    version='1.1b9.dev202',
+    version='1.1b9.dev203',
     description='Distributed web crawling with browsers',
     url='https://github.com/internetarchive/brozzler',
     author='Noah Levitt',
@@ -68,7 +68,7 @@ setuptools.setup(
         'requests',
         'websocket-client!=0.39.0',
         'pillow==3.3.0',
-        'surt>=0.3.0',
+        'urlcanon>=0.1.dev16',
         'doublethink>=0.2.0.dev71',
         'rethinkdb>=2.3,<2.4',
         'cerberus==1.0.1',
@@ -76,7 +76,12 @@ setuptools.setup(
     ],
     extras_require={
         'dashboard': ['flask>=0.11', 'gunicorn'],
-        'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
+        'easy': [
+            'warcprox>=2.1b1.dev57',
+            'pywb',
+            'flask>=0.11',
+            'gunicorn'
+        ],
     },
     zip_safe=False,
     classifiers=[

View File

@@ -75,7 +75,6 @@ blocks:
 - domain: twitter.com
   url_match: REGEX_MATCH
   value: ^.*lang=(?!en).*$
-- bad_thing: bad rule should be ignored
 ''')

 site = brozzler.Site(None, {

View File

@@ -16,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v -s /brozzler/tests $@"
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@"