support for host rules in outlink scoping

This commit is contained in:
Noah Levitt 2016-05-03 20:52:22 +00:00
parent 1d21f2c307
commit 0af00bb3d5
4 changed files with 119 additions and 45 deletions

View file

@ -258,10 +258,10 @@ class RethinkDbFrontier:
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
counts = {"added":0,"updated":0,"rejected":0,"blocked":0} counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
for url in outlinks or []: for url in outlinks or []:
surt_ = brozzler.site.to_surt(url) u = brozzler.site.Url(url)
if site.is_in_scope(url, surt_=surt_, parent_page=parent_page): if site.is_in_scope(u, parent_page=parent_page):
if brozzler.is_permitted_by_robots(site, url): if brozzler.is_permitted_by_robots(site, url):
if not surt_.startswith(site.scope["surt"]): if not u.surt.startswith(site.scope["surt"]):
hops_off_surt = parent_page.hops_off_surt + 1 hops_off_surt = parent_page.hops_off_surt + 1
else: else:
hops_off_surt = 0 hops_off_surt = 0

View file

@ -25,8 +25,62 @@ import time
import rethinkstuff import rethinkstuff
import datetime import datetime
import re import re
import ipaddress
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC) _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=rethinkstuff.UTC)
class Url:
    """
    Wraps a url string and lazily computes derived forms of it.

    The SURT form and the host are each parsed from `self.url` on first
    access and cached on the instance afterwards.
    """

    def __init__(self, url):
        self.url = url
        self._surt = None
        self._host = None

    @property
    def surt(self):
        """
        SURT form of the url: canonicalized, with query and fragment
        removed, rendered with a trailing comma.
        """
        if self._surt is None:
            hurl = surt.handyurl.parse(self.url)
            surt.GoogleURLCanonicalizer.canonicalize(hurl)
            hurl.query = None
            hurl.hash = None
            # XXX chop off path after last slash??
            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
        return self._surt

    @property
    def host(self):
        """Host component of the url as parsed by surt.handyurl."""
        if self._host is None:
            self._host = surt.handyurl.parse(self.url).host
        return self._host

    @staticmethod
    def _is_ip_address(value):
        """Returns true if `value` is accepted by ipaddress.ip_address()."""
        try:
            ipaddress.ip_address(value)
        except ValueError:
            return False
        return True

    def matches_ip_or_domain(self, ip_or_domain):
        """Returns true if
        - ip_or_domain is an ip address and self.host is the same ip address
        - ip_or_domain is a domain and self.host is the same domain
        - ip_or_domain is a domain and self.host is a subdomain of it

        Hostnames are compared case-insensitively. Returns false if the url
        has no host, or if ip_or_domain is empty and differs from the host.
        """
        host = self.host
        if ip_or_domain == host:
            return True

        # nothing sensible to compare; also avoids calling str methods on None
        if not host or not ip_or_domain:
            return False

        # if either ip_or_domain or self.host are ip addresses, and they're not
        # identical (previous check), not a match
        if self._is_ip_address(ip_or_domain) or self._is_ip_address(host):
            return False

        # if we get here, we're looking at two hostnames
        # XXX do we need to handle case of one punycoded idn, other not?
        domain_parts = ip_or_domain.lower().split(".")
        host_parts = host.lower().split(".")

        # match whole labels only, so "ample.com" does not match "example.com"
        return host_parts[-len(domain_parts):] == domain_parts
class Site(brozzler.BaseDictable): class Site(brozzler.BaseDictable):
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -58,7 +112,7 @@ class Site(brozzler.BaseDictable):
self.scope = scope or {} self.scope = scope or {}
if not "surt" in self.scope: if not "surt" in self.scope:
self.scope["surt"] = self._to_surt(seed) self.scope["surt"] = Url(seed).surt
def __repr__(self): def __repr__(self):
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format( return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
@ -69,72 +123,95 @@ class Site(brozzler.BaseDictable):
def __str__(self): def __str__(self):
return "Site-%s-%s" % (self.id, self.seed) return "Site-%s-%s" % (self.id, self.seed)
def _to_surt(self, url):
hurl = surt.handyurl.parse(url)
surt.GoogleURLCanonicalizer.canonicalize(hurl)
hurl.query = None
hurl.hash = None
# XXX chop off path after last slash??
return hurl.getURLString(surt=True, trailing_comma=True)
def note_seed_redirect(self, url):
    """
    Re-point the site's scope surt at `url` when the seed redirected
    somewhere outside the current scope surt.

    Does nothing if the surt of `url` already starts with the current
    scope surt.
    """
    redirected_surt = Url(url).surt
    current_surt = self.scope["surt"]
    if redirected_surt.startswith(current_surt):
        return

    message = "changing site scope surt from {} to {}".format(
            current_surt, redirected_surt)
    self.logger.info(message)
    self.scope["surt"] = redirected_surt
def is_in_scope(self, url, parent_page=None):
    """
    Decide whether `url` is in scope for this site.

    `url` may be a `Url` instance or anything `Url()` accepts.
    `parent_page`, when supplied, is consulted for `hops_from_seed` and
    `hops_off_surt`.

    A url is a candidate if, in order of precedence: the "max_hops" limit
    has not been reached by the parent page, and either its surt starts
    with the scope surt, or the parent page is within "max_hops_off_surt",
    or one of the "accepts" rules applies. A candidate is then rejected if
    any "blocks" rule applies.
    """
    u = url if isinstance(url, Url) else Url(url)
    scope = self.scope

    # XXX doesn't belong here maybe (where? worker ignores unknown
    # schemes?)
    if not u.surt.startswith(("http://", "https://")):
        return False

    if (parent_page and "max_hops" in scope
            and parent_page.hops_from_seed >= scope["max_hops"]):
        # hop limit reached: nothing below may accept the url
        candidate = False
    elif u.surt.startswith(scope["surt"]):
        candidate = True
    elif parent_page and parent_page.hops_off_surt < scope.get(
            "max_hops_off_surt", 0):
        candidate = True
    else:
        candidate = any(
                self._scope_rule_applies(accept_rule, u)
                for accept_rule in scope.get("accepts", ()))

    if not candidate:
        return False

    blocked = any(
            self._scope_rule_applies(block_rule, u)
            for block_rule in scope.get("blocks", ()))
    return not blocked
def _scope_rule_applies(self, rule, url):
    """
    Returns a truthy value if the scope rule `rule` applies to `url`.

    `url` may be a `Url` instance or anything `Url()` accepts. A rule is a
    dict with an optional "host" and an optional "url_match"/"value" pair.
    When "host" is present the url's host must match it (see
    `Url.matches_ip_or_domain`); when "url_match" is present the url must
    also satisfy that match. A rule with neither, a "url_match" without a
    "value", or an unknown "url_match" is logged and does not apply.

    Note that for REGEX_MATCH the result of `re.fullmatch` is returned
    as is (a match object or None), so callers should test truthiness.

    Examples of valid rules:
    [
        {
            "host": "monkey.org",
            "url_match": "STRING_MATCH",
            "value": "bar",
        },
        {
            "url_match": "SURT_MATCH",
            "value": "+http://(com,woop,)/fuh/",
        },
        {
            "host": "badhost.com",
        },
    ]
    """
    u = url if isinstance(url, Url) else Url(url)

    if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
        return False

    if "url_match" not in rule:
        if "host" in rule:
            # we already know that it matches from earlier check
            return True
        self.logger.warning("unable to make sense of scope rule %s", rule)
        return False

    if "value" not in rule:
        # a url_match without a value would otherwise raise KeyError below
        self.logger.warning("unable to make sense of scope rule %s", rule)
        return False

    url_match = rule["url_match"]
    value = rule["value"]
    if url_match == "STRING_MATCH":
        return u.url.find(value) >= 0
    elif url_match == "REGEX_MATCH":
        try:
            return re.fullmatch(value, u.url)
        except Exception as e:
            # a bad pattern in one rule should not abort scoping
            self.logger.warning(
                    "caught exception matching against regex %s: %s",
                    value, e)
            return False
    elif url_match == "SURT_MATCH":
        return u.surt.startswith(value)
    else:
        self.logger.warning("invalid rule.url_match=%s", url_match)
        return False
class Page(brozzler.BaseDictable): class Page(brozzler.BaseDictable):
def __init__( def __init__(
@ -183,7 +260,3 @@ class Page(brozzler.BaseDictable):
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl) surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
return self._canon_hurl.geturl() return self._canon_hurl.geturl()
def to_surt(url):
hurl = surt.handyurl.parse(url)
return surt.GoogleURLCanonicalizer.canonicalize(
hurl).getURLString(surt=True, trailing_comma=True)

View file

@ -34,6 +34,7 @@ import socket
import datetime import datetime
import collections import collections
import requests import requests
import rethinkstuff
class ExtraHeaderAdder(urllib.request.BaseHandler): class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers): def __init__(self, extra_headers):

View file

@ -20,7 +20,7 @@ import setuptools
import glob import glob
setuptools.setup(name='brozzler', setuptools.setup(name='brozzler',
version='1.1.dev6', version='1.1.dev7',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler', url='https://github.com/nlevitt/brozzler',
author='Noah Levitt', author='Noah Levitt',