mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
support for host rules in outlink scoping
This commit is contained in:
parent
1d21f2c307
commit
0af00bb3d5
4 changed files with 119 additions and 45 deletions
|
@ -258,10 +258,10 @@ class RethinkDbFrontier:
|
||||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
for url in outlinks or []:
|
for url in outlinks or []:
|
||||||
surt_ = brozzler.site.to_surt(url)
|
u = brozzler.site.Url(url)
|
||||||
if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
|
if site.is_in_scope(u, parent_page=parent_page):
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
if not surt_.startswith(site.scope["surt"]):
|
if not u.surt.startswith(site.scope["surt"]):
|
||||||
hops_off_surt = parent_page.hops_off_surt + 1
|
hops_off_surt = parent_page.hops_off_surt + 1
|
||||||
else:
|
else:
|
||||||
hops_off_surt = 0
|
hops_off_surt = 0
|
||||||
|
|
133
brozzler/site.py
133
brozzler/site.py
|
@ -25,8 +25,62 @@ import time
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
|
import ipaddress
|
||||||
|
|
||||||
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
|
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
|
||||||
|
tzinfo=rethinkstuff.UTC)
|
||||||
|
|
||||||
|
class Url:
    """A url string with lazily computed, cached ``surt`` and ``host`` forms.

    Nothing is parsed at construction time; each derived value is computed
    on first access and then reused, so one ``Url`` can be handed to several
    scope checks without re-parsing the underlying string.
    """

    def __init__(self, url):
        """Remember *url*; derived values are filled in on demand."""
        self.url = url
        self._surt = None
        self._host = None

    @property
    def surt(self):
        """SURT string of the canonicalized url with query and hash cleared.

        Produced by ``getURLString(surt=True, trailing_comma=True)`` and
        cached after the first successful computation.
        """
        if not self._surt:
            hurl = surt.handyurl.parse(self.url)
            surt.GoogleURLCanonicalizer.canonicalize(hurl)
            hurl.query = None
            hurl.hash = None
            # XXX chop off path after last slash??
            self._surt = hurl.getURLString(surt=True, trailing_comma=True)
        return self._surt

    @property
    def host(self):
        """Host component of the url as reported by ``surt.handyurl.parse``."""
        if not self._host:
            self._host = surt.handyurl.parse(self.url).host
        return self._host

    @staticmethod
    def _is_ip_address(value):
        """Return True if *value* is accepted by ``ipaddress.ip_address``."""
        try:
            ipaddress.ip_address(value)
        except ValueError:
            # ip_address() signals "not an ip address" with ValueError; any
            # other exception is a real problem and should propagate
            return False
        return True

    def matches_ip_or_domain(self, ip_or_domain):
        """Returns true if
           - ip_or_domain is an ip address and self.host is the same ip address
           - ip_or_domain is a domain and self.host is the same domain
           - ip_or_domain is a domain and self.host is a subdomain of it

        Returns false (rather than raising) when either side is None and the
        two are not identical.
        """
        host = self.host
        if ip_or_domain == host:
            return True

        # nothing to compare label-by-label if one side is missing
        if host is None or ip_or_domain is None:
            return False

        # if either ip_or_domain or self.host are ip addresses, and they're not
        # identical (previous check), not a match
        if self._is_ip_address(ip_or_domain) or self._is_ip_address(host):
            return False

        # if we get here, we're looking at two hostnames
        # XXX do we need to handle case of one punycoded idn, other not?
        domain_parts = ip_or_domain.split(".")
        host_parts = host.split(".")

        # compare whole labels from the right, so that "ample.com" does not
        # match "example.com"
        return host_parts[-len(domain_parts):] == domain_parts
|
||||||
|
|
||||||
class Site(brozzler.BaseDictable):
|
class Site(brozzler.BaseDictable):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
@ -58,7 +112,7 @@ class Site(brozzler.BaseDictable):
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
self.scope["surt"] = self._to_surt(seed)
|
self.scope["surt"] = Url(seed).surt
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
|
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
|
||||||
|
@ -69,72 +123,95 @@ class Site(brozzler.BaseDictable):
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "Site-%s-%s" % (self.id, self.seed)
|
return "Site-%s-%s" % (self.id, self.seed)
|
||||||
|
|
||||||
def _to_surt(self, url):
|
|
||||||
hurl = surt.handyurl.parse(url)
|
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
|
||||||
hurl.query = None
|
|
||||||
hurl.hash = None
|
|
||||||
# XXX chop off path after last slash??
|
|
||||||
return hurl.getURLString(surt=True, trailing_comma=True)
|
|
||||||
|
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
new_scope_surt = self._to_surt(url)
|
new_scope_surt = Url(url).surt
|
||||||
if not new_scope_surt.startswith(self.scope["surt"]):
|
if not new_scope_surt.startswith(self.scope["surt"]):
|
||||||
self.logger.info("changing site scope surt from {} to {}".format(
|
self.logger.info("changing site scope surt from {} to {}".format(
|
||||||
self.scope["surt"], new_scope_surt))
|
self.scope["surt"], new_scope_surt))
|
||||||
self.scope["surt"] = new_scope_surt
|
self.scope["surt"] = new_scope_surt
|
||||||
|
|
||||||
def is_in_scope(self, url, surt_=None, parent_page=None):
|
def is_in_scope(self, url, parent_page=None):
|
||||||
if not surt_:
|
if not isinstance(url, Url):
|
||||||
surt_ = to_surt(url)
|
u = Url(url)
|
||||||
might_accept = False
|
else:
|
||||||
|
u = url
|
||||||
|
|
||||||
if not surt_.startswith("http://") and not surt_.startswith("https://"):
|
might_accept = False
|
||||||
|
if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||||
# schemes?)
|
# schemes?)
|
||||||
return False
|
return False
|
||||||
elif (parent_page and "max_hops" in self.scope
|
elif (parent_page and "max_hops" in self.scope
|
||||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||||
pass
|
pass
|
||||||
elif surt_.startswith(self.scope["surt"]):
|
elif u.surt.startswith(self.scope["surt"]):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
||||||
"max_hops_off_surt", 0):
|
"max_hops_off_surt", 0):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
elif "accepts" in self.scope:
|
elif "accepts" in self.scope:
|
||||||
for rule in self.scope["accepts"]:
|
for rule in self.scope["accepts"]:
|
||||||
if self._scope_rule_applies(rule, url, surt_):
|
if self._scope_rule_applies(rule, u):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
|
break
|
||||||
|
|
||||||
if might_accept:
|
if might_accept:
|
||||||
if "blocks" in self.scope:
|
if "blocks" in self.scope:
|
||||||
for rule in self.scope["blocks"]:
|
for rule in self.scope["blocks"]:
|
||||||
if self._scope_rule_applies(rule, url, surt_):
|
if self._scope_rule_applies(rule, u):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _scope_rule_applies(self, rule, url, surt_):
|
def _scope_rule_applies(self, rule, url):
|
||||||
if not "url_match" in rule or not "value" in rule:
|
"""
|
||||||
self.logger.warn("unable to make sense of scope rule %s", rule)
|
Examples of valid rules:
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"host": "monkey.org",
|
||||||
|
"url_match": "STRING_MATCH",
|
||||||
|
"value": "bar",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"url_match": "SURT_MATCH",
|
||||||
|
"value": "+http://(com,woop,)/fuh/",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"host": "badhost.com",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
if not isinstance(url, Url):
|
||||||
|
u = Url(url)
|
||||||
|
else:
|
||||||
|
u = url
|
||||||
|
|
||||||
|
if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
|
||||||
return False
|
return False
|
||||||
|
if "url_match" in rule:
|
||||||
if rule["url_match"] == "STRING_MATCH":
|
if rule["url_match"] == "STRING_MATCH":
|
||||||
return url.find(rule["value"]) >= 0
|
return u.url.find(rule["value"]) >= 0
|
||||||
elif rule["url_match"] == "REGEX_MATCH":
|
elif rule["url_match"] == "REGEX_MATCH":
|
||||||
try:
|
try:
|
||||||
return re.fullmatch(rule["value"], url)
|
return re.fullmatch(rule["value"], u.url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
"caught exception matching against regex %s: %s",
|
"caught exception matching against regex %s: %s",
|
||||||
rule["value"], e)
|
rule["value"], e)
|
||||||
return False
|
return False
|
||||||
elif rule["url_match"] == "SURT_MATCH":
|
elif rule["url_match"] == "SURT_MATCH":
|
||||||
return surt_.startswith(rule["value"])
|
return u.surt.startswith(rule["value"])
|
||||||
else:
|
else:
|
||||||
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
||||||
return False
|
return False
|
||||||
|
else:
|
||||||
|
if "host" in rule:
|
||||||
|
# we already know that it matches from earlier check
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self.logger.warn("unable to make sense of scope rule %s", rule)
|
||||||
|
return False
|
||||||
|
|
||||||
class Page(brozzler.BaseDictable):
|
class Page(brozzler.BaseDictable):
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -183,7 +260,3 @@ class Page(brozzler.BaseDictable):
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||||
return self._canon_hurl.geturl()
|
return self._canon_hurl.geturl()
|
||||||
|
|
||||||
def to_surt(url):
|
|
||||||
hurl = surt.handyurl.parse(url)
|
|
||||||
return surt.GoogleURLCanonicalizer.canonicalize(
|
|
||||||
hurl).getURLString(surt=True, trailing_comma=True)
|
|
||||||
|
|
|
@ -34,6 +34,7 @@ import socket
|
||||||
import datetime
|
import datetime
|
||||||
import collections
|
import collections
|
||||||
import requests
|
import requests
|
||||||
|
import rethinkstuff
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -20,7 +20,7 @@ import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev6',
|
version='1.1.dev7',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue