support for extra "blocks" and "accepts" scope rules

This commit is contained in:
Noah Levitt 2016-04-21 22:22:44 +00:00
parent 68abb3cb94
commit 2825ffea15
4 changed files with 51 additions and 10 deletions

View File

@ -233,8 +233,7 @@ class RethinkDbFrontier:
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
for url in outlinks or []:
surt_ = brozzler.site.to_surt(url)
if site.is_in_scope(surt_, parent_page):
if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
if brozzler.is_permitted_by_robots(site, url):
if not surt_.startswith(site.scope["surt"]):
hops_off_surt = parent_page.hops_off_surt + 1
@ -258,8 +257,11 @@ class RethinkDbFrontier:
else:
counts["rejected"] += 1
self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
self.logger.info(
"%s new links added, %s existing links updated, %s links "
"rejected, %s links blocked by robots from %s",
counts["added"], counts["updated"], counts["rejected"],
counts["blocked"], parent_page)
def reached_limit(self, site, e):
self.logger.info("reached_limit site=%s e=%s", site, e)

View File

@ -6,6 +6,7 @@ import hashlib
import time
import rethinkstuff
import datetime
import re
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
@ -64,22 +65,58 @@ class Site(brozzler.BaseDictable):
self.scope["surt"], new_scope_surt))
self.scope["surt"] = new_scope_surt
def is_in_scope(self, surt_, parent_page=None):
def is_in_scope(self, url, surt_=None, parent_page=None):
if not surt_:
surt_ = to_surt(url)
might_accept = False
if not surt_.startswith("http://") and not surt_.startswith("https://"):
# XXX doesn't belong here maybe (where? worker ignores unknown
# schemes?)
return False
elif (parent_page and "max_hops" in self.scope
and parent_page.hops_from_seed >= self.scope["max_hops"]):
return False
pass
elif surt_.startswith(self.scope["surt"]):
return True
might_accept = True
elif parent_page and parent_page.hops_off_surt < self.scope.get(
"max_hops_off_surt", 0):
might_accept = True
elif "accepts" in self.scope:
for rule in self.scope["accepts"]:
if self._scope_rule_applies(rule, url, surt_):
might_accept = True
if might_accept:
if "blocks" in self.scope:
for rule in self.scope["blocks"]:
if self._scope_rule_applies(rule, url, surt_):
return False
return True
else:
return False
def _scope_rule_applies(self, rule, url, surt_):
    """
    Report whether a single scope rule matches the given url.

    Args:
        rule: mapping with the keys "url_match" (one of "STRING_MATCH",
            "REGEX_MATCH" or "SURT_MATCH") and "value" (the substring,
            regular expression or surt prefix to test with)
        url: the url under test; used by STRING_MATCH and REGEX_MATCH
        surt_: surt form of the url; used by SURT_MATCH

    Returns:
        True if the rule matches. False if it does not match, if the rule
        is missing a required key, if "url_match" is not a recognized
        kind, or if matching the regular expression raised an exception.
        Problems with the rule itself are logged as warnings rather than
        raised.
    """
    if "url_match" not in rule or "value" not in rule:
        self.logger.warning("unable to make sense of scope rule %s", rule)
        return False

    match_kind = rule["url_match"]
    value = rule["value"]

    if match_kind == "STRING_MATCH":
        # plain substring test anywhere in the url
        return value in url

    if match_kind == "REGEX_MATCH":
        try:
            # the pattern has to match the entire url; coerce the match
            # object (or None) to a real bool for callers
            return bool(re.fullmatch(value, url))
        except Exception as e:
            # best effort: a bad pattern in a rule must not abort scoping
            self.logger.warning(
                    "caught exception matching against regex %s: %s",
                    value, e)
            return False

    if match_kind == "SURT_MATCH":
        return surt_.startswith(value)

    # rule is a mapping, so read the key by subscript; attribute access
    # (rule.url_match) raises AttributeError on a plain dict
    self.logger.warning("invalid rule.url_match=%s", match_kind)
    return False
class Page(brozzler.BaseDictable):
def __init__(
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,

View File

@ -307,8 +307,10 @@ class BrozzlerWorker:
site = self._frontier.claim_site(self._id)
self.logger.info("brozzling site %s", site)
ydl = self._youtube_dl(site)
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
name="BrowsingThread-{}".format(site.seed))
th = threading.Thread(
target=lambda: self._brozzle_site(
browser, ydl, site),
name="BrowsingThread-{}".format(site.seed))
th.start()
except:
self._browser_pool.release(browser)

View File

@ -2,7 +2,7 @@ import setuptools
import glob
setuptools.setup(name='brozzler',
version='1.1.dev5',
version='1.1.dev6',
description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler',
author='Noah Levitt',