diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 02f7f2d..9d8169c 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -233,8 +233,7 @@ class RethinkDbFrontier:
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
             surt_ = brozzler.site.to_surt(url)
-
-            if site.is_in_scope(surt_, parent_page):
+            if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
                     if not surt_.startswith(site.scope["surt"]):
                         hops_off_surt = parent_page.hops_off_surt + 1
@@ -258,8 +257,11 @@ class RethinkDbFrontier:
             else:
                 counts["rejected"] += 1
 
-        self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
-                counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
+        self.logger.info(
+                "%s new links added, %s existing links updated, %s links "
+                "rejected, %s links blocked by robots from %s",
+                counts["added"], counts["updated"], counts["rejected"],
+                counts["blocked"], parent_page)
 
     def reached_limit(self, site, e):
         self.logger.info("reached_limit site=%s e=%s", site, e)
diff --git a/brozzler/site.py b/brozzler/site.py
index 586d8b4..8ea3ab4 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -6,6 +6,7 @@ import hashlib
 import time
 import rethinkstuff
 import datetime
+import re
 
 _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
 
@@ -64,22 +65,74 @@ class Site(brozzler.BaseDictable):
                     self.scope["surt"], new_scope_surt))
             self.scope["surt"] = new_scope_surt
 
-    def is_in_scope(self, surt_, parent_page=None):
+    def is_in_scope(self, url, surt_=None, parent_page=None):
+        """
+        Returns True if `url` is in scope for this site.
+
+        `surt_` is computed from `url` if the caller does not supply it.
+        Urls that are not http(s), or whose parent page has reached
+        "max_hops", are rejected. Otherwise a url is tentatively accepted
+        if its surt starts with the scope surt, if the parent page is
+        under "max_hops_off_surt", or if an "accepts" rule applies; it is
+        then rejected after all if any "blocks" rule applies.
+        """
+        if not surt_:
+            surt_ = to_surt(url)
+        might_accept = False
+
         if not surt_.startswith("http://") and not surt_.startswith("https://"):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
         elif (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            return False
+            pass
         elif surt_.startswith(self.scope["surt"]):
-            return True
+            might_accept = True
         elif parent_page and parent_page.hops_off_surt < self.scope.get(
                 "max_hops_off_surt", 0):
+            might_accept = True
+        elif "accepts" in self.scope:
+            for rule in self.scope["accepts"]:
+                if self._scope_rule_applies(rule, url, surt_):
+                    might_accept = True
+
+        if might_accept:
+            if "blocks" in self.scope:
+                for rule in self.scope["blocks"]:
+                    if self._scope_rule_applies(rule, url, surt_):
+                        return False
             return True
         else:
             return False
 
+    def _scope_rule_applies(self, rule, url, surt_):
+        """
+        Returns True if scope rule `rule`, a dict with "url_match" and
+        "value" keys, matches `url` (or `surt_` for SURT_MATCH rules).
+        """
+        if "url_match" not in rule or "value" not in rule:
+            self.logger.warning("unable to make sense of scope rule %s", rule)
+            return False
+        if rule["url_match"] == "STRING_MATCH":
+            return rule["value"] in url
+        elif rule["url_match"] == "REGEX_MATCH":
+            try:
+                # fullmatch returns a match object or None; report a bool
+                return re.fullmatch(rule["value"], url) is not None
+            except Exception as e:
+                self.logger.warning(
+                        "caught exception matching against regex %s: %s",
+                        rule["value"], e)
+                return False
+        elif rule["url_match"] == "SURT_MATCH":
+            return surt_.startswith(rule["value"])
+        else:
+            self.logger.warning(
+                    "invalid rule.url_match=%s", rule["url_match"])
+            return False
+
+
 class Page(brozzler.BaseDictable):
     def __init__(
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 1ab33d7..7355d5a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -307,8 +307,10 @@ class BrozzlerWorker:
                             site = self._frontier.claim_site(self._id)
                             self.logger.info("brozzling site %s", site)
                             ydl = self._youtube_dl(site)
-                            th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
-                                                  name="BrowsingThread-{}".format(site.seed))
+                            th = threading.Thread(
+                                    target=lambda: self._brozzle_site(
+                                        browser, ydl, site),
+                                    name="BrowsingThread-{}".format(site.seed))
                             th.start()
                         except:
                             self._browser_pool.release(browser)
diff --git a/setup.py b/setup.py
index 1a979ce..e71d3ff 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ import setuptools
 import glob
 
 setuptools.setup(name='brozzler',
-        version='1.1.dev5',
+        version='1.1.dev6',
         description='Distributed web crawling with browsers',
         url='https://github.com/nlevitt/brozzler',
         author='Noah Levitt',