mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 12:24:20 -04:00
support for extra "blocks" and "accepts" scope rules
This commit is contained in:
parent
68abb3cb94
commit
2825ffea15
4 changed files with 51 additions and 10 deletions
|
@ -233,8 +233,7 @@ class RethinkDbFrontier:
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
for url in outlinks or []:
|
for url in outlinks or []:
|
||||||
surt_ = brozzler.site.to_surt(url)
|
surt_ = brozzler.site.to_surt(url)
|
||||||
|
if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
|
||||||
if site.is_in_scope(surt_, parent_page):
|
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
if not surt_.startswith(site.scope["surt"]):
|
if not surt_.startswith(site.scope["surt"]):
|
||||||
hops_off_surt = parent_page.hops_off_surt + 1
|
hops_off_surt = parent_page.hops_off_surt + 1
|
||||||
|
@ -258,8 +257,11 @@ class RethinkDbFrontier:
|
||||||
else:
|
else:
|
||||||
counts["rejected"] += 1
|
counts["rejected"] += 1
|
||||||
|
|
||||||
self.logger.info("%s new links added, %s existing links updated, %s links rejected, %s links blocked by robots from %s",
|
self.logger.info(
|
||||||
counts["added"], counts["updated"], counts["rejected"], counts["blocked"], parent_page)
|
"%s new links added, %s existing links updated, %s links "
|
||||||
|
"rejected, %s links blocked by robots from %s",
|
||||||
|
counts["added"], counts["updated"], counts["rejected"],
|
||||||
|
counts["blocked"], parent_page)
|
||||||
|
|
||||||
def reached_limit(self, site, e):
|
def reached_limit(self, site, e):
|
||||||
self.logger.info("reached_limit site=%s e=%s", site, e)
|
self.logger.info("reached_limit site=%s e=%s", site, e)
|
||||||
|
|
|
@ -6,6 +6,7 @@ import hashlib
|
||||||
import time
|
import time
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
import datetime
|
import datetime
|
||||||
|
import re
|
||||||
|
|
||||||
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
|
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
|
||||||
|
|
||||||
|
@ -64,22 +65,58 @@ class Site(brozzler.BaseDictable):
|
||||||
self.scope["surt"], new_scope_surt))
|
self.scope["surt"], new_scope_surt))
|
||||||
self.scope["surt"] = new_scope_surt
|
self.scope["surt"] = new_scope_surt
|
||||||
|
|
||||||
def is_in_scope(self, surt_, parent_page=None):
|
def is_in_scope(self, url, surt_=None, parent_page=None):
|
||||||
|
if not surt_:
|
||||||
|
surt_ = to_surt(url)
|
||||||
|
might_accept = False
|
||||||
|
|
||||||
if not surt_.startswith("http://") and not surt_.startswith("https://"):
|
if not surt_.startswith("http://") and not surt_.startswith("https://"):
|
||||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||||
# schemes?)
|
# schemes?)
|
||||||
return False
|
return False
|
||||||
elif (parent_page and "max_hops" in self.scope
|
elif (parent_page and "max_hops" in self.scope
|
||||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||||
return False
|
pass
|
||||||
elif surt_.startswith(self.scope["surt"]):
|
elif surt_.startswith(self.scope["surt"]):
|
||||||
return True
|
might_accept = True
|
||||||
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
||||||
"max_hops_off_surt", 0):
|
"max_hops_off_surt", 0):
|
||||||
|
might_accept = True
|
||||||
|
elif "accepts" in self.scope:
|
||||||
|
for rule in self.scope["accepts"]:
|
||||||
|
if self._scope_rule_applies(rule, url, surt_):
|
||||||
|
might_accept = True
|
||||||
|
|
||||||
|
if might_accept:
|
||||||
|
if "blocks" in self.scope:
|
||||||
|
for rule in self.scope["blocks"]:
|
||||||
|
if self._scope_rule_applies(rule, url, surt_):
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _scope_rule_applies(self, rule, url, surt_):
    """
    Report whether a single scope rule matches the given url.

    A rule is a mapping with two required keys:

    - "url_match": one of "STRING_MATCH", "REGEX_MATCH" or "SURT_MATCH"
    - "value": the string, regular expression or surt prefix to test

    STRING_MATCH applies when "value" occurs anywhere in `url`,
    REGEX_MATCH when "value" matches the whole of `url`, and SURT_MATCH
    when `surt_` starts with "value".

    Malformed rules, unknown match types and regexes that fail to
    evaluate are logged as warnings and treated as not applying.

    Args:
        rule: the scope rule mapping described above
        url: the url being tested
        surt_: the surt form of `url`

    Returns:
        True if the rule applies to the url, False otherwise.
    """
    if "url_match" not in rule or "value" not in rule:
        self.logger.warning("unable to make sense of scope rule %s", rule)
        return False

    match_type = rule["url_match"]
    value = rule["value"]

    if match_type == "STRING_MATCH":
        return value in url
    elif match_type == "REGEX_MATCH":
        # The pattern comes from site configuration, so a bad regex must
        # not abort scoping; log it and treat the rule as not applying.
        try:
            return re.fullmatch(value, url) is not None
        except Exception as e:
            self.logger.warning(
                    "caught exception matching against regex %s: %s",
                    value, e)
            return False
    elif match_type == "SURT_MATCH":
        return surt_.startswith(value)
    else:
        # `rule` is a mapping, so the offending value has to be read by
        # key; attribute access here would raise AttributeError.
        self.logger.warning("invalid rule.url_match=%s", match_type)
        return False
|
||||||
|
|
||||||
|
|
||||||
class Page(brozzler.BaseDictable):
|
class Page(brozzler.BaseDictable):
|
||||||
def __init__(
|
def __init__(
|
||||||
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
||||||
|
|
|
@ -307,7 +307,9 @@ class BrozzlerWorker:
|
||||||
site = self._frontier.claim_site(self._id)
|
site = self._frontier.claim_site(self._id)
|
||||||
self.logger.info("brozzling site %s", site)
|
self.logger.info("brozzling site %s", site)
|
||||||
ydl = self._youtube_dl(site)
|
ydl = self._youtube_dl(site)
|
||||||
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
|
th = threading.Thread(
|
||||||
|
target=lambda: self._brozzle_site(
|
||||||
|
browser, ydl, site),
|
||||||
name="BrowsingThread-{}".format(site.seed))
|
name="BrowsingThread-{}".format(site.seed))
|
||||||
th.start()
|
th.start()
|
||||||
except:
|
except:
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -2,7 +2,7 @@ import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev5',
|
version='1.1.dev6',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue