mirror of https://github.com/internetarchive/brozzler.git

commit e90e7345a5
Merge pull request #102 from nlevitt/docs

    complete job configuration documentation
@@ -9,7 +9,7 @@ before_install:
 - sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
-- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest
+- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
 - chromium-browser --version
 - sudo apt-get update
 - sudo apt-get install --only-upgrade chromium-browser
@@ -291,75 +291,80 @@ class RethinkDbFrontier:
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()
 
+    def _build_fresh_page(self, site, parent_page, url, hops_off=0):
+        url_for_scoping = urlcanon.semantic(url)
+        url_for_crawling = urlcanon.whatwg(url)
+        hashtag = (url_for_crawling.hash_sign
+                   + url_for_crawling.fragment).decode('utf-8')
+        urlcanon.canon.remove_fragment(url_for_crawling)
+        page = brozzler.Page(self.rr, {
+            'url': str(url_for_crawling),
+            'site_id': site.id,
+            'job_id': site.job_id,
+            'hops_from_seed': parent_page.hops_from_seed + 1,
+            'via_page_id': parent_page.id,
+            'hops_off_surt': hops_off,
+            'hashtags': [hashtag] if hashtag else []})
+        return page
+
+    def _merge_page(self, existing_page, fresh_page):
+        '''
+        Utility method for merging info from `brozzler.Page` instances
+        representing the same url but with possibly different metadata.
+        '''
+        existing_page.priority += fresh_page.priority
+        existing_page.hashtags = list(set(
+            existing_page.hashtags + fresh_page.hashtags))
+        existing_page.hops_off = min(
+            existing_page.hops_off, fresh_page.hops_off)
+
     def _scope_and_enforce_robots(self, site, parent_page, outlinks):
         '''
         Returns tuple (
-        set of in scope urls (uncanonicalized) accepted by robots policy,
+        dict of {page_id: Page} of fresh `brozzler.Page` representing in
+        scope links accepted by robots policy,
         set of in scope urls (canonicalized) blocked by robots policy,
         set of out-of-scope urls (canonicalized)).
         '''
-        in_scope = set()
+        pages = {} # {page_id: Page, ...}
         blocked = set()
         out_of_scope = set()
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
+            decision = site.accept_reject_or_neither(
+                    url_for_scoping, parent_page=parent_page)
+            if decision is True:
+                hops_off = 0
+            elif decision is None:
+                decision = parent_page.hops_off < site.scope.get(
+                        'max_hops_off', 0)
+                hops_off = parent_page.hops_off + 1
+            if decision is True:
                 if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
-                    in_scope.add(url)
+                    fresh_page = self._build_fresh_page(
+                            site, parent_page, url, hops_off)
+                    if fresh_page.id in pages:
+                        self._merge_page(pages[fresh_page.id], fresh_page)
+                    else:
+                        pages[fresh_page.id] = fresh_page
                 else:
                     blocked.add(str(url_for_crawling))
             else:
                 out_of_scope.add(str(url_for_crawling))
-        return in_scope, blocked, out_of_scope
-
-    def _build_fresh_pages(self, site, parent_page, urls):
-        '''
-        Returns a dict of page_id => brozzler.Page.
-        '''
-        pages = {}
-        for url in urls:
-            url_for_scoping = urlcanon.semantic(url)
-            url_for_crawling = urlcanon.whatwg(url)
-            hashtag = (url_for_crawling.hash_sign
-                       + url_for_crawling.fragment).decode('utf-8')
-            urlcanon.canon.remove_fragment(url_for_crawling)
-            if not url_for_scoping.surt().startswith(
-                    site.scope['surt'].encode('utf-8')):
-                hops_off_surt = parent_page.hops_off_surt + 1
-            else:
-                hops_off_surt = 0
-            page = brozzler.Page(self.rr, {
-                'url': str(url_for_crawling),
-                'site_id': site.id,
-                'job_id': site.job_id,
-                'hops_from_seed': parent_page.hops_from_seed + 1,
-                'via_page_id': parent_page.id,
-                'hops_off_surt': hops_off_surt,
-                'hashtags': []})
-            if page.id in pages:
-                pages[page.id].priority += page.priority
-                page = pages[page.id]
-            else:
-                pages[page.id] = page
-            if hashtag:
-                page.hashtags = list(set(page.hashtags + [hashtag]))
-        return pages
+        return pages, blocked, out_of_scope
 
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
         counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
 
-        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
+        fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
                 site, parent_page, outlinks)
         decisions['blocked'] = blocked
         decisions['rejected'] = out_of_scope
         counts['blocked'] += len(blocked)
         counts['rejected'] += len(out_of_scope)
 
-        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)
-
         # get existing pages from rethinkdb
         results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
         pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
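The duplicate-outlink handling that ``_merge_page`` introduces can be
illustrated on plain dicts; a minimal sketch assuming the same field semantics
as the method above, not code from this commit::

    def merge_page_dicts(existing, fresh):
        # a second discovery of the same url bumps its priority
        existing['priority'] += fresh['priority']
        # hashtags accumulate, deduplicated
        existing['hashtags'] = list(set(existing['hashtags'] + fresh['hashtags']))
        # keep the smallest hops-off-scope count seen for this url
        existing['hops_off'] = min(existing['hops_off'], fresh['hops_off'])

    a = {'priority': 10, 'hashtags': ['#a'], 'hops_off': 1}
    b = {'priority': 2, 'hashtags': ['#b'], 'hops_off': 0}
    merge_page_dicts(a, b)
    assert a['priority'] == 12
    assert set(a['hashtags']) == {'#a', '#b'}
    assert a['hops_off'] == 0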
@@ -65,7 +65,7 @@ id:
 max_hops:
   type: integer
 
-max_hops_off_surt:
+max_hops_off:
   type: integer
 
 metadata:
@@ -99,7 +99,7 @@ def new_job(frontier, job_conf):
 
 def new_site(frontier, site):
     site.id = str(uuid.uuid4())
-    logging.info("new site {}".format(site))
+    logging.info("new site %s", site)
     # insert the Page into the database before the Site, to avoid situation
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
@@ -183,9 +183,24 @@ class Site(doublethink.Document, ElapsedMixIn):
             self.last_claimed = brozzler.EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope and self.seed:
-            self.scope["surt"] = brozzler.site_surt_canon(
-                    self.seed).surt().decode('ascii')
+
+        # backward compatibility
+        if "surt" in self.scope:
+            if not "accepts" in self.scope:
+                self.scope["accepts"] = []
+            self.scope["accepts"].append({"surt": self.scope["surt"]})
+            del self.scope["surt"]
+
+        # backward compatibility
+        if ("max_hops_off_surt" in self.scope
+                and not "max_hops_off" in self.scope):
+            self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
+        if "max_hops_off_surt" in self.scope:
+            del self.scope["max_hops_off_surt"]
+
+        if self.seed:
+            self._accept_ssurt_if_not_redundant(
+                    brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
 
         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
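What the backward-compatibility shim above does to an old-style scope can be
sketched in isolation; this is illustrative only, and the real
``Site.__init__`` also appends the seed's ssurt accept rule afterward::

    def migrate_scope(scope):
        # an old-style {'surt': ...} scope becomes an accept rule
        if 'surt' in scope:
            scope.setdefault('accepts', []).append({'surt': scope.pop('surt')})
        # max_hops_off_surt was renamed to max_hops_off
        if 'max_hops_off_surt' in scope:
            scope.setdefault('max_hops_off', scope['max_hops_off_surt'])
            del scope['max_hops_off_surt']
        return scope

    assert migrate_scope(
            {'surt': 'http://(com,example,)/', 'max_hops_off_surt': 1}) == {
                    'accepts': [{'surt': 'http://(com,example,)/'}],
                    'max_hops_off': 1}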
@@ -201,12 +216,20 @@ class Site(doublethink.Document, ElapsedMixIn):
     def __str__(self):
         return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
 
+    def _accept_ssurt_if_not_redundant(self, ssurt):
+        if not "accepts" in self.scope:
+            self.scope["accepts"] = []
+        simple_rule_ssurts = (
+                rule["ssurt"] for rule in self.scope["accepts"]
+                if set(rule.keys()) == {'ssurt'})
+        if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts):
+            self.logger.info(
+                    "adding ssurt %s to scope accept rules", ssurt)
+            self.scope["accepts"].append({"ssurt": ssurt})
+
     def note_seed_redirect(self, url):
-        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
-        if not new_scope_surt.startswith(self.scope["surt"]):
-            self.logger.info("changing site scope surt from {} to {}".format(
-                self.scope["surt"], new_scope_surt))
-            self.scope["surt"] = new_scope_surt
+        self._accept_ssurt_if_not_redundant(
+                brozzler.site_surt_canon(url).ssurt().decode('ascii'))
 
     def extra_headers(self):
         hdrs = {}
@@ -215,9 +238,20 @@ class Site(doublethink.Document, ElapsedMixIn):
                     self.warcprox_meta, separators=(',', ':'))
         return hdrs
 
-    def is_in_scope(self, url, parent_page=None):
+    def accept_reject_or_neither(self, url, parent_page=None):
+        '''
+        Returns `True` (accepted), `False` (rejected), or `None` (no decision).
+
+        `None` usually means rejected, unless `max_hops_off` comes into play.
+        '''
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+
+        if not url.scheme in (b'http', b'https'):
+            # XXX doesn't belong here maybe (where? worker ignores unknown
+            # schemes?)
+            return False
+
         try_parent_urls = []
         if parent_page:
             try_parent_urls.append(urlcanon.semantic(parent_page.url))
@@ -225,44 +259,36 @@ class Site(doublethink.Document, ElapsedMixIn):
             try_parent_urls.append(
                     urlcanon.semantic(parent_page.redirect_url))
 
-        might_accept = False
-        if not url.scheme in (b'http', b'https'):
-            # XXX doesn't belong here maybe (where? worker ignores unknown
-            # schemes?)
-            return False
-        elif (parent_page and "max_hops" in self.scope
+        # enforce max_hops
+        if (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
-            pass
-        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
-            might_accept = True
-        elif parent_page and parent_page.hops_off_surt < self.scope.get(
-                "max_hops_off_surt", 0):
-            might_accept = True
-        elif "accepts" in self.scope:
-            for accept_rule in self.scope["accepts"]:
-                rule = urlcanon.MatchRule(**accept_rule)
+            return False
+
+        # enforce reject rules
+        if "blocks" in self.scope:
+            for block_rule in self.scope["blocks"]:
+                rule = urlcanon.MatchRule(**block_rule)
                 if try_parent_urls:
                     for parent_url in try_parent_urls:
                         if rule.applies(url, parent_url):
-                            might_accept = True
+                            return False
                 else:
                     if rule.applies(url):
-                        might_accept = True
+                        return False
 
-        if might_accept:
-            if "blocks" in self.scope:
-                for block_rule in self.scope["blocks"]:
-                    rule = urlcanon.MatchRule(**block_rule)
-                    if try_parent_urls:
-                        for parent_url in try_parent_urls:
-                            if rule.applies(url, parent_url):
-                                return False
-                    else:
-                        if rule.applies(url):
-                            return False
-            return True
-        else:
-            return False
+        # honor accept rules
+        for accept_rule in self.scope["accepts"]:
+            rule = urlcanon.MatchRule(**accept_rule)
+            if try_parent_urls:
+                for parent_url in try_parent_urls:
+                    if rule.applies(url, parent_url):
+                        return True
+            else:
+                if rule.applies(url):
+                    return True
+
+        # no decision if we reach here
+        return None
 
 
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
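The three-valued result is consumed by the frontier together with
``max_hops_off``, as in the ``_scope_and_enforce_robots`` hunk above;
condensed here as an illustrative paraphrase, not the literal code::

    def resolve(site, parent_page, url_for_scoping):
        decision = site.accept_reject_or_neither(
                url_for_scoping, parent_page=parent_page)
        hops_off = 0
        if decision is None:
            # a "no decision" url is accepted only while the parent is
            # still within the max_hops_off budget
            decision = parent_page.hops_off < site.scope.get('max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        return decision, hops_off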
@@ -280,8 +306,12 @@ class Page(doublethink.Document):
             self.brozzle_count = 0
         if not "claimed" in self:
             self.claimed = False
-        if not "hops_off_surt" in self:
-            self.hops_off_surt = 0
+        if "hops_off_surt" in self and not "hops_off" in self:
+            self.hops_off = self.hops_off_surt
+        if "hops_off_surt" in self:
+            del self["hops_off_surt"]
+        if not "hops_off" in self:
+            self.hops_off = 0
         if not "needs_robots_check" in self:
             self.needs_robots_check = False
         if not "priority" in self:
job-conf.rst
@@ -1,17 +1,19 @@
-brozzler job configuration
+Brozzler Job Configuration
 **************************
 
-Jobs are defined using yaml files. Options may be specified either at the
-top-level or on individual seeds. At least one seed url must be specified,
+Jobs are defined using yaml files. At least one seed url must be specified,
 everything else is optional.
 
-an example
-==========
+.. contents::
+
+Example
+=======
 
 ::
 
     id: myjob
     time_limit: 60 # seconds
+    proxy: 127.0.0.1:8000 # point at warcprox for archiving
     ignore_robots: false
     max_claimed_sites: 2
     warcprox_meta:
@@ -35,15 +37,14 @@ an example
       scope:
         surt: http://(org,example,
 
-how inheritance works
+How inheritance works
 =====================
 
-Most of the available options apply to seeds. Such options can also be
-specified at the top level, in which case the seeds inherit the options. If
-an option is specified both at the top level and at the level of an individual
-seed, the results are merged with the seed-level value taking precedence in
-case of conflicts. It's probably easiest to make sense of this by way of an
-example.
+Most of the settings that apply to seeds can also be specified at the top
+level, in which case all seeds inherit those settings. If an option is
+specified both at the top level and at seed level, the results are merged with
+the seed-level value taking precedence in case of conflicts. It's probably
+easiest to make sense of this by way of an example.
 
 In the example yaml above, ``warcprox_meta`` is specified at the top level and
 at the seed level for the seed http://one.example.org/. At the top level we
@@ -79,101 +80,150 @@ Notice that:
 - Since ``buckets`` is a list, the merged result includes all the values from
   both the top level and the seed level.
 
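The merge rule just described can be sketched as follows, assuming
dictionaries merge recursively with seed-level values winning and lists
concatenating; the assertion reproduces the ``warcprox_meta`` example worked
through in this document (an editor's illustration, not brozzler code)::

    def merge(top, seed):
        merged = dict(top)
        for key, seed_val in seed.items():
            top_val = merged.get(key)
            if isinstance(top_val, dict) and isinstance(seed_val, dict):
                merged[key] = merge(top_val, seed_val)      # merge dicts
            elif isinstance(top_val, list) and isinstance(seed_val, list):
                merged[key] = top_val + seed_val            # concatenate lists
            else:
                merged[key] = seed_val                      # seed wins conflicts
        return merged

    top = {'warc-prefix': 'job1', 'stats': {'buckets': ['job1-stats']}}
    seed = {'warc-prefix': 'job1-seed1',
            'stats': {'buckets': ['job1-seed1-stats']}}
    assert merge(top, seed) == {
            'warc-prefix': 'job1-seed1',
            'stats': {'buckets': ['job1-stats', 'job1-seed1-stats']}}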
-settings reference
-==================
+Settings
+========
 
+Top-level settings
+------------------
+
 ``id``
-------
+~~~~~~
-+-----------+--------+----------+--------------------------+
-| scope     | type   | required | default                  |
-+===========+========+==========+==========================+
-| top-level | string | no       | *generated by rethinkdb* |
-+-----------+--------+----------+--------------------------+
++--------+----------+--------------------------+
+| type   | required | default                  |
++========+==========+==========================+
+| string | no       | *generated by rethinkdb* |
++--------+----------+--------------------------+
 An arbitrary identifier for this job. Must be unique across this deployment of
 brozzler.
 
-``seeds``
----------
-+-----------+------------------------+----------+---------+
-| scope     | type                   | required | default |
-+===========+========================+==========+=========+
-| top-level | list (of dictionaries) | yes      | *n/a*   |
-+-----------+------------------------+----------+---------+
-List of seeds. Each item in the list is a dictionary (associative array) which
-defines the seed. It must specify ``url`` (see below) and can additionally
-specify any of the settings of scope *seed-level*.
 
 ``max_claimed_sites``
----------------------
+~~~~~~~~~~~~~~~~~~~~~
-+-----------+--------+----------+---------+
-| scope     | type   | required | default |
-+===========+========+==========+=========+
-| top-level | number | no       | *none*  |
-+-----------+--------+----------+---------+
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| number | no       | *none*  |
++--------+----------+---------+
 Puts a cap on the number of sites belonging to a given job that can be brozzled
 simultaneously across the cluster. Addresses the problem of a job with many
 seeds starving out other jobs.
 
+``seeds``
+~~~~~~~~~
++------------------------+----------+---------+
+| type                   | required | default |
++========================+==========+=========+
+| list (of dictionaries) | yes      | *n/a*   |
++------------------------+----------+---------+
+List of seeds. Each item in the list is a dictionary (associative array) which
+defines the seed. It must specify ``url`` (see below) and can additionally
+specify any seed settings.
+
+Seed-level-only settings
+------------------------
+
+These settings can be specified only at the seed level, unlike most seed
+settings, which can also be specified at the top level.
+
 ``url``
--------
+~~~~~~~
-+------------+--------+----------+---------+
-| scope      | type   | required | default |
-+============+========+==========+=========+
-| seed-level | string | yes      | *n/a*   |
-+------------+--------+----------+---------+
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | yes      | *n/a*   |
++--------+----------+---------+
-The seed url.
+The seed url. Crawling starts here.
 
+``username``
+~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+If set, used to populate automatically detected login forms. See explanation
+at "password" below.
+
+``password``
+~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+If set, used to populate automatically detected login forms. If ``username``
+and ``password`` are configured for a seed, brozzler will look for a login form
+on each page it crawls for that seed. A form that has a single text or email
+field (the username), a single password field (``<input type="password">``),
+and has ``method="POST"`` is considered to be a login form. The form may have
+other fields like checkboxes and hidden fields. For these, brozzler will leave
+the default values in place. Brozzler submits login forms after page load.
+Then brozzling proceeds as usual.
+
+Seed-level / top-level settings
+-------------------------------
+
+These are seed settings that can also be specified at the top level, in which
+case they are inherited by all seeds.
+
 ``metadata``
-------------
+~~~~~~~~~~~~
-+-----------------------+------------+----------+---------+
-| scope                 | type       | required | default |
-+=======================+============+==========+=========+
-| seed-level, top-level | dictionary | no       | *none*  |
-+-----------------------+------------+----------+---------+
++------------+----------+---------+
+| type       | required | default |
++============+==========+=========+
+| dictionary | no       | *none*  |
++------------+----------+---------+
 Arbitrary information about the crawl job or site. Merely informative, not used
 by brozzler for anything. Could be of use to some external process.
 
 ``time_limit``
--------------- 
+~~~~~~~~~~~~~~
-+-----------------------+--------+----------+---------+
-| scope                 | type   | required | default |
-+=======================+========+==========+=========+
-| seed-level, top-level | number | no       | *none*  |
-+-----------------------+--------+----------+---------+
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| number | no       | *none*  |
++--------+----------+---------+
-Time limit in seconds. If not specified, there no time limit. Time limit is
+Time limit in seconds. If not specified, there is no time limit. Time limit is
 enforced at the seed level. If a time limit is specified at the top level, it
 is inherited by each seed as described above, and enforced individually on each
 seed.
 
+``proxy``
+~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+HTTP proxy, with the format ``host:port``. Typically configured to point to
+warcprox for archival crawling.
+
 ``ignore_robots``
------------------ 
+~~~~~~~~~~~~~~~~~
-+-----------------------+---------+----------+-----------+
-| scope                 | type    | required | default   |
-+=======================+=========+==========+===========+
-| seed-level, top-level | boolean | no       | ``false`` |
-+-----------------------+---------+----------+-----------+
++---------+----------+-----------+
+| type    | required | default   |
++=========+==========+===========+
+| boolean | no       | ``false`` |
++---------+----------+-----------+
 If set to ``true``, brozzler will happily crawl pages that would otherwise be
 blocked by robots.txt rules.
 
 ``user_agent``
--------------- 
+~~~~~~~~~~~~~~
-+-----------------------+---------+----------+---------+
-| scope                 | type    | required | default |
-+=======================+=========+==========+=========+
-| seed-level, top-level | string  | no       | *none*  |
-+-----------------------+---------+----------+---------+
++---------+----------+---------+
+| type    | required | default |
++=========+==========+=========+
+| string  | no       | *none*  |
++---------+----------+---------+
 The ``User-Agent`` header brozzler will send to identify itself to web servers.
 It's good etiquette to include a project URL with a notice to webmasters that
 explains why you're crawling, how to block the crawler with robots.txt, and how
 to contact the operator if the crawl is causing problems.
 
 ``warcprox_meta``
----------------- 
+~~~~~~~~~~~~~~~~~
-+-----------------------+------------+----------+-----------+
-| scope                 | type       | required | default   |
-+=======================+============+==========+===========+
-| seed-level, top-level | dictionary | no       | ``false`` |
-+-----------------------+------------+----------+-----------+
++------------+----------+-----------+
+| type       | required | default   |
++============+==========+===========+
+| dictionary | no       | ``false`` |
++------------+----------+-----------+
 Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
 configured. The value of the Warcprox-Meta header is a json blob. It is used to
 pass settings and information to warcprox. Warcprox does not forward the header
|
|||||||
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
|
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
|
||||||
|
|
||||||
``scope``
|
``scope``
|
||||||
---------
|
~~~~~~~~~
|
||||||
+-----------------------+------------+----------+-----------+
|
+------------+----------+-----------+
|
||||||
| scope | type | required | default |
|
| type | required | default |
|
||||||
+=======================+============+==========+===========+
|
+============+==========+===========+
|
||||||
| seed-level, top-level | dictionary | no | ``false`` |
|
| dictionary | no | ``false`` |
|
||||||
+-----------------------+------------+----------+-----------+
|
+------------+----------+-----------+
|
||||||
Scope rules. *TODO*
|
Scope specificaion for the seed. See the "Scoping" section which follows.
|
||||||
|
|
||||||
``surt``
|
Scoping
|
||||||
--------
|
=======
|
||||||
+-------------+--------+----------+---------------------------+
|
|
||||||
| scope | type | required | default |
|
The scope of a seed determines which links are scheduled for crawling and which
|
||||||
+=============+========+==========+===========================+
|
are not. Example::
|
||||||
| scope-level | string | no | *generated from seed url* |
|
|
||||||
+-------------+--------+----------+---------------------------+
|
scope:
|
||||||
|
accepts:
|
||||||
|
- ssurt: com,example,//https:/
|
||||||
|
- parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$
|
||||||
|
regex: ^https?://(www\.)?youtube.com/watch\?.*$
|
||||||
|
- surt: http://(com,google,video,
|
||||||
|
- surt: http://(com,googlevideo,
|
||||||
|
blocks:
|
||||||
|
- domain: youngscholars.unimelb.edu.au
|
||||||
|
substring: wp-login.php?action=logout
|
||||||
|
- domain: malware.us
|
||||||
|
max_hops: 20
|
||||||
|
max_hops_off: 0
|
||||||
|
|
||||||
|
Toward the end of the process of brozzling a page, brozzler obtains a list of
|
||||||
|
navigational links (``<a href="...">`` and similar) on the page, and evaluates
|
||||||
|
each link to determine whether it is in scope or out of scope for the crawl.
|
||||||
|
Then, newly discovered links that are in scope are scheduled to be crawled, and
|
||||||
|
previously discovered links get a priority bump.
|
||||||
|
|
||||||
|
How brozzler applies scope rules
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
Each scope rule has one or more conditions. If all of the conditions match,
|
||||||
|
then the scope rule as a whole matches. For example::
|
||||||
|
|
||||||
|
- domain: youngscholars.unimelb.edu.au
|
||||||
|
substring: wp-login.php?action=logout
|
||||||
|
|
||||||
|
This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or
|
||||||
|
a subdomain, and the string "wp-login.php?action=logout" is found somewhere in
|
||||||
|
the url.
|
||||||
|
|
||||||
|
Brozzler applies these logical steps to decide whether a url is in or out of
|
||||||
|
scope:
|
||||||
|
|
||||||
|
1. If the number of hops from seed is greater than ``max_hops``, the url is
|
||||||
|
**out of scope**.
|
||||||
|
2. Otherwise, if any ``block`` rule matches, the url is **out of scope**.
|
||||||
|
3. Otherwise, if any ``accept`` rule matches, the url is **in scope**.
|
||||||
|
4. Otherwise, if the url is at most ``max_hops_off`` hops from the last page
|
||||||
|
that was in scope thanks to an ``accept`` rule, the url is **in scope**.
|
||||||
|
5. Otherwise (no rules match), the url is **out of scope**.
|
||||||
|
|
||||||
|
Notably, ``block`` rules take precedence over ``accept`` rules.
|
||||||
|
|
||||||
|
It may also be helpful to think about a list of scope rules as a boolean
|
||||||
|
expression. For example::
|
||||||
|
|
||||||
|
blocks:
|
||||||
|
- domain: youngscholars.unimelb.edu.au
|
||||||
|
substring: wp-login.php?action=logout
|
||||||
|
- domain: malware.us
|
||||||
|
|
||||||
|
means block the url IF::
|
||||||
|
|
||||||
|
("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us"
|
||||||
|
|
||||||
|
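The five documented steps condense to a few lines; a sketch that assumes each
rule has already been compiled into a predicate over the url, rather than
brozzler's actual implementation::

    def in_scope(url, hops_from_seed, parent_hops_off, scope):
        if 'max_hops' in scope and hops_from_seed > scope['max_hops']:
            return False                        # step 1: too many hops
        if any(rule(url) for rule in scope.get('blocks', [])):
            return False                        # step 2: block rules win
        if any(rule(url) for rule in scope.get('accepts', [])):
            return True                         # step 3: accept rule matches
        if parent_hops_off < scope.get('max_hops_off', 0):
            return True                         # step 4: within max_hops_off
        return False                            # step 5: nothing matched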
+Automatic scoping based on seed urls
+------------------------------------
+
+Brozzler usually generates an ``accept`` scope rule based on the seed url. It
+does this to fulfill the usual expectation that everything "under" the seed
+will be crawled.
+
+To generate the rule, brozzler canonicalizes the seed url using the `urlcanon
+<https://github.com/iipc/urlcanon>`_ library's "semantic" canonicalizer, then
+removes the fragment if any, and finally serializes the result in SSURT
+[1]_ form. For example, a seed url of
+``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
+``com,example,www,//https:/foo/bar?a=b&c=d``.
+
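The example above can be reproduced with urlcanon directly; brozzler's actual
helper is ``brozzler.site_surt_canon``, which this sketch only approximates
with the plain urlcanon calls seen elsewhere in this diff::

    import urlcanon

    seed = 'https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap'
    url = urlcanon.semantic(seed)           # "semantic" canonicalization
    urlcanon.canon.remove_fragment(url)     # drop '#fdiap'
    # expected per the doc: com,example,www,//https:/foo/bar?a=b&c=d
    print(url.ssurt().decode('ascii'))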
+If the url in the browser location bar at the end of brozzling the seed page
+differs from the seed url, brozzler automatically adds a second ``accept`` rule
+to ensure the site is in scope, as if the new url were the original seed url.
+It does this so that, for example, if ``http://example.com/`` redirects to
+``http://www.example.com/``, the rest of the ``www.example.com`` site is in
+scope.
+
+Brozzler derives its general approach to the seed surt from Heritrix, but
+differs in a few respects.
+
+1. Unlike heritrix, brozzler does not strip the path segment after the last
+   slash.
+2. Canonicalization does not attempt to match heritrix exactly, though it
+   usually does match.
+3. When generating a surt for an https url, heritrix changes the scheme to
+   http. For example, the heritrix surt for ``https://www.example.com/`` is
+   ``http://(com,example,www,)`` and this means that all of
+   ``http://www.example.com/*`` and ``https://www.example.com/*`` are in
+   scope. It also means that a manually specified surt with scheme "https"
+   does not match anything. Brozzler does no scheme munging.
+4. Brozzler identifies seed "redirects" by retrieving the url from the
+   browser's location bar at the end of brozzling the seed page, whereas
+   heritrix follows http 3xx redirects.
+5. Brozzler uses ssurt instead of surt.
+6. There is currently no brozzler option to disable the automatically
+   generated ``accept`` rules.
+
+Scope settings
+--------------
+
 ``accepts``
------------ 
+~~~~~~~~~~~
-+-------------+------+----------+---------+
-| scope       | type | required | default |
-+=============+======+==========+=========+
-| scope-level | list | no       | *none*  |
-+-------------+------+----------+---------+
++------+----------+---------+
+| type | required | default |
++======+==========+=========+
+| list | no       | *none*  |
++------+----------+---------+
+List of scope rules. If any of the rules match, and the url is within
+``max_hops`` from seed, and none of the ``block`` rules apply, the url is in
+scope.
 
 ``blocks``
----------- 
+~~~~~~~~~~
-+-------------+------+----------+---------+
-| scope       | type | required | default |
-+=============+======+==========+=========+
-| scope-level | list | no       | *none*  |
-+-------------+------+----------+---------+
++------+----------+---------+
+| type | required | default |
++======+==========+=========+
+| list | no       | *none*  |
++------+----------+---------+
+List of scope rules. If any of the rules match, the url is deemed out of scope.
+
+``max_hops``
+~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| number | no       | *none*  |
++--------+----------+---------+
+Maximum number of hops from seed.
+
+``max_hops_off``
+~~~~~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| number | no       | 0       |
++--------+----------+---------+
+Expands the scope to include urls up to this many hops from the last page that
+was in scope thanks to an ``accept`` rule.
+
+Scope rule conditions
+---------------------
+
+``domain``
+~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if the host part of the canonicalized url is ``domain`` or a
+subdomain.
+
+``substring``
+~~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if ``substring`` is found anywhere in the canonicalized url.
+
+``regex``
+~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if the full canonicalized url matches ``regex``.
+
+``ssurt``
+~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``.
+
+``surt``
+~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
+
+``parent_url_regex``
+~~~~~~~~~~~~~~~~~~~~
++--------+----------+---------+
+| type   | required | default |
++========+==========+=========+
+| string | no       | *none*  |
++--------+----------+---------+
+Matches if the full canonicalized parent url matches ``parent_url_regex``. The
+parent url is the url of the page in which the link was found.
+
+.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
+.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
setup.py
@@ -2,7 +2,7 @@
 '''
 setup.py - brozzler setup script
 
-Copyright (C) 2014-2017 Internet Archive
+Copyright (C) 2014-2018 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -69,8 +69,8 @@ setuptools.setup(
         'requests',
         'websocket-client!=0.39.0',
         'pillow==3.3.0',
-        'urlcanon>=0.1.dev16',
-        'doublethink>=0.2.0.dev81',
+        'urlcanon>=0.1.dev23',
+        'doublethink>=0.2.0.dev88',
         'rethinkdb>=2.3,<2.4',
         'cerberus==1.0.1',
         'jinja2',
@@ -79,7 +79,7 @@ setuptools.setup(
     extras_require={
         'dashboard': ['flask>=0.11', 'gunicorn'],
         'easy': [
-            'warcprox>=2.4b1.dev145',
+            'warcprox>=2.4b2.dev173',
            'pywb<2',
            'flask>=0.11',
            'gunicorn'
@@ -448,13 +448,13 @@ def test_login(httpd):
     assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
 
 def test_seed_redirect(httpd):
-    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
+    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
     rr = doublethink.Rethinker('localhost', db='brozzler')
     seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
-    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port
+    assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}
 
     frontier = brozzler.RethinkDbFrontier(rr)
     brozzler.new_site(frontier, site)
@@ -478,7 +478,9 @@ def test_seed_redirect(httpd):
     assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port
 
     # check that scope has been updated properly
-    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
+    assert site.scope == {'accepts': [
+        {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
+        {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
 
 def test_hashtags(httpd):
     test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
@@ -73,9 +73,7 @@ def test_basics():
         'job_id': job.id,
         'last_claimed': brozzler.EPOCH_UTC,
         'last_disclaimed': brozzler.EPOCH_UTC,
-        'scope': {
-            'surt': 'http://(com,example,)/'
-        },
+        'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]},
         'seed': 'http://example.com',
         'starts_and_stops': [
             {
@@ -91,9 +89,7 @@ def test_basics():
         'job_id': job.id,
         'last_claimed': brozzler.EPOCH_UTC,
         'last_disclaimed': brozzler.EPOCH_UTC,
-        'scope': {
-            'surt': 'https://(org,example,)/',
-        },
+        'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]},
         'seed': 'https://example.org/',
         'starts_and_stops': [
             {
@@ -110,7 +106,7 @@ def test_basics():
         'brozzle_count': 0,
         'claimed': False,
         'hops_from_seed': 0,
-        'hops_off_surt': 0,
+        'hops_off': 0,
         'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
         'job_id': job.id,
         'needs_robots_check': True,
@@ -124,7 +120,7 @@ def test_basics():
         'brozzle_count': 0,
         'claimed': False,
         'hops_from_seed': 0,
-        'hops_off_surt': 0,
+        'hops_off': 0,
         'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
         'job_id': job.id,
         'needs_robots_check': True,
@@ -443,8 +439,7 @@ def test_field_defaults():
     brozzler.Site.table_ensure(rr)
     site = brozzler.Site(rr, {'seed': 'http://example.com/'})
     assert site.id is None
-    assert site.scope
-    assert site.scope['surt'] == 'http://(com,example,)/'
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]}
     site.save()
     assert site.id
     assert site.scope
@@ -638,11 +633,15 @@ def test_completed_page():
         'hops_from_seed': 0,
         'redirect_url':'http://example.com/b/', })
     page.save()
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     frontier.completed_page(site, page)
-    assert site.scope == {'surt': 'http://(com,example,)/b/'}
+    assert site.scope == {'accepts': [
+        {'ssurt': 'com,example,//http:/a/'},
+        {'ssurt': 'com,example,//http:/b/'}]}
     site.refresh()
-    assert site.scope == {'surt': 'http://(com,example,)/b/'}
+    assert site.scope == {'accepts': [
+        {'ssurt': 'com,example,//http:/a/'},
+        {'ssurt': 'com,example,//http:/b/'}]}
     assert page.brozzle_count == 1
     assert page.claimed == False
     page.refresh()
@@ -661,11 +660,11 @@ def test_completed_page():
         'hops_from_seed': 0,
         'redirect_url':'http://example.com/a/x/', })
     page.save()
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     frontier.completed_page(site, page)
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     site.refresh()
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     assert page.brozzle_count == 1
     assert page.claimed == False
     page.refresh()
@@ -683,11 +682,11 @@ def test_completed_page():
         'hops_from_seed': 1,
         'redirect_url':'http://example.com/d/', })
     page.save()
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     frontier.completed_page(site, page)
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     site.refresh()
-    assert site.scope == {'surt': 'http://(com,example,)/a/'}
+    assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
     assert page.brozzle_count == 1
     assert page.claimed == False
     page.refresh()
@@ -727,7 +726,7 @@ def test_hashtag_seed():
     site = brozzler.Site(rr, {'seed': 'http://example.org/'})
     brozzler.new_site(frontier, site)
 
-    assert site.scope['surt'] == 'http://(org,example,)/'
+    assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
 
     pages = list(frontier.site_pages(site.id))
     assert len(pages) == 1
@@ -738,7 +737,7 @@ def test_hashtag_seed():
     site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
     brozzler.new_site(frontier, site)
 
-    assert site.scope['surt'] == 'http://(org,example,)/'
+    assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
 
     pages = list(frontier.site_pages(site.id))
     assert len(pages) == 1
@@ -908,7 +907,7 @@ def test_choose_warcprox():
     svcreg = doublethink.ServiceRegistry(rr)
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
+    # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
     rr.table('sites').wait().run()
     rr.table('services').wait().run()
     rr.table('sites').index_wait().run()
@@ -978,3 +977,136 @@ def test_choose_warcprox():
     # clean up
     rr.table('sites').delete().run()
     rr.table('services').delete().run()
+
+def test_max_hops_off():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+    site = brozzler.Site(rr, {
+        'seed': 'http://example.com/',
+        'scope': {
+            'max_hops_off_surt': 1,
+            'blocks': [{'ssurt': 'domain,bad,'}]}})
+    brozzler.new_site(frontier, site)
+    site.refresh() # get it back from the db
+
+    # renamed this param
+    assert not 'max_hops_off_surt' in site.scope
+    assert site.scope['max_hops_off'] == 1
+
+    seed_page = frontier.seed_page(site.id)
+
+    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
+    assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None
+    assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True
+    assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False
+
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        # two of these are in scope because of max_hops_off
+        frontier.scope_and_schedule_outlinks(site, seed_page, [
+            'http://foo.org/', 'https://example.com/toot',
+            'http://example.com/toot', 'https://some.bad.domain/something'])
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+
+    assert len(pages) == 4
+    assert pages[0].url == 'http://example.com/'
+    assert pages[0].hops_off == 0
+    assert not 'hops_off_surt' in pages[0]
+    assert set(pages[0].outlinks['accepted']) == {
+            'https://example.com/toot', 'http://foo.org/',
+            'http://example.com/toot'}
+    assert pages[0].outlinks['blocked'] == []
+    assert pages[0].outlinks['rejected'] == [
+            'https://some.bad.domain/something']
+    assert {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hashtags': [],
+        'hops_from_seed': 1,
+        'hops_off': 0,
+        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
+        'job_id': None,
+        'needs_robots_check': False,
+        'priority': 12,
+        'site_id': site.id,
+        'url': 'http://example.com/toot',
+        'via_page_id': seed_page.id
+    } in pages
+    assert {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hashtags': [],
+        'hops_from_seed': 1,
+        'hops_off': 1,
+        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
+        'job_id': None,
+        'needs_robots_check': False,
+        'priority': 12,
+        'site_id': site.id,
+        'url': 'http://foo.org/',
+        'via_page_id': seed_page.id
+    } in pages
+    assert {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hashtags': [],
+        'hops_from_seed': 1,
+        'hops_off': 1,
+        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
+        'job_id': None,
+        'needs_robots_check': False,
+        'priority': 12,
+        'site_id': site.id,
+        'url': 'https://example.com/toot',
+        'via_page_id': seed_page.id
+    } in pages
+
+    # next hop is past max_hops_off, but normal in scope url is in scope
+    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, foo_page, [
+            'http://foo.org/bar', 'http://example.com/blah'])
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert foo_page == {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hashtags': [],
+        'hops_from_seed': 1,
+        'hops_off': 1,
+        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
+        'job_id': None,
+        'needs_robots_check': False,
+        'priority': 12,
+        'site_id': site.id,
+        'url': 'http://foo.org/',
+        'via_page_id': seed_page.id,
+        'outlinks': {
+            'accepted': ['http://example.com/blah'],
+            'blocked': [],
+            'rejected': ['http://foo.org/bar'],
+        }
+    }
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 5
+    assert {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hashtags': [],
+        'hops_from_seed': 2,
+        'hops_off': 0,
+        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
+        'job_id': None,
+        'needs_robots_check': False,
+        'priority': 11,
+        'site_id': site.id,
+        'url': 'http://example.com/blah',
+        'via_page_id': foo_page.id
+    } in pages
@@ -94,28 +94,28 @@ blocks:
         'url': 'http://example.com/foo/bar?baz=quux#monkey',
         'site_id': site.id})
 
-    assert site.is_in_scope('http://example.com/foo/bar', page)
-    assert not site.is_in_scope('http://example.com/foo/baz', page)
+    assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
+    assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
 
-    assert not site.is_in_scope('http://foo.com/some.mp3', page)
-    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)
+    assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
+    assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
 
-    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
-    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)
+    assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
+    assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
 
-    assert site.is_in_scope('https://twitter.com/twit', page)
-    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
-    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)
+    assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
+    assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
+    assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
 
-    assert site.is_in_scope('https://www.facebook.com/whatevz', page)
+    assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
 
-    assert not site.is_in_scope(
-            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
+    assert site.accept_reject_or_neither(
+            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
     yt_user_page = brozzler.Page(None, {
         'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
         'site_id': site.id, 'hops_from_seed': 10})
-    assert site.is_in_scope(
-            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
+    assert site.accept_reject_or_neither(
+            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
 
 def test_proxy_down():
     '''