mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: lowercase readme.rst explain brozzler use of warcprox_meta update README copyright date bump dev version after PR #102 these ssurts are strings too fix bad copy/paste ssurts are strings now travis-ci install warcprox from github incorporate urlcanon fix update warcprox dependency to include recent fixes backward compatibility for old scope["surt"] missed a spot where is_permitted_by_robots needs monkeying handle new chrome cookie db schema describe scope rule conditions more explication of scoping update docs to match new seed ssurt behavior ok seriously tests fix more tests for new approach sans scope['surt'] s/max_hops_off_surt/max_hops_off/ new test of max_hops_off rename page.hops_off_surt to page.hops_off doublethink had a bug fix tests for new approach without scope['surt'] tests for new approach without of scope['surt'] WIP add an accept rule instead of modifying surt WIP some words on scoping WIP starting to flesh out "scoping" section WIP some explanation of automatic login WIP documentation!
This commit is contained in:
commit
5c34bd3119
@ -9,7 +9,7 @@ before_install:
|
||||
- sudo pip install ansible==2.1.3.0
|
||||
install:
|
||||
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
|
||||
- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest
|
||||
- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
|
||||
- chromium-browser --version
|
||||
- sudo apt-get update
|
||||
- sudo apt-get install --only-upgrade chromium-browser
|
||||
|
@ -112,9 +112,17 @@ class Chrome:
|
||||
try:
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute('UPDATE cookies SET persistent = 1')
|
||||
cur.execute('UPDATE cookies SET is_persistent = 1')
|
||||
except sqlite3.Error:
|
||||
self.logger.error('exception updating cookie DB', exc_info=True)
|
||||
try:
|
||||
# db schema changed around version 66, this is the old schema
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
cur = conn.cursor()
|
||||
cur.execute('UPDATE cookies SET persistent = 1')
|
||||
except sqlite3.Error:
|
||||
self.logger.error(
|
||||
'exception updating cookie DB %s', cookie_location,
|
||||
exc_info=True)
|
||||
|
||||
cookie_db = None
|
||||
try:
|
||||
|
@ -24,7 +24,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[dashboard]".\nSee README.rst for more information.',
|
||||
'brozzler[dashboard]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import doublethink
|
||||
|
@ -31,7 +31,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import argparse
|
||||
|
@ -291,75 +291,80 @@ class RethinkDbFrontier:
|
||||
{"start":doublethink.utcnow(), "stop":None})
|
||||
site.save()
|
||||
|
||||
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    '''
    Builds a new `brozzler.Page` for an outlink found on `parent_page`.

    The url is canonicalized with `urlcanon.whatwg`. Its fragment, if any,
    is stripped from the url that will be crawled and recorded separately
    as the page's single hashtag.

    Args:
        site: site the page belongs to (supplies `id` and `job_id`)
        parent_page: page on which the link was found (supplies `id` and
            `hops_from_seed`)
        url: the outlink url, uncanonicalized
        hops_off: value to record as the page's hops-off count

    Returns:
        the newly constructed `brozzler.Page`
    '''
    url_for_crawling = urlcanon.whatwg(url)
    # capture "#fragment" before it is removed from the crawl url
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    return brozzler.Page(self.rr, {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'via_page_id': parent_page.id,
        # NOTE(review): legacy key name kept as-is; Page.__init__ appears
        # to migrate it to `hops_off` -- confirm before renaming
        'hops_off_surt': hops_off,
        'hashtags': [hashtag] if hashtag else []})
|
||||
|
||||
def _merge_page(self, existing_page, fresh_page):
    '''
    Utility method for merging info from `brozzler.Page` instances
    representing the same url but with possibly different metadata.

    `existing_page` is updated in place:

    - `priority` is increased by the priority of `fresh_page`
    - `hashtags` becomes the de-duplicated union of both pages' hashtags
      (order is not preserved)
    - `hops_off` becomes the smaller of the two values

    `fresh_page` is not modified. Returns nothing.
    '''
    existing_page.priority += fresh_page.priority
    combined_hashtags = existing_page.hashtags + fresh_page.hashtags
    existing_page.hashtags = list(set(combined_hashtags))
    existing_page.hops_off = min(
        existing_page.hops_off, fresh_page.hops_off)
|
||||
|
||||
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
|
||||
'''
|
||||
Returns tuple (
|
||||
set of in scope urls (uncanonicalized) accepted by robots policy,
|
||||
dict of {page_id: Page} of fresh `brozzler.Page` representing in
|
||||
scope links accepted by robots policy,
|
||||
set of in scope urls (canonicalized) blocked by robots policy,
|
||||
set of out-of-scope urls (canonicalized)).
|
||||
'''
|
||||
in_scope = set()
|
||||
pages = {} # {page_id: Page, ...}
|
||||
blocked = set()
|
||||
out_of_scope = set()
|
||||
for url in outlinks or []:
|
||||
url_for_scoping = urlcanon.semantic(url)
|
||||
url_for_crawling = urlcanon.whatwg(url)
|
||||
urlcanon.canon.remove_fragment(url_for_crawling)
|
||||
if site.is_in_scope(url_for_scoping, parent_page=parent_page):
|
||||
decision = site.accept_reject_or_neither(
|
||||
url_for_scoping, parent_page=parent_page)
|
||||
if decision is True:
|
||||
hops_off = 0
|
||||
elif decision is None:
|
||||
decision = parent_page.hops_off < site.scope.get(
|
||||
'max_hops_off', 0)
|
||||
hops_off = parent_page.hops_off + 1
|
||||
if decision is True:
|
||||
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
||||
in_scope.add(url)
|
||||
fresh_page = self._build_fresh_page(
|
||||
site, parent_page, url, hops_off)
|
||||
if fresh_page.id in pages:
|
||||
self._merge_page(pages[fresh_page.id], fresh_page)
|
||||
else:
|
||||
pages[fresh_page.id] = fresh_page
|
||||
else:
|
||||
blocked.add(str(url_for_crawling))
|
||||
else:
|
||||
out_of_scope.add(str(url_for_crawling))
|
||||
return in_scope, blocked, out_of_scope
|
||||
|
||||
def _build_fresh_pages(self, site, parent_page, urls):
|
||||
'''
|
||||
Returns a dict of page_id => brozzler.Page.
|
||||
'''
|
||||
pages = {}
|
||||
for url in urls:
|
||||
url_for_scoping = urlcanon.semantic(url)
|
||||
url_for_crawling = urlcanon.whatwg(url)
|
||||
hashtag = (url_for_crawling.hash_sign
|
||||
+ url_for_crawling.fragment).decode('utf-8')
|
||||
urlcanon.canon.remove_fragment(url_for_crawling)
|
||||
if not url_for_scoping.surt().startswith(
|
||||
site.scope['surt'].encode('utf-8')):
|
||||
hops_off_surt = parent_page.hops_off_surt + 1
|
||||
else:
|
||||
hops_off_surt = 0
|
||||
page = brozzler.Page(self.rr, {
|
||||
'url': str(url_for_crawling),
|
||||
'site_id': site.id,
|
||||
'job_id': site.job_id,
|
||||
'hops_from_seed': parent_page.hops_from_seed + 1,
|
||||
'via_page_id': parent_page.id,
|
||||
'hops_off_surt': hops_off_surt,
|
||||
'hashtags': []})
|
||||
if page.id in pages:
|
||||
pages[page.id].priority += page.priority
|
||||
page = pages[page.id]
|
||||
else:
|
||||
pages[page.id] = page
|
||||
if hashtag:
|
||||
page.hashtags = list(set(page.hashtags + [hashtag]))
|
||||
return pages
|
||||
return pages, blocked, out_of_scope
|
||||
|
||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||
decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
|
||||
counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
|
||||
|
||||
in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
|
||||
fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots(
|
||||
site, parent_page, outlinks)
|
||||
decisions['blocked'] = blocked
|
||||
decisions['rejected'] = out_of_scope
|
||||
counts['blocked'] += len(blocked)
|
||||
counts['rejected'] += len(out_of_scope)
|
||||
|
||||
fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)
|
||||
|
||||
# get existing pages from rethinkdb
|
||||
results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
|
||||
pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
|
||||
|
@ -65,7 +65,7 @@ id:
|
||||
max_hops:
|
||||
type: integer
|
||||
|
||||
max_hops_off_surt:
|
||||
max_hops_off:
|
||||
type: integer
|
||||
|
||||
metadata:
|
||||
|
@ -99,7 +99,7 @@ def new_job(frontier, job_conf):
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site {}".format(site))
|
||||
logging.info("new site %s", site)
|
||||
# insert the Page into the database before the Site, to avoid situation
|
||||
# where a brozzler worker immediately claims the site, finds no pages
|
||||
# to crawl, and decides the site is finished
|
||||
@ -183,9 +183,24 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if not "scope" in self:
|
||||
self.scope = {}
|
||||
if not "surt" in self.scope and self.seed:
|
||||
self.scope["surt"] = brozzler.site_surt_canon(
|
||||
self.seed).surt().decode('ascii')
|
||||
|
||||
# backward compatibility
|
||||
if "surt" in self.scope:
|
||||
if not "accepts" in self.scope:
|
||||
self.scope["accepts"] = []
|
||||
self.scope["accepts"].append({"surt": self.scope["surt"]})
|
||||
del self.scope["surt"]
|
||||
|
||||
# backward compatibility
|
||||
if ("max_hops_off_surt" in self.scope
|
||||
and not "max_hops_off" in self.scope):
|
||||
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
|
||||
if "max_hops_off_surt" in self.scope:
|
||||
del self.scope["max_hops_off_surt"]
|
||||
|
||||
if self.seed:
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
brozzler.site_surt_canon(self.seed).ssurt().decode('ascii'))
|
||||
|
||||
if not "starts_and_stops" in self:
|
||||
if self.get("start_time"): # backward compatibility
|
||||
@ -201,12 +216,20 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
def __str__(self):
|
||||
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
||||
|
||||
def _accept_ssurt_if_not_redundant(self, ssurt):
    '''
    Appends `{"ssurt": ssurt}` to `self.scope["accepts"]` unless an
    existing simple rule already covers it.

    A rule is "simple" when its only key is `ssurt`. The new ssurt is
    considered redundant if it starts with the ssurt of any simple rule.
    Rules with additional conditions are ignored for this check.
    `self.scope["accepts"]` is created as an empty list if missing.

    Args:
        ssurt: the ssurt prefix (str) to accept
    '''
    if "accepts" not in self.scope:
        self.scope["accepts"] = []
    simple_rule_ssurts = (
        rule["ssurt"] for rule in self.scope["accepts"]
        if set(rule.keys()) == {'ssurt'})
    already_covered = any(
        ssurt.startswith(prefix) for prefix in simple_rule_ssurts)
    if not already_covered:
        self.logger.info(
            "adding ssurt %s to scope accept rules", ssurt)
        self.scope["accepts"].append({"ssurt": ssurt})
|
||||
|
||||
def note_seed_redirect(self, url):
|
||||
new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
|
||||
if not new_scope_surt.startswith(self.scope["surt"]):
|
||||
self.logger.info("changing site scope surt from {} to {}".format(
|
||||
self.scope["surt"], new_scope_surt))
|
||||
self.scope["surt"] = new_scope_surt
|
||||
self._accept_ssurt_if_not_redundant(
|
||||
brozzler.site_surt_canon(url).ssurt().decode('ascii'))
|
||||
|
||||
def extra_headers(self):
|
||||
hdrs = {}
|
||||
@ -215,9 +238,20 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
self.warcprox_meta, separators=(',', ':'))
|
||||
return hdrs
|
||||
|
||||
def is_in_scope(self, url, parent_page=None):
|
||||
def accept_reject_or_neither(self, url, parent_page=None):
|
||||
'''
|
||||
Returns `True` (accepted), `False` (rejected), or `None` (no decision).
|
||||
|
||||
`None` usually means rejected, unless `max_hops_off` comes into play.
|
||||
'''
|
||||
if not isinstance(url, urlcanon.ParsedUrl):
|
||||
url = urlcanon.semantic(url)
|
||||
|
||||
if not url.scheme in (b'http', b'https'):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
|
||||
try_parent_urls = []
|
||||
if parent_page:
|
||||
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
||||
@ -225,44 +259,36 @@ class Site(doublethink.Document, ElapsedMixIn):
|
||||
try_parent_urls.append(
|
||||
urlcanon.semantic(parent_page.redirect_url))
|
||||
|
||||
might_accept = False
|
||||
if not url.scheme in (b'http', b'https'):
|
||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||
# schemes?)
|
||||
return False
|
||||
elif (parent_page and "max_hops" in self.scope
|
||||
# enforce max_hops
|
||||
if (parent_page and "max_hops" in self.scope
|
||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||
pass
|
||||
elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
|
||||
might_accept = True
|
||||
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
||||
"max_hops_off_surt", 0):
|
||||
might_accept = True
|
||||
elif "accepts" in self.scope:
|
||||
for accept_rule in self.scope["accepts"]:
|
||||
rule = urlcanon.MatchRule(**accept_rule)
|
||||
return False
|
||||
|
||||
# enforce reject rules
|
||||
if "blocks" in self.scope:
|
||||
for block_rule in self.scope["blocks"]:
|
||||
rule = urlcanon.MatchRule(**block_rule)
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
might_accept = True
|
||||
return False
|
||||
else:
|
||||
if rule.applies(url):
|
||||
might_accept = True
|
||||
return False
|
||||
|
||||
if might_accept:
|
||||
if "blocks" in self.scope:
|
||||
for block_rule in self.scope["blocks"]:
|
||||
rule = urlcanon.MatchRule(**block_rule)
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
return False
|
||||
else:
|
||||
if rule.applies(url):
|
||||
return False
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
# honor accept rules
|
||||
for accept_rule in self.scope["accepts"]:
|
||||
rule = urlcanon.MatchRule(**accept_rule)
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
return True
|
||||
else:
|
||||
if rule.applies(url):
|
||||
return True
|
||||
|
||||
# no decision if we reach here
|
||||
return None
|
||||
|
||||
class Page(doublethink.Document):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
@ -280,8 +306,12 @@ class Page(doublethink.Document):
|
||||
self.brozzle_count = 0
|
||||
if not "claimed" in self:
|
||||
self.claimed = False
|
||||
if not "hops_off_surt" in self:
|
||||
self.hops_off_surt = 0
|
||||
if "hops_off_surt" in self and not "hops_off" in self:
|
||||
self.hops_off = self.hops_off_surt
|
||||
if "hops_off_surt" in self:
|
||||
del self["hops_off_surt"]
|
||||
if not "hops_off" in self:
|
||||
self.hops_off = 0
|
||||
if not "needs_robots_check" in self:
|
||||
self.needs_robots_check = False
|
||||
if not "priority" in self:
|
||||
|
@ -31,7 +31,7 @@ try:
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
'brozzler[easy]".\nSee README.rst for more information.',
|
||||
'brozzler[easy]".\nSee readme.rst for more information.',
|
||||
type(e).__name__, e)
|
||||
sys.exit(1)
|
||||
import doublethink
|
||||
@ -270,7 +270,7 @@ Run pywb like so:
|
||||
|
||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||
|
||||
See README.rst for more information.
|
||||
See readme.rst for more information.
|
||||
'''
|
||||
|
||||
# copied and pasted from cdxdomainspecific.py, only changes are commented as
|
||||
|
485
job-conf.rst
485
job-conf.rst
@ -1,17 +1,19 @@
|
||||
brozzler job configuration
|
||||
Brozzler Job Configuration
|
||||
**************************
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. At least one seed url must be specified,
|
||||
Jobs are defined using yaml files. At least one seed url must be specified,
|
||||
everything else is optional.
|
||||
|
||||
an example
|
||||
==========
|
||||
.. contents::
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
::
|
||||
|
||||
id: myjob
|
||||
time_limit: 60 # seconds
|
||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||
ignore_robots: false
|
||||
max_claimed_sites: 2
|
||||
warcprox_meta:
|
||||
@ -35,15 +37,14 @@ an example
|
||||
scope:
|
||||
surt: http://(org,example,
|
||||
|
||||
how inheritance works
|
||||
How inheritance works
|
||||
=====================
|
||||
|
||||
Most of the available options apply to seeds. Such options can also be
|
||||
specified at the top level, in which case the seeds inherit the options. If
|
||||
an option is specified both at the top level and at the level of an individual
|
||||
seed, the results are merged with the seed-level value taking precedence in
|
||||
case of conflicts. It's probably easiest to make sense of this by way of an
|
||||
example.
|
||||
Most of the settings that apply to seeds can also be specified at the top
|
||||
level, in which case all seeds inherit those settings. If an option is
|
||||
specified both at the top level and at seed level, the results are merged with
|
||||
the seed-level value taking precedence in case of conflicts. It's probably
|
||||
easiest to make sense of this by way of an example.
|
||||
|
||||
In the example yaml above, ``warcprox_meta`` is specified at the top level and
|
||||
at the seed level for the seed http://one.example.org/. At the top level we
|
||||
@ -79,106 +80,155 @@ Notice that:
|
||||
- Since ``buckets`` is a list, the merged result includes all the values from
|
||||
both the top level and the seed level.
|
||||
|
||||
settings reference
|
||||
==================
|
||||
Settings
|
||||
========
|
||||
|
||||
Top-level settings
|
||||
------------------
|
||||
|
||||
``id``
|
||||
------
|
||||
+-----------+--------+----------+--------------------------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+==========================+
|
||||
| top-level | string | no | *generated by rethinkdb* |
|
||||
+-----------+--------+----------+--------------------------+
|
||||
~~~~~~
|
||||
+--------+----------+--------------------------+
|
||||
| type | required | default |
|
||||
+========+==========+==========================+
|
||||
| string | no | *generated by rethinkdb* |
|
||||
+--------+----------+--------------------------+
|
||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||
brozzler.
|
||||
|
||||
``seeds``
|
||||
---------
|
||||
+-----------+------------------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========================+==========+=========+
|
||||
| top-level | list (of dictionaries) | yes | *n/a* |
|
||||
+-----------+------------------------+----------+---------+
|
||||
List of seeds. Each item in the list is a dictionary (associative array) which
|
||||
defines the seed. It must specify ``url`` (see below) and can additionally
|
||||
specify any of the settings of scope *seed-level*.
|
||||
|
||||
``max_claimed_sites``
|
||||
---------------------
|
||||
+-----------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+=========+
|
||||
| top-level | number | no | *none* |
|
||||
+-----------+--------+----------+---------+
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| number | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Puts a cap on the number of sites belonging to a given job that can be brozzled
|
||||
simultaneously across the cluster. Addresses the problem of a job with many
|
||||
seeds starving out other jobs.
|
||||
|
||||
``seeds``
|
||||
~~~~~~~~~
|
||||
+------------------------+----------+---------+
|
||||
| type | required | default |
|
||||
+========================+==========+=========+
|
||||
| list (of dictionaries) | yes | *n/a* |
|
||||
+------------------------+----------+---------+
|
||||
List of seeds. Each item in the list is a dictionary (associative array) which
|
||||
defines the seed. It must specify ``url`` (see below) and can additionally
|
||||
specify any seed settings.
|
||||
|
||||
Seed-level-only settings
|
||||
------------------------
|
||||
These settings can be specified only at the seed level, unlike most seed
|
||||
settings, which can also be specified at the top level.
|
||||
|
||||
``url``
|
||||
-------
|
||||
+------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+============+========+==========+=========+
|
||||
| seed-level | string | yes | *n/a* |
|
||||
+------------+--------+----------+---------+
|
||||
The seed url.
|
||||
~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | yes | *n/a* |
|
||||
+--------+----------+---------+
|
||||
The seed url. Crawling starts here.
|
||||
|
||||
``username``
|
||||
~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
If set, used to populate automatically detected login forms. See explanation at
|
||||
"password" below.
|
||||
|
||||
``password``
|
||||
~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
If set, used to populate automatically detected login forms. If ``username``
|
||||
and ``password`` are configured for a seed, brozzler will look for a login form
|
||||
on each page it crawls for that seed. A form that has a single text or email
|
||||
field (the username), a single password field (``<input type="password">``),
|
||||
and has ``method="POST"`` is considered to be a login form. The form may have
|
||||
other fields like checkboxes and hidden fields. For these, brozzler will leave
|
||||
the default values in place. Brozzler submits login forms after page load.
|
||||
Then brozzling proceeds as usual.
|
||||
|
||||
Seed-level / top-level settings
|
||||
-------------------------------
|
||||
These are seed settings that can also be specified at the top level, in which
|
||||
case they are inherited by all seeds.
|
||||
|
||||
``metadata``
|
||||
------------
|
||||
+-----------------------+------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+=========+
|
||||
| seed-level, top-level | dictionary | no | *none* |
|
||||
+-----------------------+------------+----------+---------+
|
||||
~~~~~~~~~~~~
|
||||
+------------+----------+---------+
|
||||
| type | required | default |
|
||||
+============+==========+=========+
|
||||
| dictionary | no | *none* |
|
||||
+------------+----------+---------+
|
||||
Arbitrary information about the crawl job or site. Merely informative, not used
|
||||
by brozzler for anything. Could be of use to some external process.
|
||||
|
||||
``time_limit``
|
||||
--------------
|
||||
+-----------------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+========+==========+=========+
|
||||
| seed-level, top-level | number | no | *none* |
|
||||
+-----------------------+--------+----------+---------+
|
||||
Time limit in seconds. If not specified, there no time limit. Time limit is
|
||||
~~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| number | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Time limit in seconds. If not specified, there is no time limit. Time limit is
|
||||
enforced at the seed level. If a time limit is specified at the top level, it
|
||||
is inherited by each seed as described above, and enforced individually on each
|
||||
seed.
|
||||
|
||||
``proxy``
|
||||
~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
HTTP proxy, with the format ``host:port``. Typically configured to point to
|
||||
warcprox for archival crawling.
|
||||
|
||||
``ignore_robots``
|
||||
-----------------
|
||||
+-----------------------+---------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+===========+
|
||||
| seed-level, top-level | boolean | no | ``false`` |
|
||||
+-----------------------+---------+----------+-----------+
|
||||
~~~~~~~~~~~~~~~~~
|
||||
+---------+----------+-----------+
|
||||
| type | required | default |
|
||||
+=========+==========+===========+
|
||||
| boolean | no | ``false`` |
|
||||
+---------+----------+-----------+
|
||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||
blocked by robots.txt rules.
|
||||
|
||||
``user_agent``
|
||||
--------------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
| seed-level, top-level | string | no | *none* |
|
||||
+-----------------------+---------+----------+---------+
|
||||
~~~~~~~~~~~~~~
|
||||
+---------+----------+---------+
|
||||
| type | required | default |
|
||||
+=========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+---------+----------+---------+
|
||||
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
||||
It's good etiquette to include a project URL with a notice to webmasters that
|
||||
explains why you're crawling, how to block the crawler via robots.txt and how to
|
||||
contact the operator if the crawl is causing problems.
|
||||
|
||||
``warcprox_meta``
|
||||
-----------------
|
||||
+-----------------------+------------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+===========+
|
||||
| seed-level, top-level | dictionary | no | ``false`` |
|
||||
+-----------------------+------------+----------+-----------+
|
||||
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
|
||||
configured. The value of the Warcprox-Meta header is a json blob. It is used to
|
||||
pass settings and information to warcprox. Warcprox does not forward the header
|
||||
on to the remote site. See the warcprox docs for more information (XXX not yet
|
||||
written).
|
||||
~~~~~~~~~~~~~~~~~
|
||||
+------------+----------+-----------+
|
||||
| type | required | default |
|
||||
+============+==========+===========+
|
||||
| dictionary | no | ``false`` |
|
||||
+------------+----------+-----------+
|
||||
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
||||
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
||||
used to pass settings and information to warcprox. Warcprox does not forward
|
||||
the header on to the remote site. For full documentation on ``warcprox-meta``
|
||||
see https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||
|
||||
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
||||
json and populates the Warcprox-Meta header with that value. For example::
|
||||
@ -195,36 +245,259 @@ becomes::
|
||||
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
|
||||
|
||||
``scope``
|
||||
---------
|
||||
+-----------------------+------------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+===========+
|
||||
| seed-level, top-level | dictionary | no | ``false`` |
|
||||
+-----------------------+------------+----------+-----------+
|
||||
Scope rules. *TODO*
|
||||
~~~~~~~~~
|
||||
+------------+----------+-----------+
|
||||
| type | required | default |
|
||||
+============+==========+===========+
|
||||
| dictionary | no | ``false`` |
|
||||
+------------+----------+-----------+
|
||||
Scope specification for the seed. See the "Scoping" section which follows.
|
||||
|
||||
``surt``
|
||||
--------
|
||||
+-------------+--------+----------+---------------------------+
|
||||
| scope | type | required | default |
|
||||
+=============+========+==========+===========================+
|
||||
| scope-level | string | no | *generated from seed url* |
|
||||
+-------------+--------+----------+---------------------------+
|
||||
Scoping
|
||||
=======
|
||||
|
||||
The scope of a seed determines which links are scheduled for crawling and which
|
||||
are not. Example::
|
||||
|
||||
scope:
|
||||
accepts:
|
||||
- ssurt: com,example,//https:/
|
||||
- parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$
|
||||
regex: ^https?://(www\.)?youtube.com/watch\?.*$
|
||||
- surt: http://(com,google,video,
|
||||
- surt: http://(com,googlevideo,
|
||||
blocks:
|
||||
- domain: youngscholars.unimelb.edu.au
|
||||
substring: wp-login.php?action=logout
|
||||
- domain: malware.us
|
||||
max_hops: 20
|
||||
max_hops_off: 0
|
||||
|
||||
Toward the end of the process of brozzling a page, brozzler obtains a list of
|
||||
navigational links (``<a href="...">`` and similar) on the page, and evaluates
|
||||
each link to determine whether it is in scope or out of scope for the crawl.
|
||||
Then, newly discovered links that are in scope are scheduled to be crawled, and
|
||||
previously discovered links get a priority bump.
|
||||
|
||||
How brozzler applies scope rules
|
||||
--------------------------------
|
||||
|
||||
Each scope rule has one or more conditions. If all of the conditions match,
|
||||
then the scope rule as a whole matches. For example::
|
||||
|
||||
- domain: youngscholars.unimelb.edu.au
|
||||
substring: wp-login.php?action=logout
|
||||
|
||||
This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or
|
||||
a subdomain, and the string "wp-login.php?action=logout" is found somewhere in
|
||||
the url.
|
||||
|
||||
Brozzler applies these logical steps to decide whether a url is in or out of
|
||||
scope:
|
||||
|
||||
1. If the number of hops from seed is greater than ``max_hops``, the url is
|
||||
**out of scope**.
|
||||
2. Otherwise, if any ``block`` rule matches, the url is **out of scope**.
|
||||
3. Otherwise, if any ``accept`` rule matches, the url is **in scope**.
|
||||
4. Otherwise, if the url is at most ``max_hops_off`` hops from the last page
|
||||
that was in scope thanks to an ``accept`` rule, the url is **in scope**.
|
||||
5. Otherwise (no rules match), the url is **out of scope**.
|
||||
|
||||
Notably, ``block`` rules take precedence over ``accept`` rules.
|
||||
|
||||
It may also be helpful to think about a list of scope rules as a boolean
|
||||
expression. For example::
|
||||
|
||||
blocks:
|
||||
- domain: youngscholars.unimelb.edu.au
|
||||
substring: wp-login.php?action=logout
|
||||
- domain: malware.us
|
||||
|
||||
means block the url IF::
|
||||
|
||||
("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us"
|
||||
|
||||
Automatic scoping based on seed urls
|
||||
------------------------------------
|
||||
Brozzler usually generates an ``accept`` scope rule based on the seed url. It
|
||||
does this to fulfill the usual expectation that everything "under" the seed
|
||||
will be crawled.
|
||||
|
||||
To generate the rule, brozzler canonicalizes the seed url using the `urlcanon
|
||||
<https://github.com/iipc/urlcanon>`_ library's "semantic" canonicalizer, then
|
||||
removes the query string if any, and finally serializes the result in SSURT
|
||||
[1]_ form. For example, a seed url of
|
||||
``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes
|
||||
``com,example,www,//https:/foo/bar``.
|
||||
|
||||
If the url in the browser location bar at the end of brozzling the seed page
|
||||
differs from the seed url, brozzler automatically adds a second ``accept`` rule
|
||||
to ensure the site is in scope, as if the new url were the original seed url.
|
||||
It does this so that, for example, if ``http://example.com/`` redirects to
|
||||
``http://www.example.com/``, the rest of ``www.example.com`` is in scope.
|
||||
|
||||
Brozzler derives its general approach to the seed surt from Heritrix, but
|
||||
differs in a few respects.
|
||||
|
||||
1. Unlike heritrix, brozzler does not strip the path segment after the last
|
||||
slash.
|
||||
2. Canonicalization does not attempt to match heritrix exactly, though it
|
||||
usually does match.
|
||||
3. When generating a surt for an https url, heritrix changes the scheme to
|
||||
http. For example, the heritrix surt for ``https://www.example.com/`` is
|
||||
``http://(com,example,www,)`` and this means that all of
|
||||
``http://www.example.com/*`` and ``https://www.example.com/*`` are in
|
||||
scope. It also means that a manually specified surt with scheme "https" does
|
||||
not match anything. Brozzler does no scheme munging.
|
||||
4. Brozzler identifies seed "redirects" by retrieving the url from the
|
||||
browser's location bar at the end of brozzling the seed page, whereas
|
||||
heritrix follows http 3xx redirects.
|
||||
5. Brozzler uses ssurt instead of surt.
|
||||
6. There is currently no brozzler option to disable the automatically generated
|
||||
``accept`` rules.
|
||||
|
||||
Scope settings
|
||||
--------------
|
||||
|
||||
``accepts``
|
||||
-----------
|
||||
+-------------+------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=============+======+==========+=========+
|
||||
| scope-level | list | no | *none* |
|
||||
+-------------+------+----------+---------+
|
||||
~~~~~~~~~~~
|
||||
+------+----------+---------+
|
||||
| type | required | default |
|
||||
+======+==========+=========+
|
||||
| list | no | *none* |
|
||||
+------+----------+---------+
|
||||
List of scope rules. If any of the rules match, and the url is within
|
||||
``max_hops`` from seed, and none of the ``block`` rules apply, the url is in
|
||||
scope.
|
||||
|
||||
``blocks``
|
||||
-----------
|
||||
+-------------+------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=============+======+==========+=========+
|
||||
| scope-level | list | no | *none* |
|
||||
+-------------+------+----------+---------+
|
||||
~~~~~~~~~~~
|
||||
+------+----------+---------+
|
||||
| type | required | default |
|
||||
+======+==========+=========+
|
||||
| list | no | *none* |
|
||||
+------+----------+---------+
|
||||
List of scope rules. If any of the rules match, the url is deemed out of scope.
|
||||
|
||||
``max_hops``
|
||||
~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| number | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Maximum number of hops from seed.
|
||||
|
||||
``max_hops_off``
|
||||
~~~~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| number | no | 0 |
|
||||
+--------+----------+---------+
|
||||
Expands the scope to include urls up to this many hops from the last page that
|
||||
was in scope thanks to an ``accept`` rule.
|
||||
|
||||
Scope rule conditions
|
||||
---------------------
|
||||
|
||||
``domain``
|
||||
~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if the host part of the canonicalized url is ``domain`` or a
|
||||
subdomain.
|
||||
|
||||
``substring``
|
||||
~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if ``substring`` is found anywhere in the canonicalized url.
|
||||
|
||||
``regex``
|
||||
~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if the full canonicalized url matches ``regex``.
|
||||
|
||||
``ssurt``
|
||||
~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``.
|
||||
|
||||
``surt``
|
||||
~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if the canonicalized url in SURT [2]_ form starts with ``surt``.
|
||||
|
||||
``parent_url_regex``
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
Matches if the full canonicalized parent url matches ``parent_url_regex``. The parent url
|
||||
is the url of the page in which the link was found.
|
||||
|
||||
Using ``warcprox_meta``
|
||||
=======================
|
||||
``warcprox_meta`` deserves some more discussion. It plays a very important role
|
||||
in brozzler job configuration. ``warcprox_meta`` is the way you set the
|
||||
filenames of the warcs for your crawl. For example, if each seed should have a
|
||||
different warc name prefix, you might have a job configured this way::
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/
|
||||
warcprox_meta:
|
||||
warc-prefix: seed1
|
||||
- url: https://archive.org/
|
||||
warcprox_meta:
|
||||
warc-prefix: seed2
|
||||
|
||||
``warcprox_meta`` is also the way to put limits on the size of the crawl job.
|
||||
For example, this configuration will stop the crawl after about 100 MB of novel
|
||||
content has been crawled::
|
||||
|
||||
seeds:
|
||||
- url: https://example.com/
|
||||
- url: https://archive.org/
|
||||
warcprox_meta:
|
||||
stats:
|
||||
buckets:
|
||||
- my-job
|
||||
limits:
|
||||
my-job/new/wire_bytes: 100000000
|
||||
|
||||
To prevent any urls from a host from being captured, it's not sufficient to use
|
||||
a ``scope`` rule as described above. That kind of scoping only applies to
|
||||
navigational links discovered in crawled pages. To make absolutely sure no url
|
||||
from a given host is fetched, not even (say) an image embedded in a page, use
|
||||
``warcprox_meta`` like so::
|
||||
|
||||
warcprox_meta:
|
||||
blocks:
|
||||
- domain: spammy.com
|
||||
|
||||
For complete documentation on the ``warcprox-meta`` request header, see
|
||||
https://github.com/internetarchive/warcprox/blob/master/api.rst#warcprox-meta-http-request-header
|
||||
|
||||
.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst
|
||||
.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html
|
||||
|
@ -69,27 +69,19 @@ does not take advantage of brozzler's distributed nature.*
|
||||
Installation and Usage
|
||||
----------------------
|
||||
|
||||
To install brozzler only:
|
||||
|
||||
::
|
||||
To install brozzler only::
|
||||
|
||||
pip install brozzler # in a virtualenv if desired
|
||||
|
||||
Launch one or more workers:
|
||||
|
||||
::
|
||||
Launch one or more workers::
|
||||
|
||||
brozzler-worker --warcprox-auto
|
||||
|
||||
Submit jobs:
|
||||
|
||||
::
|
||||
Submit jobs::
|
||||
|
||||
brozzler-new-job myjob.yaml
|
||||
|
||||
Submit sites not tied to a job:
|
||||
|
||||
::
|
||||
Submit sites not tied to a job::
|
||||
|
||||
brozzler-new-site --time-limit=600 http://example.com/
|
||||
|
||||
@ -208,7 +200,7 @@ Chrome's regular mode instead.
|
||||
License
|
||||
-------
|
||||
|
||||
Copyright 2015-2017 Internet Archive
|
||||
Copyright 2015-2018 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
not use this software except in compliance with the License. You may
|
12
setup.py
12
setup.py
@ -2,7 +2,7 @@
|
||||
'''
|
||||
setup.py - brozzler setup script
|
||||
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
Copyright (C) 2014-2018 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -32,12 +32,12 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b13.dev288',
|
||||
version='1.1b13.dev290',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
author_email='nlevitt@archive.org',
|
||||
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
||||
long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
|
||||
license='Apache License 2.0',
|
||||
packages=['brozzler', 'brozzler.dashboard'],
|
||||
package_data={
|
||||
@ -69,8 +69,8 @@ setuptools.setup(
|
||||
'requests',
|
||||
'websocket-client!=0.39.0',
|
||||
'pillow==3.3.0',
|
||||
'urlcanon>=0.1.dev16',
|
||||
'doublethink>=0.2.0.dev81',
|
||||
'urlcanon>=0.1.dev23',
|
||||
'doublethink>=0.2.0.dev88',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'cerberus==1.0.1',
|
||||
'jinja2',
|
||||
@ -79,7 +79,7 @@ setuptools.setup(
|
||||
extras_require={
|
||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||
'easy': [
|
||||
'warcprox>=2.4b1.dev145',
|
||||
'warcprox>=2.4b2.dev173',
|
||||
'pywb<2',
|
||||
'flask>=0.11',
|
||||
'gunicorn'
|
||||
|
@ -448,13 +448,13 @@ def test_login(httpd):
|
||||
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||
|
||||
def test_seed_redirect(httpd):
|
||||
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
|
||||
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||
seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
|
||||
site = brozzler.Site(rr, {
|
||||
'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
|
||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||
assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port
|
||||
assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}
|
||||
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
brozzler.new_site(frontier, site)
|
||||
@ -478,7 +478,9 @@ def test_seed_redirect(httpd):
|
||||
assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port
|
||||
|
||||
# check that scope has been updated properly
|
||||
assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
|
||||
{'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
|
||||
|
||||
def test_hashtags(httpd):
|
||||
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
|
||||
|
@ -73,9 +73,7 @@ def test_basics():
|
||||
'job_id': job.id,
|
||||
'last_claimed': brozzler.EPOCH_UTC,
|
||||
'last_disclaimed': brozzler.EPOCH_UTC,
|
||||
'scope': {
|
||||
'surt': 'http://(com,example,)/'
|
||||
},
|
||||
'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]},
|
||||
'seed': 'http://example.com',
|
||||
'starts_and_stops': [
|
||||
{
|
||||
@ -91,9 +89,7 @@ def test_basics():
|
||||
'job_id': job.id,
|
||||
'last_claimed': brozzler.EPOCH_UTC,
|
||||
'last_disclaimed': brozzler.EPOCH_UTC,
|
||||
'scope': {
|
||||
'surt': 'https://(org,example,)/',
|
||||
},
|
||||
'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]},
|
||||
'seed': 'https://example.org/',
|
||||
'starts_and_stops': [
|
||||
{
|
||||
@ -110,7 +106,7 @@ def test_basics():
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hops_from_seed': 0,
|
||||
'hops_off_surt': 0,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
|
||||
'job_id': job.id,
|
||||
'needs_robots_check': True,
|
||||
@ -124,7 +120,7 @@ def test_basics():
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hops_from_seed': 0,
|
||||
'hops_off_surt': 0,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
|
||||
'job_id': job.id,
|
||||
'needs_robots_check': True,
|
||||
@ -443,8 +439,7 @@ def test_field_defaults():
|
||||
brozzler.Site.table_ensure(rr)
|
||||
site = brozzler.Site(rr, {'seed': 'http://example.com/'})
|
||||
assert site.id is None
|
||||
assert site.scope
|
||||
assert site.scope['surt'] == 'http://(com,example,)/'
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]}
|
||||
site.save()
|
||||
assert site.id
|
||||
assert site.scope
|
||||
@ -638,11 +633,15 @@ def test_completed_page():
|
||||
'hops_from_seed': 0,
|
||||
'redirect_url':'http://example.com/b/', })
|
||||
page.save()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
frontier.completed_page(site, page)
|
||||
assert site.scope == {'surt': 'http://(com,example,)/b/'}
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,example,//http:/a/'},
|
||||
{'ssurt': 'com,example,//http:/b/'}]}
|
||||
site.refresh()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/b/'}
|
||||
assert site.scope == {'accepts': [
|
||||
{'ssurt': 'com,example,//http:/a/'},
|
||||
{'ssurt': 'com,example,//http:/b/'}]}
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
page.refresh()
|
||||
@ -661,11 +660,11 @@ def test_completed_page():
|
||||
'hops_from_seed': 0,
|
||||
'redirect_url':'http://example.com/a/x/', })
|
||||
page.save()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
frontier.completed_page(site, page)
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
site.refresh()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
page.refresh()
|
||||
@ -683,11 +682,11 @@ def test_completed_page():
|
||||
'hops_from_seed': 1,
|
||||
'redirect_url':'http://example.com/d/', })
|
||||
page.save()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
frontier.completed_page(site, page)
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
site.refresh()
|
||||
assert site.scope == {'surt': 'http://(com,example,)/a/'}
|
||||
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]}
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
page.refresh()
|
||||
@ -727,7 +726,7 @@ def test_hashtag_seed():
|
||||
site = brozzler.Site(rr, {'seed': 'http://example.org/'})
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
assert site.scope['surt'] == 'http://(org,example,)/'
|
||||
assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
|
||||
|
||||
pages = list(frontier.site_pages(site.id))
|
||||
assert len(pages) == 1
|
||||
@ -738,7 +737,7 @@ def test_hashtag_seed():
|
||||
site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
assert site.scope['surt'] == 'http://(org,example,)/'
|
||||
assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
|
||||
|
||||
pages = list(frontier.site_pages(site.id))
|
||||
assert len(pages) == 1
|
||||
@ -908,7 +907,7 @@ def test_choose_warcprox():
|
||||
svcreg = doublethink.ServiceRegistry(rr)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
||||
# avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
||||
rr.table('sites').wait().run()
|
||||
rr.table('services').wait().run()
|
||||
rr.table('sites').index_wait().run()
|
||||
@ -978,3 +977,136 @@ def test_choose_warcprox():
|
||||
# clean up
|
||||
rr.table('sites').delete().run()
|
||||
rr.table('services').delete().run()
|
||||
|
||||
def test_max_hops_off():
|
||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(rr, {
|
||||
'seed': 'http://example.com/',
|
||||
'scope': {
|
||||
'max_hops_off_surt': 1,
|
||||
'blocks': [{'ssurt': 'domain,bad,'}]}})
|
||||
brozzler.new_site(frontier, site)
|
||||
site.refresh() # get it back from the db
|
||||
|
||||
# renamed this param
|
||||
assert not 'max_hops_off_surt' in site.scope
|
||||
assert site.scope['max_hops_off'] == 1
|
||||
|
||||
seed_page = frontier.seed_page(site.id)
|
||||
|
||||
assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
|
||||
assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None
|
||||
assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True
|
||||
assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False
|
||||
|
||||
orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
|
||||
brozzler.is_permitted_by_robots = lambda *args: True
|
||||
try:
|
||||
# two of these are in scope because of max_hops_off
|
||||
frontier.scope_and_schedule_outlinks(site, seed_page, [
|
||||
'http://foo.org/', 'https://example.com/toot',
|
||||
'http://example.com/toot', 'https://some.bad.domain/something'])
|
||||
finally:
|
||||
brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
|
||||
|
||||
pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
|
||||
|
||||
assert len(pages) == 4
|
||||
assert pages[0].url == 'http://example.com/'
|
||||
assert pages[0].hops_off == 0
|
||||
assert not 'hops_off_surt' in pages[0]
|
||||
assert set(pages[0].outlinks['accepted']) == {
|
||||
'https://example.com/toot', 'http://foo.org/',
|
||||
'http://example.com/toot'}
|
||||
assert pages[0].outlinks['blocked'] == []
|
||||
assert pages[0].outlinks['rejected'] == [
|
||||
'https://some.bad.domain/something']
|
||||
assert {
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hashtags': [],
|
||||
'hops_from_seed': 1,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
|
||||
'job_id': None,
|
||||
'needs_robots_check': False,
|
||||
'priority': 12,
|
||||
'site_id': site.id,
|
||||
'url': 'http://example.com/toot',
|
||||
'via_page_id': seed_page.id
|
||||
} in pages
|
||||
assert {
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hashtags': [],
|
||||
'hops_from_seed': 1,
|
||||
'hops_off': 1,
|
||||
'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
|
||||
'job_id': None,
|
||||
'needs_robots_check': False,
|
||||
'priority': 12,
|
||||
'site_id': site.id,
|
||||
'url': 'http://foo.org/',
|
||||
'via_page_id': seed_page.id
|
||||
} in pages
|
||||
assert {
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hashtags': [],
|
||||
'hops_from_seed': 1,
|
||||
'hops_off': 1,
|
||||
'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
|
||||
'job_id': None,
|
||||
'needs_robots_check': False,
|
||||
'priority': 12,
|
||||
'site_id': site.id,
|
||||
'url': 'https://example.com/toot',
|
||||
'via_page_id': seed_page.id
|
||||
} in pages
|
||||
|
||||
# next hop is past max_hops_off, but normal in scope url is in scope
|
||||
foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
|
||||
orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
|
||||
brozzler.is_permitted_by_robots = lambda *args: True
|
||||
try:
|
||||
frontier.scope_and_schedule_outlinks(site, foo_page, [
|
||||
'http://foo.org/bar', 'http://example.com/blah'])
|
||||
finally:
|
||||
brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
|
||||
assert foo_page == {
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hashtags': [],
|
||||
'hops_from_seed': 1,
|
||||
'hops_off': 1,
|
||||
'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
|
||||
'job_id': None,
|
||||
'needs_robots_check': False,
|
||||
'priority': 12,
|
||||
'site_id': site.id,
|
||||
'url': 'http://foo.org/',
|
||||
'via_page_id': seed_page.id,
|
||||
'outlinks': {
|
||||
'accepted': ['http://example.com/blah'],
|
||||
'blocked': [],
|
||||
'rejected': ['http://foo.org/bar'],
|
||||
}
|
||||
}
|
||||
pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
|
||||
assert len(pages) == 5
|
||||
assert {
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hashtags': [],
|
||||
'hops_from_seed': 2,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
|
||||
'job_id': None,
|
||||
'needs_robots_check': False,
|
||||
'priority': 11,
|
||||
'site_id': site.id,
|
||||
'url': 'http://example.com/blah',
|
||||
'via_page_id': foo_page.id
|
||||
} in pages
|
||||
|
||||
|
@ -94,28 +94,28 @@ blocks:
|
||||
'url': 'http://example.com/foo/bar?baz=quux#monkey',
|
||||
'site_id': site.id})
|
||||
|
||||
assert site.is_in_scope('http://example.com/foo/bar', page)
|
||||
assert not site.is_in_scope('http://example.com/foo/baz', page)
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True
|
||||
assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None
|
||||
|
||||
assert not site.is_in_scope('http://foo.com/some.mp3', page)
|
||||
assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)
|
||||
assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None
|
||||
assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True
|
||||
|
||||
assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
|
||||
assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)
|
||||
assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True
|
||||
assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None
|
||||
|
||||
assert site.is_in_scope('https://twitter.com/twit', page)
|
||||
assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
|
||||
assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True
|
||||
assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False
|
||||
|
||||
assert site.is_in_scope('https://www.facebook.com/whatevz', page)
|
||||
assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True
|
||||
|
||||
assert not site.is_in_scope(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None
|
||||
yt_user_page = brozzler.Page(None, {
|
||||
'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
|
||||
'site_id': site.id, 'hops_from_seed': 10})
|
||||
assert site.is_in_scope(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
|
||||
assert site.accept_reject_or_neither(
|
||||
'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True
|
||||
|
||||
def test_proxy_down():
|
||||
'''
|
||||
|
Loading…
x
Reference in New Issue
Block a user