From 914289b414f291a0c8fd10f0d102db75586ad5b3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 19 Mar 2018 14:14:37 -0700 Subject: [PATCH 01/24] WIP documentation! --- job-conf.rst | 280 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 190 insertions(+), 90 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 1174f1a..670b476 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -1,17 +1,19 @@ -brozzler job configuration +Brozzler Job Configuration ************************** -Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. At least one seed url must be specified, +Jobs are defined using yaml files. At least one seed url must be specified, everything else is optional. -an example -========== +.. contents:: + +Example +======= :: id: myjob time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving ignore_robots: false max_claimed_sites: 2 warcprox_meta: @@ -35,7 +37,7 @@ an example scope: surt: http://(org,example, -how inheritance works +How inheritance works ===================== Most of the available options apply to seeds. Such options can also be @@ -79,101 +81,140 @@ Notice that: - Since ``buckets`` is a list, the merged result includes all the values from both the top level and the seed level. -settings reference +Settings reference ================== +Top-level settings +------------------ + ``id`` ------- -+-----------+--------+----------+--------------------------+ -| scope | type | required | default | -+===========+========+==========+==========================+ -| top-level | string | no | *generated by rethinkdb* | -+-----------+--------+----------+--------------------------+ +~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | *generated by rethinkdb* | ++--------+----------+--------------------------+ An arbitrary identifier for this job. 
Must be unique across this deployment of brozzler. -``seeds`` ---------- -+-----------+------------------------+----------+---------+ -| scope | type | required | default | -+===========+========================+==========+=========+ -| top-level | list (of dictionaries) | yes | *n/a* | -+-----------+------------------------+----------+---------+ -List of seeds. Each item in the list is a dictionary (associative array) which -defines the seed. It must specify ``url`` (see below) and can additionally -specify any of the settings of scope *seed-level*. - ``max_claimed_sites`` ---------------------- -+-----------+--------+----------+---------+ -| scope | type | required | default | -+===========+========+==========+=========+ -| top-level | number | no | *none* | -+-----------+--------+----------+---------+ +~~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``seeds`` +~~~~~~~~~ ++------------------------+----------+---------+ +| type | required | default | ++========================+==========+=========+ +| list (of dictionaries) | yes | *n/a* | ++------------------------+----------+---------+ +List of seeds. Each item in the list is a dictionary (associative array) which +defines the seed. It must specify ``url`` (see below) and can additionally +specify any *seed* settings. + +Seed-level-only settings +------------------------ +These settings can be specified only at the seed level, unlike most seed +settings, which can also be specified at the top level. 
+ ``url`` -------- -+------------+--------+----------+---------+ -| scope | type | required | default | -+============+========+==========+=========+ -| seed-level | string | yes | *n/a* | -+------------+--------+----------+---------+ +~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | yes | *n/a* | ++--------+----------+---------+ The seed url. +``username`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``password`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +Seed-level / top-level settings +------------------------------- +These are seed settings that can also be specified at the top level, in which +case they are inherited by all seeds. + ``metadata`` ------------- -+-----------------------+------------+----------+---------+ -| scope | type | required | default | -+=======================+============+==========+=========+ -| seed-level, top-level | dictionary | no | *none* | -+-----------------------+------------+----------+---------+ +~~~~~~~~~~~~ ++------------+----------+---------+ +| type | required | default | ++============+==========+=========+ +| dictionary | no | *none* | ++------------+----------+---------+ Arbitrary information about the crawl job or site. Merely informative, not used by brozzler for anything. Could be of use to some external process. 
``time_limit`` --------------- -+-----------------------+--------+----------+---------+ -| scope | type | required | default | -+=======================+========+==========+=========+ -| seed-level, top-level | number | no | *none* | -+-----------------------+--------+----------+---------+ +~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ Time limit in seconds. If not specified, there no time limit. Time limit is enforced at the seed level. If a time limit is specified at the top level, it is inherited by each seed as described above, and enforced individually on each seed. +``proxy`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +HTTP proxy, with the format ``host:port``. Typically configured to point to +warcprox for archival crawling. + ``ignore_robots`` ------------------ -+-----------------------+---------+----------+-----------+ -| scope | type | required | default | -+=======================+=========+==========+===========+ -| seed-level, top-level | boolean | no | ``false`` | -+-----------------------+---------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ If set to ``true``, brozzler will happily crawl pages that would otherwise be blocked by robots.txt rules. 
``user_agent`` --------------- -+-----------------------+---------+----------+---------+ -| scope | type | required | default | -+=======================+=========+==========+=========+ -| seed-level, top-level | string | no | *none* | -+-----------------------+---------+----------+---------+ +~~~~~~~~~~~~~~ ++---------+----------+---------+ +| type | required | default | ++=========+==========+=========+ +| string | no | *none* | ++---------+----------+---------+ The ``User-Agent`` header brozzler will send to identify itself to web servers. It's good ettiquette to include a project URL with a notice to webmasters that explains why you're crawling, how to block the crawler robots.txt and how to contact the operator if the crawl is causing problems. ``warcprox_meta`` ------------------ -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is configured. The value of the Warcprox-Meta header is a json blob. It is used to pass settings and information to warcprox. 
Warcprox does not forward the header @@ -195,36 +236,95 @@ becomes:: Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}} ``scope`` ---------- -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ +~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ Scope rules. *TODO* +Scoping +======= + +*TODO* explanation of scoping and scope rules + +Scope settings +-------------- + ``surt`` --------- -+-------------+--------+----------+---------------------------+ -| scope | type | required | default | -+=============+========+==========+===========================+ -| scope-level | string | no | *generated from seed url* | -+-------------+--------+----------+---------------------------+ +~~~~~~~~ ++--------+----------+---------------------------+ +| type | required | default | ++========+==========+===========================+ +| string | no | *generated from seed url* | ++--------+----------+---------------------------+ ``accepts`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ ++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ ``blocks`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ 
++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ +Scope rule settings +------------------- + +``domain`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``substring`` +~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``regex`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``ssurt`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``surt`` +~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + +``parent_url_regex`` +~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ + From 6df2c1cf2212362c39c8437d42471c6d1fa133f0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 19 Mar 2018 16:54:17 -0700 Subject: [PATCH 02/24] WIP some explanation of automatic login --- job-conf.rst | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 670b476..e5f79db 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -40,12 +40,11 @@ Example How inheritance works ===================== -Most of the available options apply to seeds. Such options can also be -specified at the top level, in which case the seeds inherit the options. 
If -an option is specified both at the top level and at the level of an individual -seed, the results are merged with the seed-level value taking precedence in -case of conflicts. It's probably easiest to make sense of this by way of an -example. +Most of the settings that apply to seeds can also be specified at the top +level, in which case all seeds inherit those settings. If an option is +specified both at the top level and at the level of an individual seed, the +results are merged with the seed-level value taking precedence in case of +conflicts. It's probably easiest to make sense of this by way of an example. In the example yaml above, ``warcprox_meta`` is specified at the top level and at the seed level for the seed http://one.example.org/. At the top level we @@ -117,7 +116,7 @@ seeds starving out other jobs. +------------------------+----------+---------+ List of seeds. Each item in the list is a dictionary (associative array) which defines the seed. It must specify ``url`` (see below) and can additionally -specify any *seed* settings. +specify any seed settings. Seed-level-only settings ------------------------ @@ -131,7 +130,7 @@ settings, which can also be specified at the top level. +========+==========+=========+ | string | yes | *n/a* | +--------+----------+---------+ -The seed url. +The seed url. Crawling starts here. ``username`` ~~~~~~~~~~~~ @@ -140,6 +139,8 @@ The seed url. +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +If set, used to populate automatically detected login forms. See explanation at +"password" below. ``password`` ~~~~~~~~~~~~ @@ -148,6 +149,14 @@ The seed url. +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +If set, used to populate automatically detected login forms. If ``username`` +and ``password`` are configured for a seed, brozzler will look for a login form +on each page it crawls for that seed. 
A form that has a single text or email +field (the username), a single password field (``<input type="password">``), +and has ``method="POST"`` is considered to be a login form. The form may have +other fields like checkboxes and hidden fields. For these, brozzler will leave +the default values in place. Login form detection and submission happen after +page load, then brozzling proceeds as usual. Seed-level / top-level settings ------------------------------- From 88214236bb1642de355b5730dc013ce192b83db5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 19 Mar 2018 17:23:49 -0700 Subject: [PATCH 03/24] WIP starting to flesh out "scoping" section --- job-conf.rst | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index e5f79db..756c232 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -251,12 +251,26 @@ becomes:: +============+==========+===========+ | dictionary | no | ``false`` | +------------+----------+-----------+ -Scope rules. *TODO* +Scope specification for the seed. See the "Scoping" section which follows. Scoping ======= -*TODO* explanation of scoping and scope rules +The scope of a seed determines which links are scheduled for crawling and which +are not. 
Example:: + + scope: + accepts: + - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ + regex: ^https?://(www\.)?youtube.com/watch\?.*$ + - surt: +http://(com,google,video, + - surt: +http://(com,googlevideo, + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + max_hops: 20 + max_hops_off_surt: 0 Scope settings -------------- @@ -285,6 +299,21 @@ Scope settings | list | no | *none* | +------+----------+---------+ +``max_hops`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ + +``max_hops_off_surt`` +~~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | 0 | ++--------+----------+---------+ Scope rule settings ------------------- From 98ce67ef363c99b5e990d435518b477dc0fe0ca4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 20 Mar 2018 17:31:42 -0700 Subject: [PATCH 04/24] WIP some words on scoping --- job-conf.rst | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 756c232..4a0dbf5 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -80,8 +80,8 @@ Notice that: - Since ``buckets`` is a list, the merged result includes all the values from both the top level and the seed level. -Settings reference -================== +Settings +======== Top-level settings ------------------ @@ -260,6 +260,7 @@ The scope of a seed determines which links are scheduled for crawling and which are not. Example:: scope: + surt: https://(com,example,)/ accepts: - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ regex: ^https?://(www\.)?youtube.com/watch\?.*$ @@ -272,6 +273,12 @@ are not. 
Example:: max_hops: 20 max_hops_off_surt: 0 +Toward the end of the process of brozzling a page, brozzler obtains a list of +navigational links (``<a href="...">`` and similar) on the page, and evaluates +each link to determine whether it is in scope or out of scope for the crawl. +Then, newly discovered links that are in scope are scheduled to be crawled, and +previously discovered links get a priority bump. + Scope settings -------------- @@ -282,6 +289,47 @@ Scope settings +========+==========+===========================+ | string | no | *generated from seed url* | +--------+----------+---------------------------+ +This setting can be thought of as the fundamental scope setting for the seed. +Every seed has a ``scope.surt``. Brozzler will generate it from the seed url if +it is not specified explicitly. + +SURT is defined at +http://crawler.archive.org/articles/user_manual/glossary.html#surt. + + SURT stands for Sort-friendly URI Reordering Transform, and is a + transformation applied to URIs which makes their left-to-right + representation better match the natural hierarchy of domain names. + +Brozzler generates ``surt`` if not specified by canonicalizing the seed url +using the `urlcanon <https://github.com/iipc/urlcanon>`_ library's "semantic" +canonicalizer, then removing the query string if any, and finally serializing +the result in SURT form. For example, a seed url of +``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes +``https://(com,example,www,)/foo/bar``. + +If the url in the browser location bar at the end of brozzling the seed page +differs from the seed url, brozzler automatically adds an "accept" rule to +ensure the site is in scope, as if the new url were the original seed url. +It does this so that, for example, if ``http://example.com/`` redirects to +``http://www.example.com/``, the rest of the ``www.example.com`` will also be +in scope. 
+ +Brozzler derives its general approach to the seed surt from Heritrix, but +differs in a few respects. + +1. 
Unlike heritrix, brozzler does not strip the path segment after the last + slash. +2. Canonicalization does not attempt to match heritrix exactly, though it + usually will match. +3. When generating a SURT for an https url, heritrix changes the scheme to + http. For example, the heritrix surt for ``https://www.example.com/`` is + ``http://(com,example,www,)`` and this means that all of + ``http://www.example.com/*`` and ``https://www.example.com/*`` will be in + scope. It also means that a manually specified surt with scheme https will + not match anything. Brozzler does no scheme munging. +4. Brozzler identifies seed "redirects" by retrieving the url from the + browser's location bar at the end of brozzling the seed page, whereas + heritrix follows http redirects. ``accepts`` ~~~~~~~~~~~ @@ -290,6 +338,7 @@ Scope settings +======+==========+=========+ | list | no | *none* | +------+----------+---------+ +List of scope rules. ``blocks`` ~~~~~~~~~~~ From f26712ce93ff63592891c8ece3bee21798515084 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 20 Mar 2018 17:32:01 -0700 Subject: [PATCH 05/24] WIP add an accept rule instead of modifying surt in place for seed redirects --- brozzler/model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 9c1a60f..96c4ca7 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -203,10 +203,12 @@ class Site(doublethink.Document, ElapsedMixIn): def note_seed_redirect(self, url): new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii") + if not "accepts" in self.scope: + self.scope["accepts"] = [] if not new_scope_surt.startswith(self.scope["surt"]): - self.logger.info("changing site scope surt from {} to {}".format( - self.scope["surt"], new_scope_surt)) - self.scope["surt"] = new_scope_surt + self.logger.info( + "adding surt %s to scope accept rules", new_scope_surt) + self.scope.accepts.append({"surt": new_scope_surt}) def extra_headers(self): 
hdrs = {} From 245e27a21aad9c3b95873ac414de522a1ca2e7c4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 13:33:34 -0700 Subject: [PATCH 06/24] tests for new approach without of scope['surt'] replaced by an accept rule (two rules in some cases of seed redirects) --- tests/test_frontier.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 9b80a4f..669f0ec 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -73,9 +73,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'http://(com,example,)/' - }, + 'scope': {'accepts': [{'ssurt': b'com,example,//http:/'}]}, 'seed': 'http://example.com', 'starts_and_stops': [ { @@ -91,9 +89,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'https://(org,example,)/', - }, + 'scope': {'accepts': [{'ssurt': b'org,example,//https:/'}]}, 'seed': 'https://example.org/', 'starts_and_stops': [ { @@ -443,8 +439,7 @@ def test_field_defaults(): brozzler.Site.table_ensure(rr) site = brozzler.Site(rr, {'seed': 'http://example.com/'}) assert site.id is None - assert site.scope - assert site.scope['surt'] == 'http://(com,example,)/' + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/'}]} site.save() assert site.id assert site.scope @@ -638,11 +633,15 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/b/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': b'com,example,//http:/a/'}, + {'ssurt': b'com,example,//http:/b/'}]} site.refresh() - 
assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': b'com,example,//http:/a/'}, + {'ssurt': b'com,example,//http:/b/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -661,11 +660,11 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/a/x/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -683,11 +682,11 @@ def test_completed_page(): 'hops_from_seed': 1, 'redirect_url':'http://example.com/d/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -727,7 +726,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': b'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -738,7 +737,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 
'http://example.org/#hash'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': b'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 From 526a4d718fb0d7cca2c9152a992d97f58226a8d3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 13:39:29 -0700 Subject: [PATCH 07/24] tests for new approach without scope['surt'] replaced by an accept rule (two rules in some cases of seed redirects) --- brozzler/frontier.py | 89 +++++++++++++++++++++++--------------------- brozzler/model.py | 89 ++++++++++++++++++++++++-------------------- 2 files changed, 96 insertions(+), 82 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index a0f8ab4..5276f72 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -291,75 +291,80 @@ class RethinkDbFrontier: {"start":doublethink.utcnow(), "stop":None}) site.save() + def _build_fresh_page(self, site, parent_page, url, hops_off=0): + url_for_scoping = urlcanon.semantic(url) + url_for_crawling = urlcanon.whatwg(url) + hashtag = (url_for_crawling.hash_sign + + url_for_crawling.fragment).decode('utf-8') + urlcanon.canon.remove_fragment(url_for_crawling) + page = brozzler.Page(self.rr, { + 'url': str(url_for_crawling), + 'site_id': site.id, + 'job_id': site.job_id, + 'hops_from_seed': parent_page.hops_from_seed + 1, + 'via_page_id': parent_page.id, + 'hops_off_surt': hops_off, + 'hashtags': [hashtag] if hashtag else []}) + return page + + def _merge_page(self, existing_page, fresh_page): + ''' + Utility method for merging info from `brozzler.Page` instances + representing the same url but with possibly different metadata. 
+ ''' + existing_page.priority += fresh_page.priority + existing_page.hashtags = list(set( + existing_page.hashtags + fresh_page.hashtags)) + existing_page.hops_off_surt = min( + existing_page.hops_off_surt, fresh_page.hops_off_surt) + def _scope_and_enforce_robots(self, site, parent_page, outlinks): ''' Returns tuple ( - set of in scope urls (uncanonicalized) accepted by robots policy, + dict of {page_id: Page} of fresh `brozzler.Page` representing in + scope links accepted by robots policy, set of in scope urls (canonicalized) blocked by robots policy, set of out-of-scope urls (canonicalized)). ''' - in_scope = set() + pages = {} # {page_id: Page, ...} blocked = set() out_of_scope = set() for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) - urlcanon.canon.remove_fragment(url_for_crawling) - if site.is_in_scope(url_for_scoping, parent_page=parent_page): + decision = site.accept_reject_or_neither( + url_for_scoping, parent_page=parent_page) + if decision is True: + hops_off = 0 + elif decision is None: + decision = parent_page.hops_off_surt < site.scope.get( + 'max_hops_off_surt', 0) + hops_off = parent_page.hops_off_surt + 1 + if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): - in_scope.add(url) + fresh_page = self._build_fresh_page( + site, parent_page, url, hops_off) + if fresh_page.id in pages: + self._merge_page(pages[fresh_page.id], fresh_page) + else: + pages[fresh_page.id] = fresh_page else: blocked.add(str(url_for_crawling)) else: out_of_scope.add(str(url_for_crawling)) - return in_scope, blocked, out_of_scope - - def _build_fresh_pages(self, site, parent_page, urls): - ''' - Returns a dict of page_id => brozzler.Page. 
- ''' - pages = {} - for url in urls: - url_for_scoping = urlcanon.semantic(url) - url_for_crawling = urlcanon.whatwg(url) - hashtag = (url_for_crawling.hash_sign - + url_for_crawling.fragment).decode('utf-8') - urlcanon.canon.remove_fragment(url_for_crawling) - if not url_for_scoping.surt().startswith( - site.scope['surt'].encode('utf-8')): - hops_off_surt = parent_page.hops_off_surt + 1 - else: - hops_off_surt = 0 - page = brozzler.Page(self.rr, { - 'url': str(url_for_crawling), - 'site_id': site.id, - 'job_id': site.job_id, - 'hops_from_seed': parent_page.hops_from_seed + 1, - 'via_page_id': parent_page.id, - 'hops_off_surt': hops_off_surt, - 'hashtags': []}) - if page.id in pages: - pages[page.id].priority += page.priority - page = pages[page.id] - else: - pages[page.id] = page - if hashtag: - page.hashtags = list(set(page.hashtags + [hashtag])) - return pages + return pages, blocked, out_of_scope def scope_and_schedule_outlinks(self, site, parent_page, outlinks): decisions = {'accepted':set(),'blocked':set(),'rejected':set()} counts = {'added':0,'updated':0,'rejected':0,'blocked':0} - in_scope, blocked, out_of_scope = self._scope_and_enforce_robots( + fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( site, parent_page, outlinks) decisions['blocked'] = blocked decisions['rejected'] = out_of_scope counts['blocked'] += len(blocked) counts['rejected'] += len(out_of_scope) - fresh_pages = self._build_fresh_pages(site, parent_page, in_scope) - # get existing pages from rethinkdb results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} diff --git a/brozzler/model.py b/brozzler/model.py index 96c4ca7..6b35bc2 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -183,9 +183,9 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} - if not "surt" in self.scope and self.seed: - self.scope["surt"] 
= brozzler.site_surt_canon( - self.seed).surt().decode('ascii') + if self.seed: + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(self.seed).ssurt()) if not "starts_and_stops" in self: if self.get("start_time"): # backward compatibility @@ -201,14 +201,20 @@ class Site(doublethink.Document, ElapsedMixIn): def __str__(self): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) - def note_seed_redirect(self, url): - new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii") + def _accept_ssurt_if_not_redundant(self, ssurt): if not "accepts" in self.scope: self.scope["accepts"] = [] - if not new_scope_surt.startswith(self.scope["surt"]): + simple_rule_ssurts = ( + rule["ssurt"] for rule in self.scope["accepts"] + if set(rule.keys()) == {'ssurt'}) + if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): self.logger.info( - "adding surt %s to scope accept rules", new_scope_surt) - self.scope.accepts.append({"surt": new_scope_surt}) + "adding ssurt %s to scope accept rules", ssurt) + self.scope["accepts"].append({"ssurt": ssurt}) + + def note_seed_redirect(self, url): + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(url).ssurt()) def extra_headers(self): hdrs = {} @@ -217,9 +223,20 @@ class Site(doublethink.Document, ElapsedMixIn): self.warcprox_meta, separators=(',', ':')) return hdrs - def is_in_scope(self, url, parent_page=None): + def accept_reject_or_neither(self, url, parent_page=None): + ''' + Returns `True` (accepted), `False` (rejected), or `None` (no decision). + + `None` usually means rejected, unless `max_hops_off` comes into play. + ''' if not isinstance(url, urlcanon.ParsedUrl): url = urlcanon.semantic(url) + + if not url.scheme in (b'http', b'https'): + # XXX doesn't belong here maybe (where? worker ignores unknown + # schemes?) 
+ return False + try_parent_urls = [] if parent_page: try_parent_urls.append(urlcanon.semantic(parent_page.url)) @@ -227,44 +244,36 @@ class Site(doublethink.Document, ElapsedMixIn): try_parent_urls.append( urlcanon.semantic(parent_page.redirect_url)) - might_accept = False - if not url.scheme in (b'http', b'https'): - # XXX doesn't belong here maybe (where? worker ignores unknown - # schemes?) - return False - elif (parent_page and "max_hops" in self.scope + # enforce max_hops + if (parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]): - pass - elif url.surt().startswith(self.scope["surt"].encode("utf-8")): - might_accept = True - elif parent_page and parent_page.hops_off_surt < self.scope.get( - "max_hops_off_surt", 0): - might_accept = True - elif "accepts" in self.scope: - for accept_rule in self.scope["accepts"]: - rule = urlcanon.MatchRule(**accept_rule) + return False + + # enforce reject rules + if "blocks" in self.scope: + for block_rule in self.scope["blocks"]: + rule = urlcanon.MatchRule(**block_rule) if try_parent_urls: for parent_url in try_parent_urls: if rule.applies(url, parent_url): - might_accept = True + return False else: if rule.applies(url): - might_accept = True + return False - if might_accept: - if "blocks" in self.scope: - for block_rule in self.scope["blocks"]: - rule = urlcanon.MatchRule(**block_rule) - if try_parent_urls: - for parent_url in try_parent_urls: - if rule.applies(url, parent_url): - return False - else: - if rule.applies(url): - return False - return True - else: - return False + # honor accept rules + for accept_rule in self.scope["accepts"]: + rule = urlcanon.MatchRule(**accept_rule) + if try_parent_urls: + for parent_url in try_parent_urls: + if rule.applies(url, parent_url): + return True + else: + if rule.applies(url): + return True + + # no decision if we reach here + return None class Page(doublethink.Document): logger = logging.getLogger(__module__ + "." 
+ __qualname__) From 60f2b99cc0c7eb0d6593d59e1d85487e8b030c73 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:06:46 -0700 Subject: [PATCH 08/24] doublethink had a bug fix --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d5527c4..e6ce051 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2017 Internet Archive +Copyright (C) 2014-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -70,7 +70,7 @@ setuptools.setup( 'websocket-client!=0.39.0', 'pillow==3.3.0', 'urlcanon>=0.1.dev16', - 'doublethink>=0.2.0.dev81', + 'doublethink>=0.2.0.dev88', 'rethinkdb>=2.3,<2.4', 'cerberus==1.0.1', 'jinja2', From b83d3cb9df08099952c83ccd66cd85c7670c64c4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:07:52 -0700 Subject: [PATCH 09/24] rename page.hops_off_surt to page.hops_off --- brozzler/frontier.py | 8 ++++---- brozzler/model.py | 10 +++++++--- tests/test_frontier.py | 6 +++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 5276f72..b9785bc 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -315,8 +315,8 @@ class RethinkDbFrontier: existing_page.priority += fresh_page.priority existing_page.hashtags = list(set( existing_page.hashtags + fresh_page.hashtags)) - existing_page.hops_off_surt = min( - existing_page.hops_off_surt, fresh_page.hops_off_surt) + existing_page.hops_off = min( + existing_page.hops_off, fresh_page.hops_off) def _scope_and_enforce_robots(self, site, parent_page, outlinks): ''' @@ -337,9 +337,9 @@ class RethinkDbFrontier: if decision is True: hops_off = 0 elif decision is None: - decision = parent_page.hops_off_surt < site.scope.get( + decision = parent_page.hops_off < site.scope.get( 'max_hops_off_surt', 0) - hops_off = 
parent_page.hops_off_surt + 1 + hops_off = parent_page.hops_off + 1 if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): fresh_page = self._build_fresh_page( diff --git a/brozzler/model.py b/brozzler/model.py index 6b35bc2..9c6f482 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -99,7 +99,7 @@ def new_job(frontier, job_conf): def new_site(frontier, site): site.id = str(uuid.uuid4()) - logging.info("new site {}".format(site)) + logging.info("new site %s", site) # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished @@ -291,8 +291,12 @@ class Page(doublethink.Document): self.brozzle_count = 0 if not "claimed" in self: self.claimed = False - if not "hops_off_surt" in self: - self.hops_off_surt = 0 + if "hops_off_surt" in self and not "hops_off" in self: + self.hops_off = self.hops_off_surt + if "hops_off_surt" in self: + del self["hops_off_surt"] + if not "hops_off" in self: + self.hops_off = 0 if not "needs_robots_check" in self: self.needs_robots_check = False if not "priority" in self: diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 669f0ec..97c4e83 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -106,7 +106,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), 'job_id': job.id, 'needs_robots_check': True, @@ -120,7 +120,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), 'job_id': job.id, 'needs_robots_check': True, @@ -907,7 +907,7 @@ def test_choose_warcprox(): svcreg = doublethink.ServiceRegistry(rr) frontier = brozzler.RethinkDbFrontier(rr) - # avoid this of error: 
https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 + # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 rr.table('sites').wait().run() rr.table('services').wait().run() rr.table('sites').index_wait().run() From 5ebd2fb7090bf63676feb5765221f1f4d6cf40ca Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:09:32 -0700 Subject: [PATCH 10/24] new test of max_hops_off --- tests/test_frontier.py | 119 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 97c4e83..c075edb 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -977,3 +977,122 @@ def test_choose_warcprox(): # clean up rr.table('sites').delete().run() rr.table('services').delete().run() + +def test_max_hops_off(): + rr = doublethink.Rethinker('localhost', db='ignoreme') + frontier = brozzler.RethinkDbFrontier(rr) + site = brozzler.Site(rr, { + 'seed': 'http://example.com/', + 'scope': { + 'max_hops_off_surt': 1, + 'blocks': [{'ssurt': b'domain,bad,'}]}}) + brozzler.new_site(frontier, site) + site.refresh() # get it back from the db + + seed_page = frontier.seed_page(site.id) + + assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None + assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None + assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True + assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False + + # two of these are in scope because of max_hops_off + frontier.scope_and_schedule_outlinks(site, seed_page, [ + 'http://foo.org/', 'https://example.com/toot', + 'http://example.com/toot', 'https://some.bad.domain/something']) + + pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + + assert len(pages) == 4 + assert pages[0].url == 'http://example.com/' + assert pages[0].hops_off == 0 + assert not 
'hops_off_surt' in pages[0] + assert set(pages[0].outlinks['accepted']) == { + 'https://example.com/toot', 'http://foo.org/', + 'http://example.com/toot'} + assert pages[0].outlinks['blocked'] == [] + assert pages[0].outlinks['rejected'] == [ + 'https://some.bad.domain/something'] + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://example.com/toot', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'https://example.com/toot', + 'via_page_id': seed_page.id + } in pages + + # next hop is past max_hops_off, but normal in scope url is in scope + foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0] + frontier.scope_and_schedule_outlinks(site, foo_page, [ + 'http://foo.org/bar', 'http://example.com/blah']) + assert foo_page == { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id, + 'outlinks': { + 'accepted': ['http://example.com/blah'], + 'blocked': [], + 'rejected': 
['http://foo.org/bar'], + } + } + pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + assert len(pages) == 5 + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 2, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 11, + 'site_id': site.id, + 'url': 'http://example.com/blah', + 'via_page_id': foo_page.id + } in pages + From 85a475752798db82c3b76a8b59cca4543bb95000 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:18:36 -0700 Subject: [PATCH 11/24] s/max_hops_off_surt/max_hops_off/ --- brozzler/frontier.py | 2 +- brozzler/job_schema.yaml | 2 +- brozzler/model.py | 5 +++++ job-conf.rst | 4 ++-- tests/test_frontier.py | 4 ++++ 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index b9785bc..2e076d3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -338,7 +338,7 @@ class RethinkDbFrontier: hops_off = 0 elif decision is None: decision = parent_page.hops_off < site.scope.get( - 'max_hops_off_surt', 0) + 'max_hops_off', 0) hops_off = parent_page.hops_off + 1 if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 14445bc..6069de8 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -65,7 +65,7 @@ id: max_hops: type: integer - max_hops_off_surt: + max_hops_off: type: integer metadata: diff --git a/brozzler/model.py b/brozzler/model.py index 9c6f482..5e787dc 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -183,6 +183,11 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} + if ("max_hops_off_surt" in self.scope + and not "max_hops_off" in self.scope): + self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] + if 
"max_hops_off_surt" in self.scope: + del self.scope["max_hops_off_surt"] if self.seed: self._accept_ssurt_if_not_redundant( brozzler.site_surt_canon(self.seed).ssurt()) diff --git a/job-conf.rst b/job-conf.rst index 4a0dbf5..22c6992 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -271,7 +271,7 @@ are not. Example:: substring: wp-login.php?action=logout - domain: malware.us max_hops: 20 - max_hops_off_surt: 0 + max_hops_off: 0 Toward the end of the process of brozzling a page, brozzler obtains a list of navigational links (```` and similar) on the page, and evaluates @@ -356,7 +356,7 @@ List of scope rules. | number | no | *none* | +--------+----------+---------+ -``max_hops_off_surt`` +``max_hops_off`` ~~~~~~~~~~~~~~~~~~~~~ +--------+----------+---------+ | type | required | default | diff --git a/tests/test_frontier.py b/tests/test_frontier.py index c075edb..4906919 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -989,6 +989,10 @@ def test_max_hops_off(): brozzler.new_site(frontier, site) site.refresh() # get it back from the db + # renamed this param + assert not 'max_hops_off_surt' in site.scope + assert site.scope['max_hops_off'] == 1 + seed_page = frontier.seed_page(site.id) assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None From 05f8ab3495268d55041fc3909ad516561fb68ea8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 23 Mar 2018 10:43:08 -0700 Subject: [PATCH 12/24] fix more tests for new approach sans scope['surt'] --- tests/test_cluster.py | 9 ++++++--- tests/test_units.py | 28 ++++++++++++++-------------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 2559e07..d8066b9 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -448,13 +448,14 @@ def test_login(httpd): assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url def test_seed_redirect(httpd): - test_id = 
'test_login-%s' % datetime.datetime.utcnow().isoformat() + test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port + assert site.scope == { + 'accepts': [{'ssurt': b'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -478,7 +479,9 @@ def test_seed_redirect(httpd): assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port # check that scope has been updated properly - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port + assert site.scope == {'accepts': [ + {'ssurt': b'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, + {'ssurt': b'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() diff --git a/tests/test_units.py b/tests/test_units.py index ce5067c..eed034e 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -94,28 +94,28 @@ blocks: 'url': 'http://example.com/foo/bar?baz=quux#monkey', 'site_id': site.id}) - assert site.is_in_scope('http://example.com/foo/bar', page) - assert not site.is_in_scope('http://example.com/foo/baz', page) + assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True + assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None - assert not site.is_in_scope('http://foo.com/some.mp3', page) - assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page) + assert 
site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None + assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True - assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page) - assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page) + assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True + assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None - assert site.is_in_scope('https://twitter.com/twit', page) - assert site.is_in_scope('https://twitter.com/twit?lang=en', page) - assert not site.is_in_scope('https://twitter.com/twit?lang=es', page) + assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False - assert site.is_in_scope('https://www.facebook.com/whatevz', page) + assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True - assert not site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None yt_user_page = brozzler.Page(None, { 'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO', 'site_id': site.id, 'hops_from_seed': 10}) - assert site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True def test_proxy_down(): ''' From fc05cac338e62c81824efb4737c0db1fabb80cc2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 23 Mar 2018 11:43:14 -0700 Subject: [PATCH 13/24] ok seriously tests --- tests/test_cluster.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index d8066b9..41afbcb 
100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -454,8 +454,7 @@ def test_seed_redirect(httpd): site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope == { - 'accepts': [{'ssurt': b'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} + assert site.scope == {'accepts': [{'ssurt': ('localhost,//%s:http:/site5/redirect/' % httpd.server_port).encode('ascii')}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -480,8 +479,8 @@ def test_seed_redirect(httpd): # check that scope has been updated properly assert site.scope == {'accepts': [ - {'ssurt': b'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, - {'ssurt': b'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} + {'ssurt': ('localhost,//%s:http:/site5/redirect/' % httpd.server_port).encode('ascii')}, + {'ssurt': ('localhost,//%s:http:/site5/destination/' % httpd.server_port).encode('ascii')}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() From 2cf474aa1d25edb1cd9dda03913430343cb304a9 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 14 May 2018 16:59:55 -0700 Subject: [PATCH 14/24] update docs to match new seed ssurt behavior --- job-conf.rst | 65 ++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 22c6992..f2752c6 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -260,12 +260,12 @@ The scope of a seed determines which links are scheduled for crawling and which are not. 
Example:: scope: - surt: https://(com,example,)/ accepts: - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ regex: ^https?://(www\.)?youtube.com/watch\?.*$ - - surt: +http://(com,google,video, - - surt: +http://(com,googlevideo, + - ssurt: com,example,//https:/ + - surt: http://(com,google,video, + - surt: http://(com,googlevideo, blocks: - domain: youngscholars.unimelb.edu.au substring: wp-login.php?action=logout @@ -279,40 +279,33 @@ each link to determine whether it is in scope or out of scope for the crawl. Then, newly discovered links that are in scope are scheduled to be crawled, and previously discovered links get a priority bump. -Scope settings --------------- +How scope rules are applied +--------------------------- +1. If any ``block`` rule matches, the url is out of scope. +2. Otherwise, if any ``accept`` rule matches, the url is in scope. +3. Otherwise (no rules match), the url is out of scope. -``surt`` -~~~~~~~~ -+--------+----------+---------------------------+ -| type | required | default | -+========+==========+===========================+ -| string | no | *generated from seed url* | -+--------+----------+---------------------------+ -This setting can be thought of as the fundamental scope setting for the seed. -Every seed has a ``scope.surt``. Brozzler will generate it from the seed url if -it is not specified explicitly. +In other words, by default urls are not in scope, and ``block`` rules take +precedence over ``accept`` rules. -SURT is defined at -http://crawler.archive.org/articles/user_manual/glossary.html#surt. +Automatic scoping based on seed urls +------------------------------------ +Brozzler usually generates an ``accept`` scope rule based on the seed url. It +does this to fulfill a crawl operator's expectation that everything "under" the +seed will be crawled. 
- SURT stands for Sort-friendly URI Reordering Transform, and is a - transformation applied to URIs which makes their left-to-right - representation better match the natural hierarchy of domain names. - -Brozzler generates ``surt`` if not specified by canonicalizing the seed url -using the `urlcanon `_ library's "semantic" -canonicalizer, then removing the query string if any, and finally serializing -the result in SURT form. For example, a seed url of +To generate the rule, brozzler canonicalizes the seed url using the `urlcanon +`_ library's "semantic" canonicalizer, then +removing the query string if any, and finally serializing the result in SURT +form. For example, a seed url of ``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes -``https://(com,example,www,)/foo/bar``. +``com,example,www,//https:/foo/bar?a=b&c=d``. If the url in the browser location bar at the end of brozzling the seed page -differs from the seed url, brozzler automatically adds an "accept" rule to -ensure the site is in scope, as if the new url were the original seed url. +differs from the seed url, brozzler automatically adds a second "accept" rule +to ensure the site is in scope, as if the new url were the original seed url. It does this so that, for example, if ``http://example.com/`` redirects to -``http://www.example.com/``, the rest of the ``www.example.com`` will also be -in scope. +``http://www.example.com/``, the rest of the ``www.example.com`` is in scope. Brozzler derives its general approach to the seed surt from Heritrix, but differs in a few respects. @@ -320,16 +313,22 @@ differs in a few respects. 1. Unlike heritrix, brozzler does not strip the path segment after the last slash. 2. Canonicalization does not attempt to match heritrix exactly, though it - usually will match. + usually does match. 3. When generating a SURT for an https url, heritrix changes the scheme to http. 
For example, the heritrix surt for ``https://www.example.com/`` is ``http://(com,example,www,)`` and this means that all of - ``http://www.example.com/*`` and ``https://www.example.com/*`` will be in - scope. It also means that a manually specified surt with scheme https will + ``http://www.example.com/*`` and ``https://www.example.com/*`` are in + scope. It also means that a manually specified surt with scheme "https" does not match anything. Brozzler does no scheme munging. 4. Brozzler identifies seed "redirects" by retrieving the url from the browser's location bar at the end of brozzling the seed page, whereas heritrix follows http redirects. +5. Brozzler uses ssurt instead of surt. +6. There is currently no brozzler option to disable the automatic ``accept`` + surt(s). + +Scope settings +-------------- ``accepts`` ~~~~~~~~~~~ From a327cb626fe66cdc5c6880e0eada2b2f9bc8feaa Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 14 May 2018 17:31:45 -0700 Subject: [PATCH 15/24] more explication of scoping --- job-conf.rst | 52 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index f2752c6..4df44eb 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -261,9 +261,9 @@ are not. Example:: scope: accepts: + - ssurt: com,example,//https:/ - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ regex: ^https?://(www\.)?youtube.com/watch\?.*$ - - ssurt: com,example,//https:/ - surt: http://(com,google,video, - surt: http://(com,googlevideo, blocks: @@ -279,14 +279,43 @@ each link to determine whether it is in scope or out of scope for the crawl. Then, newly discovered links that are in scope are scheduled to be crawled, and previously discovered links get a priority bump. -How scope rules are applied ---------------------------- -1. If any ``block`` rule matches, the url is out of scope. -2. Otherwise, if any ``accept`` rule matches, the url is in scope. -3. 
Otherwise (no rules match), the url is out of scope. +Applying scope rules +-------------------- -In other words, by default urls are not in scope, and ``block`` rules take -precedence over ``accept`` rules. +Each scope rule has one or more conditions. If all of the conditions match, +then the scope rule as a whole matches. For example:: + + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + +This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or +a subdomain, and the string "wp-login.php?action=logout" is found somewhere in +the url. + +Brozzler applies these logical steps to decide whether a page url is in or out +of scope: + +1. If the number of hops from seed is greater than ``max_hops``, the url is + **out of scope**. +2. Otherwise, if any ``block`` rule matches, the url is **out of scope**. +3. Otherwise, if any ``accept`` rule matches, the url is **in scope**. +4. Otherwise, if the url is at most ``max_hops_off`` hops from the last page + that was in scope thanks to an ``accept`` rule, the url is **in scope**. +5. Otherwise (no rules match), the url is **out of scope**. + +Notably, ``block`` rules take precedence over ``accept`` rules. + +It may also be helpful to think about a list of scope rules as a boolean +expression. For example:: + + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + +means block the url IF:: + + (domain: youngscholars.unimelb.edu.au AND substring: wp-login.php?action=logout) OR domain: malware.us Automatic scoping based on seed urls ------------------------------------ @@ -346,6 +375,7 @@ List of scope rules. +======+==========+=========+ | list | no | *none* | +------+----------+---------+ +List of scope rules. ``max_hops`` ~~~~~~~~~~~~ @@ -356,15 +386,15 @@ List of scope rules. 
+--------+----------+---------+ ``max_hops_off`` -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~ +--------+----------+---------+ | type | required | default | +========+==========+=========+ | number | no | 0 | +--------+----------+---------+ -Scope rule settings -------------------- +Scope rule conditions +--------------------- ``domain`` ~~~~~~~~~ From de1f240e25f52be8449a5c6082d34a6d7f2dd26b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 11:01:09 -0700 Subject: [PATCH 16/24] describe scope rule conditions plus a bunch of tweaks and fixes --- job-conf.rst | 61 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 4df44eb..1fa5bc6 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -42,9 +42,9 @@ How inheritance works Most of the settings that apply to seeds can also be specified at the top level, in which case all seeds inherit those settings. If an option is -specified both at the top level and at the level of an individual seed, the -results are merged with the seed-level value taking precedence in case of -conflicts. It's probably easiest to make sense of this by way of an example. +specified both at the top level and at seed level, the results are merged with +the seed-level value taking precedence in case of conflicts. It's probably +easiest to make sense of this by way of an example. In the example yaml above, ``warcprox_meta`` is specified at the top level and at the seed level for the seed http://one.example.org/. At the top level we @@ -155,8 +155,8 @@ on each page it crawls for that seed. A form that has a single text or email field (the username), a single password field (````), and has ``method="POST"`` is considered to be a login form. The form may have other fields like checkboxes and hidden fields. For these, brozzler will leave -the default values in place. 
Login form detection and submission happen after -page load, then brozzling proceeds as usual. +the default values in place. Brozzler submits login forms after page load. +Then brozzling proceeds as usual. Seed-level / top-level settings ------------------------------- @@ -168,7 +168,7 @@ case they are inherited by all seeds. +------------+----------+---------+ | type | required | default | +============+==========+=========+ -| dictionary | no | *none* | +| dictionary | no | *none* | +------------+----------+---------+ Arbitrary information about the crawl job or site. Merely informative, not used by brozzler for anything. Could be of use to some external process. @@ -180,7 +180,7 @@ by brozzler for anything. Could be of use to some external process. +========+==========+=========+ | number | no | *none* | +--------+----------+---------+ -Time limit in seconds. If not specified, there no time limit. Time limit is +Time limit in seconds. If not specified, there is no time limit. Time limit is enforced at the seed level. If a time limit is specified at the top level, it is inherited by each seed as described above, and enforced individually on each seed. @@ -279,8 +279,8 @@ each link to determine whether it is in scope or out of scope for the crawl. Then, newly discovered links that are in scope are scheduled to be crawled, and previously discovered links get a priority bump. -Applying scope rules --------------------- +How brozzler applies scope rules +-------------------------------- Each scope rule has one or more conditions. If all of the conditions match, then the scope rule as a whole matches. For example:: @@ -292,8 +292,8 @@ This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or a subdomain, and the string "wp-login.php?action=logout" is found somewhere in the url. 
-Brozzler applies these logical steps to decide whether a page url is in or out -of scope: +Brozzler applies these logical steps to decide whether a url is in or out of +scope: 1. If the number of hops from seed is greater than ``max_hops``, the url is **out of scope**. @@ -315,23 +315,23 @@ expression. For example:: means block the url IF:: - (domain: youngscholars.unimelb.edu.au AND substring: wp-login.php?action=logout) OR domain: malware.us + ("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us" Automatic scoping based on seed urls ------------------------------------ Brozzler usually generates an ``accept`` scope rule based on the seed url. It -does this to fulfill a crawl operator's expectation that everything "under" the -seed will be crawled. +does this to fulfill the usual expectation that everything "under" the seed +will be crawled. To generate the rule, brozzler canonicalizes the seed url using the `urlcanon `_ library's "semantic" canonicalizer, then -removing the query string if any, and finally serializing the result in SURT -form. For example, a seed url of +removes the query string if any, and finally serializes the result in SSURT +[1]_ form. For example, a seed url of ``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes ``com,example,www,//https:/foo/bar?a=b&c=d``. If the url in the browser location bar at the end of brozzling the seed page -differs from the seed url, brozzler automatically adds a second "accept" rule +differs from the seed url, brozzler automatically adds a second ``accept`` rule to ensure the site is in scope, as if the new url were the original seed url. It does this so that, for example, if ``http://example.com/`` redirects to ``http://www.example.com/``, the rest of the ``www.example.com`` is in scope. @@ -343,7 +343,7 @@ differs in a few respects. slash. 2. Canonicalization does not attempt to match heritrix exactly, though it usually does match. -3. 
When generating a SURT for an https url, heritrix changes the scheme to +3. When generating a surt for an https url, heritrix changes the scheme to http. For example, the heritrix surt for ``https://www.example.com/`` is ``http://(com,example,www,)`` and this means that all of ``http://www.example.com/*`` and ``https://www.example.com/*`` are in @@ -351,10 +351,10 @@ differs in a few respects. not match anything. Brozzler does no scheme munging. 4. Brozzler identifies seed "redirects" by retrieving the url from the browser's location bar at the end of brozzling the seed page, whereas - heritrix follows http redirects. + heritrix follows http 3xx redirects. 5. Brozzler uses ssurt instead of surt. -6. There is currently no brozzler option to disable the automatic ``accept`` - surt(s). +6. There is currently no brozzler option to disable the automatically generated + ``accept`` rules. Scope settings -------------- @@ -366,7 +366,9 @@ Scope settings +======+==========+=========+ | list | no | *none* | +------+----------+---------+ -List of scope rules. +List of scope rules. If any of the rules match, and the url is within +``max_hops`` from seed, and none of the ``block`` rules apply, the url is in +scope. ``blocks`` ~~~~~~~~~~~ @@ -375,7 +377,7 @@ List of scope rules. +======+==========+=========+ | list | no | *none* | +------+----------+---------+ -List of scope rules. +List of scope rules. If any of the rules match, the url is deemed out of scope. ``max_hops`` ~~~~~~~~~~~~ @@ -384,6 +386,7 @@ List of scope rules. +========+==========+=========+ | number | no | *none* | +--------+----------+---------+ +Maximum number of hops from seed. ``max_hops_off`` ~~~~~~~~~~~~~~~~ @@ -392,6 +395,8 @@ List of scope rules. +========+==========+=========+ | number | no | 0 | +--------+----------+---------+ +Expands the scope to include urls up to this many hops from the last page that +was in scope thanks to an ``accept`` rule. 
Scope rule conditions --------------------- @@ -403,6 +408,8 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if the host part of the canonicalized url is ``domain`` or a +subdomain. ``substring`` ~~~~~~~~~~~~~ @@ -411,6 +418,7 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if ``substring`` is found anywhere in the canonicalized url. ``regex`` ~~~~~~~~~ @@ -419,6 +427,7 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if the full canonicalized url matches ``regex``. ``ssurt`` ~~~~~~~~~ @@ -427,6 +436,7 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``. ``surt`` ~~~~~~~~ @@ -435,6 +445,7 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if the canonicalized url in SURT [2]_ form starts with ``surt``. ``parent_url_regex`` ~~~~~~~~~~~~~~~~~~~~ @@ -443,4 +454,8 @@ Scope rule conditions +========+==========+=========+ | string | no | *none* | +--------+----------+---------+ +Matches if the full canonicalized parent url matches ``parent_url_regex``. The parent url +is the url of the page in which the link was found. +.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst +.. 
[2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html From 1572fd3ed6d3fb0991ff86618541126b6aef9155 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 15 May 2018 16:52:48 -0700 Subject: [PATCH 17/24] missed a spot where is_permitted_by_robots needs monkeying --- tests/test_frontier.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 4906919..adf04b1 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -1000,10 +1000,15 @@ def test_max_hops_off(): assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False - # two of these are in scope because of max_hops_off - frontier.scope_and_schedule_outlinks(site, seed_page, [ - 'http://foo.org/', 'https://example.com/toot', - 'http://example.com/toot', 'https://some.bad.domain/something']) + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + # two of these are in scope because of max_hops_off + frontier.scope_and_schedule_outlinks(site, seed_page, [ + 'http://foo.org/', 'https://example.com/toot', + 'http://example.com/toot', 'https://some.bad.domain/something']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) @@ -1062,8 +1067,13 @@ def test_max_hops_off(): # next hop is past max_hops_off, but normal in scope url is in scope foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0] - frontier.scope_and_schedule_outlinks(site, foo_page, [ - 'http://foo.org/bar', 'http://example.com/blah']) + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + frontier.scope_and_schedule_outlinks(site, foo_page, [ + 
'http://foo.org/bar', 'http://example.com/blah']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots assert foo_page == { 'brozzle_count': 0, 'claimed': False, From b9b8dcd0626007ff850319df47369b7ab749ed21 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:19:23 -0700 Subject: [PATCH 18/24] backward compatibility for old scope["surt"] and make sure to store ssurt as string in rethinkdb --- brozzler/model.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 5e787dc..2ef9844 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -183,14 +183,24 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} + + # backward compatibility + if "surt" in self.scope: + if not "accepts" in self.scope: + self.scope["accepts"] = [] + self.scope["accepts"].append({"surt": self.scope["surt"]}) + del self.scope["surt"] + + # backward compatibility if ("max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope): self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] if "max_hops_off_surt" in self.scope: del self.scope["max_hops_off_surt"] + if self.seed: self._accept_ssurt_if_not_redundant( - brozzler.site_surt_canon(self.seed).ssurt()) + brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) if not "starts_and_stops" in self: if self.get("start_time"): # backward compatibility @@ -219,7 +229,7 @@ class Site(doublethink.Document, ElapsedMixIn): def note_seed_redirect(self, url): self._accept_ssurt_if_not_redundant( - brozzler.site_surt_canon(url).ssurt()) + brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) def extra_headers(self): hdrs = {} From 338d2e48f9888644f71d795e733abed142132764 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:26:32 -0700 Subject: [PATCH 19/24] update warcprox dependency to include recent fixes --- .travis.yml | 2 +- 
setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d8fd44f..8f36f3f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest +- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b2.dev173' pytest - chromium-browser --version - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser diff --git a/setup.py b/setup.py index e6ce051..46b58c9 100644 --- a/setup.py +++ b/setup.py @@ -79,7 +79,7 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.4b1.dev145', + 'warcprox>=2.4b2.dev173', 'pywb<2', 'flask>=0.11', 'gunicorn' From ac735639ffba5d428e4fc13a4be367d99ff29a46 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 14:41:49 -0700 Subject: [PATCH 20/24] incorporate urlcanon fix --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 46b58c9..68ef41f 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ setuptools.setup( 'requests', 'websocket-client!=0.39.0', 'pillow==3.3.0', - 'urlcanon>=0.1.dev16', + 'urlcanon>=0.1.dev23', 'doublethink>=0.2.0.dev88', 'rethinkdb>=2.3,<2.4', 'cerberus==1.0.1', From 399c097c7cac8f8e183412a824a5d7c314dfd9d1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 15:48:29 -0700 Subject: [PATCH 21/24] travis-ci install warcprox from github --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 8f36f3f..5cb807b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook 
--extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b2.dev173' pytest +- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest - chromium-browser --version - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser From 5bb392ec7c26836fdaf0b508bd85bb768a70c649 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 16:43:10 -0700 Subject: [PATCH 22/24] ssurts are strings now because they're friendlier that way in rethinkdb --- tests/test_cluster.py | 2 +- tests/test_frontier.py | 34 +++++++++++++++++----------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 41afbcb..623ec98 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -454,7 +454,7 @@ def test_seed_redirect(httpd): site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope == {'accepts': [{'ssurt': ('localhost,//%s:http:/site5/redirect/' % httpd.server_port).encode('ascii')}]} + assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index adf04b1..d66773e 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -73,7 +73,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': {'accepts': [{'ssurt': b'com,example,//http:/'}]}, + 'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]}, 'seed': 'http://example.com', 'starts_and_stops': [ { @@ -89,7 +89,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': 
brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': {'accepts': [{'ssurt': b'org,example,//https:/'}]}, + 'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]}, 'seed': 'https://example.org/', 'starts_and_stops': [ { @@ -439,7 +439,7 @@ def test_field_defaults(): brozzler.Site.table_ensure(rr) site = brozzler.Site(rr, {'seed': 'http://example.com/'}) assert site.id is None - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]} site.save() assert site.id assert site.scope @@ -633,15 +633,15 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/b/', }) page.save() - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) assert site.scope == {'accepts': [ - {'ssurt': b'com,example,//http:/a/'}, - {'ssurt': b'com,example,//http:/b/'}]} + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} site.refresh() assert site.scope == {'accepts': [ - {'ssurt': b'com,example,//http:/a/'}, - {'ssurt': b'com,example,//http:/b/'}]} + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -660,11 +660,11 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/a/x/', }) page.save() - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 
'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -682,11 +682,11 @@ def test_completed_page(): 'hops_from_seed': 1, 'redirect_url':'http://example.com/d/', }) page.save() - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'accepts': [{'ssurt': b'com,example,//http:/a/'}]} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -726,7 +726,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/'}) brozzler.new_site(frontier, site) - assert site.scope == {'accepts': [{'ssurt': b'org,example,//http:/'}]} + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -737,7 +737,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'}) brozzler.new_site(frontier, site) - assert site.scope == {'accepts': [{'ssurt': b'org,example,//http:/'}]} + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -985,7 +985,7 @@ def test_max_hops_off(): 'seed': 'http://example.com/', 'scope': { 'max_hops_off_surt': 1, - 'blocks': [{'ssurt': b'domain,bad,'}]}}) + 'blocks': [{'ssurt': 'domain,bad,'}]}}) brozzler.new_site(frontier, site) site.refresh() # get it back from the db From 67558528cb81878c73a6710eb31889b4e03c7215 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 16:43:38 -0700 Subject: [PATCH 23/24] fix bad copy/paste --- brozzler/model.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/brozzler/model.py b/brozzler/model.py index 2ef9844..f3c8679 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -229,7 +229,7 @@ class Site(doublethink.Document, ElapsedMixIn): def note_seed_redirect(self, url): self._accept_ssurt_if_not_redundant( - brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) + brozzler.site_surt_canon(url).ssurt().decode('ascii')) def extra_headers(self): hdrs = {} From 331d07fe882e186ad8ba7cb14cc02a21e83a4a97 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 16 May 2018 17:11:08 -0700 Subject: [PATCH 24/24] these ssurts are strings too --- tests/test_cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 623ec98..0ec5026 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -479,8 +479,8 @@ def test_seed_redirect(httpd): # check that scope has been updated properly assert site.scope == {'accepts': [ - {'ssurt': ('localhost,//%s:http:/site5/redirect/' % httpd.server_port).encode('ascii')}, - {'ssurt': ('localhost,//%s:http:/site5/destination/' % httpd.server_port).encode('ascii')}]} + {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, + {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()