diff --git a/.travis.yml b/.travis.yml index d8fd44f..5cb807b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest +- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest - chromium-browser --version - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser diff --git a/brozzler/frontier.py b/brozzler/frontier.py index a0f8ab4..2e076d3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -291,75 +291,80 @@ class RethinkDbFrontier: {"start":doublethink.utcnow(), "stop":None}) site.save() + def _build_fresh_page(self, site, parent_page, url, hops_off=0): + url_for_scoping = urlcanon.semantic(url) + url_for_crawling = urlcanon.whatwg(url) + hashtag = (url_for_crawling.hash_sign + + url_for_crawling.fragment).decode('utf-8') + urlcanon.canon.remove_fragment(url_for_crawling) + page = brozzler.Page(self.rr, { + 'url': str(url_for_crawling), + 'site_id': site.id, + 'job_id': site.job_id, + 'hops_from_seed': parent_page.hops_from_seed + 1, + 'via_page_id': parent_page.id, + 'hops_off_surt': hops_off, + 'hashtags': [hashtag] if hashtag else []}) + return page + + def _merge_page(self, existing_page, fresh_page): + ''' + Utility method for merging info from `brozzler.Page` instances + representing the same url but with possibly different metadata. + ''' + existing_page.priority += fresh_page.priority + existing_page.hashtags = list(set( + existing_page.hashtags + fresh_page.hashtags)) + existing_page.hops_off = min( + existing_page.hops_off, fresh_page.hops_off) + def _scope_and_enforce_robots(self, site, parent_page, outlinks): ''' Returns tuple ( - set of in scope urls (uncanonicalized) accepted by robots policy, + dict of {page_id: Page} of fresh `brozzler.Page` representing in + scope links accepted by robots policy, set of in scope urls (canonicalized) blocked by robots policy, set of out-of-scope urls (canonicalized)). ''' - in_scope = set() + pages = {} # {page_id: Page, ...} blocked = set() out_of_scope = set() for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) url_for_crawling = urlcanon.whatwg(url) - urlcanon.canon.remove_fragment(url_for_crawling) - if site.is_in_scope(url_for_scoping, parent_page=parent_page): + decision = site.accept_reject_or_neither( + url_for_scoping, parent_page=parent_page) + if decision is True: + hops_off = 0 + elif decision is None: + decision = parent_page.hops_off < site.scope.get( + 'max_hops_off', 0) + hops_off = parent_page.hops_off + 1 + if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): - in_scope.add(url) + fresh_page = self._build_fresh_page( + site, parent_page, url, hops_off) + if fresh_page.id in pages: + self._merge_page(pages[fresh_page.id], fresh_page) + else: + pages[fresh_page.id] = fresh_page else: blocked.add(str(url_for_crawling)) else: out_of_scope.add(str(url_for_crawling)) - return in_scope, blocked, out_of_scope - - def _build_fresh_pages(self, site, parent_page, urls): - ''' - Returns a dict of page_id => brozzler.Page. 
- ''' - pages = {} - for url in urls: - url_for_scoping = urlcanon.semantic(url) - url_for_crawling = urlcanon.whatwg(url) - hashtag = (url_for_crawling.hash_sign - + url_for_crawling.fragment).decode('utf-8') - urlcanon.canon.remove_fragment(url_for_crawling) - if not url_for_scoping.surt().startswith( - site.scope['surt'].encode('utf-8')): - hops_off_surt = parent_page.hops_off_surt + 1 - else: - hops_off_surt = 0 - page = brozzler.Page(self.rr, { - 'url': str(url_for_crawling), - 'site_id': site.id, - 'job_id': site.job_id, - 'hops_from_seed': parent_page.hops_from_seed + 1, - 'via_page_id': parent_page.id, - 'hops_off_surt': hops_off_surt, - 'hashtags': []}) - if page.id in pages: - pages[page.id].priority += page.priority - page = pages[page.id] - else: - pages[page.id] = page - if hashtag: - page.hashtags = list(set(page.hashtags + [hashtag])) - return pages + return pages, blocked, out_of_scope def scope_and_schedule_outlinks(self, site, parent_page, outlinks): decisions = {'accepted':set(),'blocked':set(),'rejected':set()} counts = {'added':0,'updated':0,'rejected':0,'blocked':0} - in_scope, blocked, out_of_scope = self._scope_and_enforce_robots( + fresh_pages, blocked, out_of_scope = self._scope_and_enforce_robots( site, parent_page, outlinks) decisions['blocked'] = blocked decisions['rejected'] = out_of_scope counts['blocked'] += len(blocked) counts['rejected'] += len(out_of_scope) - fresh_pages = self._build_fresh_pages(site, parent_page, in_scope) - # get existing pages from rethinkdb results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 14445bc..6069de8 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -65,7 +65,7 @@ id: max_hops: type: integer - max_hops_off_surt: + max_hops_off: type: integer metadata: diff --git a/brozzler/model.py b/brozzler/model.py index 9c1a60f..f3c8679 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -99,7 +99,7 @@ def new_job(frontier, job_conf): def new_site(frontier, site): site.id = str(uuid.uuid4()) - logging.info("new site {}".format(site)) + logging.info("new site %s", site) # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished @@ -183,9 +183,24 @@ class Site(doublethink.Document, ElapsedMixIn): self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} - if not "surt" in self.scope and self.seed: - self.scope["surt"] = brozzler.site_surt_canon( - self.seed).surt().decode('ascii') + + # backward compatibility + if "surt" in self.scope: + if not "accepts" in self.scope: + self.scope["accepts"] = [] + self.scope["accepts"].append({"surt": self.scope["surt"]}) + del self.scope["surt"] + + # backward compatibility + if ("max_hops_off_surt" in self.scope + and not "max_hops_off" in self.scope): + self.scope["max_hops_off"] = self.scope["max_hops_off_surt"] + if "max_hops_off_surt" in self.scope: + del self.scope["max_hops_off_surt"] + + if self.seed: + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(self.seed).ssurt().decode('ascii')) if not "starts_and_stops" in self: if self.get("start_time"): # backward compatibility @@ -201,12 +216,20 @@ class Site(doublethink.Document, ElapsedMixIn): def __str__(self): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) + def 
_accept_ssurt_if_not_redundant(self, ssurt): + if not "accepts" in self.scope: + self.scope["accepts"] = [] + simple_rule_ssurts = ( + rule["ssurt"] for rule in self.scope["accepts"] + if set(rule.keys()) == {'ssurt'}) + if not any(ssurt.startswith(ss) for ss in simple_rule_ssurts): + self.logger.info( + "adding ssurt %s to scope accept rules", ssurt) + self.scope["accepts"].append({"ssurt": ssurt}) + def note_seed_redirect(self, url): - new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii") - if not new_scope_surt.startswith(self.scope["surt"]): - self.logger.info("changing site scope surt from {} to {}".format( - self.scope["surt"], new_scope_surt)) - self.scope["surt"] = new_scope_surt + self._accept_ssurt_if_not_redundant( + brozzler.site_surt_canon(url).ssurt().decode('ascii')) def extra_headers(self): hdrs = {} @@ -215,9 +238,20 @@ class Site(doublethink.Document, ElapsedMixIn): self.warcprox_meta, separators=(',', ':')) return hdrs - def is_in_scope(self, url, parent_page=None): + def accept_reject_or_neither(self, url, parent_page=None): + ''' + Returns `True` (accepted), `False` (rejected), or `None` (no decision). + + `None` usually means rejected, unless `max_hops_off` comes into play. + ''' if not isinstance(url, urlcanon.ParsedUrl): url = urlcanon.semantic(url) + + if not url.scheme in (b'http', b'https'): + # XXX doesn't belong here maybe (where? worker ignores unknown + # schemes?) + return False + try_parent_urls = [] if parent_page: try_parent_urls.append(urlcanon.semantic(parent_page.url)) @@ -225,44 +259,36 @@ class Site(doublethink.Document, ElapsedMixIn): try_parent_urls.append( urlcanon.semantic(parent_page.redirect_url)) - might_accept = False - if not url.scheme in (b'http', b'https'): - # XXX doesn't belong here maybe (where? worker ignores unknown - # schemes?) - return False - elif (parent_page and "max_hops" in self.scope + # enforce max_hops + if (parent_page and "max_hops" in self.scope and parent_page.hops_from_seed >= self.scope["max_hops"]): - pass - elif url.surt().startswith(self.scope["surt"].encode("utf-8")): - might_accept = True - elif parent_page and parent_page.hops_off_surt < self.scope.get( - "max_hops_off_surt", 0): - might_accept = True - elif "accepts" in self.scope: - for accept_rule in self.scope["accepts"]: - rule = urlcanon.MatchRule(**accept_rule) + return False + + # enforce reject rules + if "blocks" in self.scope: + for block_rule in self.scope["blocks"]: + rule = urlcanon.MatchRule(**block_rule) if try_parent_urls: for parent_url in try_parent_urls: if rule.applies(url, parent_url): - might_accept = True + return False else: if rule.applies(url): - might_accept = True + return False - if might_accept: - if "blocks" in self.scope: - for block_rule in self.scope["blocks"]: - rule = urlcanon.MatchRule(**block_rule) - if try_parent_urls: - for parent_url in try_parent_urls: - if rule.applies(url, parent_url): - return False - else: - if rule.applies(url): - return False - return True - else: - return False + # honor accept rules + for accept_rule in self.scope["accepts"]: + rule = urlcanon.MatchRule(**accept_rule) + if try_parent_urls: + for parent_url in try_parent_urls: + if rule.applies(url, parent_url): + return True + else: + if rule.applies(url): + return True + + # no decision if we reach here + return None class Page(doublethink.Document): logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -280,8 +306,12 @@ class Page(doublethink.Document): self.brozzle_count = 0 if not "claimed" in self: self.claimed = False - if not "hops_off_surt" in self: - self.hops_off_surt = 0 + if "hops_off_surt" in self and not "hops_off" in self: + self.hops_off = self.hops_off_surt + if "hops_off_surt" in self: + del self["hops_off_surt"] + if not "hops_off" in self: + self.hops_off = 0 if not "needs_robots_check" in self: self.needs_robots_check = False if not "priority" in self: diff --git a/job-conf.rst b/job-conf.rst index 1174f1a..1fa5bc6 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -1,17 +1,19 @@ -brozzler job configuration +Brozzler Job Configuration ************************** -Jobs are defined using yaml files. Options may be specified either at the -top-level or on individual seeds. At least one seed url must be specified, +Jobs are defined using yaml files. At least one seed url must be specified; everything else is optional. -an example -========== +.. contents:: + +Example +======= :: id: myjob time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving ignore_robots: false max_claimed_sites: 2 warcprox_meta: @@ -35,15 +37,14 @@ an example scope: surt: http://(org,example, -how inheritance works +How inheritance works ===================== -Most of the available options apply to seeds. Such options can also be -specified at the top level, in which case the seeds inherit the options. If -an option is specified both at the top level and at the level of an individual -seed, the results are merged with the seed-level value taking precedence in -case of conflicts. It's probably easiest to make sense of this by way of an -example. +Most of the settings that apply to seeds can also be specified at the top +level, in which case all seeds inherit those settings. If an option is +specified both at the top level and at seed level, the results are merged with +the seed-level value taking precedence in case of conflicts. It's probably +easiest to make sense of this by way of an example. In the example yaml above, ``warcprox_meta`` is specified at the top level and at the seed level for the seed http://one.example.org/. At the top level we @@ -79,101 +80,150 @@ Notice that: - Since ``buckets`` is a list, the merged result includes all the values from both the top level and the seed level. -settings reference -================== +Settings +======== + +Top-level settings +------------------ ``id`` ------- -+-----------+--------+----------+--------------------------+ -| scope | type | required | default | -+===========+========+==========+==========================+ -| top-level | string | no | *generated by rethinkdb* | -+-----------+--------+----------+--------------------------+ +~~~~~~ ++--------+----------+--------------------------+ +| type | required | default | ++========+==========+==========================+ +| string | no | *generated by rethinkdb* | ++--------+----------+--------------------------+ An arbitrary identifier for this job. Must be unique across this deployment of brozzler. -``seeds`` ---------- -+-----------+------------------------+----------+---------+ -| scope | type | required | default | -+===========+========================+==========+=========+ -| top-level | list (of dictionaries) | yes | *n/a* | -+-----------+------------------------+----------+---------+ -List of seeds. Each item in the list is a dictionary (associative array) which -defines the seed.
It must specify ``url`` (see below) and can additionally -specify any of the settings of scope *seed-level*. - ``max_claimed_sites`` ---------------------- +-----------+--------+----------+---------+ | scope | type | required | default | +===========+========+==========+=========+ | top-level | number | no | *none* | +-----------+--------+----------+---------+ +~~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ Puts a cap on the number of sites belonging to a given job that can be brozzled simultaneously across the cluster. Addresses the problem of a job with many seeds starving out other jobs. +``seeds`` +~~~~~~~~~ ++------------------------+----------+---------+ +| type | required | default | ++========================+==========+=========+ +| list (of dictionaries) | yes | *n/a* | ++------------------------+----------+---------+ +List of seeds. Each item in the list is a dictionary (associative array) which +defines the seed. It must specify ``url`` (see below) and can additionally +specify any seed settings. + +Seed-level-only settings +------------------------ +These settings can be specified only at the seed level, unlike most seed +settings, which can also be specified at the top level. + ``url`` -------- -+------------+--------+----------+---------+ -| scope | type | required | default | -+============+========+==========+=========+ -| seed-level | string | yes | *n/a* | -+------------+--------+----------+---------+ -The seed url. +~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | yes | *n/a* | ++--------+----------+---------+ +The seed url. Crawling starts here. + +``username`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +If set, used to populate automatically detected login forms. See explanation at +"password" below. + +``password`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +If set, used to populate automatically detected login forms. If ``username`` +and ``password`` are configured for a seed, brozzler will look for a login form +on each page it crawls for that seed. A form that has a single text or email +field (the username), a single password field (``<input type="password">``), +and has ``method="POST"`` is considered to be a login form. The form may have +other fields like checkboxes and hidden fields. For these, brozzler will leave +the default values in place. Brozzler submits login forms after page load. +Then brozzling proceeds as usual. + +Seed-level / top-level settings +------------------------------- +These are seed settings that can also be specified at the top level, in which +case they are inherited by all seeds.
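+For example, in this hypothetical job (the urls and values are illustrative
+only), ``time_limit`` and ``ignore_robots`` are set once at the top level and
+inherited by both seeds, while the second seed overrides ``time_limit``::
+
+    time_limit: 3600     # inherited by every seed below
+    ignore_robots: true  # inherited by every seed below
+    seeds:
+    - url: http://one.example.org/
+    - url: http://two.example.org/
+      time_limit: 7200   # seed-level value takes precedence
+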
``metadata`` ------------- -+-----------------------+------------+----------+---------+ -| scope | type | required | default | -+=======================+============+==========+=========+ -| seed-level, top-level | dictionary | no | *none* | -+-----------------------+------------+----------+---------+ +~~~~~~~~~~~~ ++------------+----------+---------+ +| type | required | default | ++============+==========+=========+ +| dictionary | no | *none* | ++------------+----------+---------+ Arbitrary information about the crawl job or site. Merely informative, not used by brozzler for anything. Could be of use to some external process. ``time_limit`` --------------- -+-----------------------+--------+----------+---------+ -| scope | type | required | default | -+=======================+========+==========+=========+ -| seed-level, top-level | number | no | *none* | -+-----------------------+--------+----------+---------+ -Time limit in seconds. If not specified, there no time limit. Time limit is +~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ +Time limit in seconds. If not specified, there is no time limit. Time limit is enforced at the seed level. If a time limit is specified at the top level, it is inherited by each seed as described above, and enforced individually on each seed. +``proxy`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +HTTP proxy, with the format ``host:port``. Typically configured to point to +warcprox for archival crawling. + ``ignore_robots`` ------------------ -+-----------------------+---------+----------+-----------+ -| scope | type | required | default | -+=======================+=========+==========+===========+ -| seed-level, top-level | boolean | no | ``false`` | -+-----------------------+---------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++---------+----------+-----------+ +| type | required | default | ++=========+==========+===========+ +| boolean | no | ``false`` | ++---------+----------+-----------+ If set to ``true``, brozzler will happily crawl pages that would otherwise be blocked by robots.txt rules. ``user_agent`` --------------- -+-----------------------+---------+----------+---------+ -| scope | type | required | default | -+=======================+=========+==========+=========+ -| seed-level, top-level | string | no | *none* | -+-----------------------+---------+----------+---------+ +~~~~~~~~~~~~~~ ++---------+----------+---------+ +| type | required | default | ++=========+==========+=========+ +| string | no | *none* | ++---------+----------+---------+ The ``User-Agent`` header brozzler will send to identify itself to web servers. It's good etiquette to include a project URL with a notice to webmasters that explains why you're crawling, how to block the crawler with robots.txt and how to contact the operator if the crawl is causing problems.
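+For example (the crawler name and contact url here are illustrative only)::
+
+    user_agent: Mozilla/5.0 (compatible; mycrawlbot/1.0; +http://example.org/crawlbot.html)
+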
``warcprox_meta`` ------------------ -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ +~~~~~~~~~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is configured. The value of the Warcprox-Meta header is a json blob. It is used to pass settings and information to warcprox. Warcprox does not forward the header @@ -195,36 +245,217 @@ becomes:: Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}} ``scope`` ---------- -+-----------------------+------------+----------+-----------+ -| scope | type | required | default | -+=======================+============+==========+===========+ -| seed-level, top-level | dictionary | no | ``false`` | -+-----------------------+------------+----------+-----------+ -Scope rules. *TODO* +~~~~~~~~~ ++------------+----------+-----------+ +| type | required | default | ++============+==========+===========+ +| dictionary | no | ``false`` | ++------------+----------+-----------+ +Scope specification for the seed. See the "Scoping" section which follows. -``surt`` -------- -+-------------+--------+----------+---------------------------+ -| scope | type | required | default | -+=============+========+==========+===========================+ -| scope-level | string | no | *generated from seed url* | -+-------------+--------+----------+---------------------------+ +Scoping +======= + +The scope of a seed determines which links are scheduled for crawling and which +are not. Example:: + + scope: + accepts: + - ssurt: com,example,//https:/ + - parent_url_regex: ^https?://(www\.)?youtube.com/(user|channel)/.*$ + regex: ^https?://(www\.)?youtube.com/watch\?.*$ + - surt: http://(com,google,video, + - surt: http://(com,googlevideo, + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + max_hops: 20 + max_hops_off: 0 + +Toward the end of the process of brozzling a page, brozzler obtains a list of +navigational links (``<a href>`` and similar) on the page, and evaluates +each link to determine whether it is in scope or out of scope for the crawl. +Then, newly discovered links that are in scope are scheduled to be crawled, and +previously discovered links get a priority bump. + +How brozzler applies scope rules +-------------------------------- + +Each scope rule has one or more conditions. If all of the conditions match, +then the scope rule as a whole matches. For example:: + + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + +This rule applies if the domain of the url is "youngscholars.unimelb.edu.au" or +a subdomain, and the string "wp-login.php?action=logout" is found somewhere in +the url. + +Brozzler applies these logical steps to decide whether a url is in or out of +scope: + +1. If the number of hops from seed is greater than ``max_hops``, the url is + **out of scope**. +2. Otherwise, if any ``block`` rule matches, the url is **out of scope**. +3. Otherwise, if any ``accept`` rule matches, the url is **in scope**. +4.
Otherwise, if the url is at most ``max_hops_off`` hops from the last page + that was in scope thanks to an ``accept`` rule, the url is **in scope**. +5. Otherwise (no rules match), the url is **out of scope**. + +Notably, ``block`` rules take precedence over ``accept`` rules. + +It may also be helpful to think about a list of scope rules as a boolean +expression. For example:: + + blocks: + - domain: youngscholars.unimelb.edu.au + substring: wp-login.php?action=logout + - domain: malware.us + +means block the url IF:: + + ("domain: youngscholars.unimelb.edu.au" AND "substring: wp-login.php?action=logout") OR "domain: malware.us" + +Automatic scoping based on seed urls +------------------------------------ +Brozzler usually generates an ``accept`` scope rule based on the seed url. It +does this to fulfill the usual expectation that everything "under" the seed +will be crawled. + +To generate the rule, brozzler canonicalizes the seed url using the `urlcanon +<https://github.com/iipc/urlcanon>`_ library's "semantic" canonicalizer, then +removes the query string if any, and finally serializes the result in SSURT +[1]_ form. For example, a seed url of +``https://www.EXAMPLE.com:443/foo//bar?a=b&c=d#fdiap`` becomes +``com,example,www,//https:/foo/bar``. + +If the url in the browser location bar at the end of brozzling the seed page +differs from the seed url, brozzler automatically adds a second ``accept`` rule +to ensure the site is in scope, as if the new url were the original seed url. +It does this so that, for example, if ``http://example.com/`` redirects to +``http://www.example.com/``, the rest of the ``www.example.com`` site is in +scope. + +Brozzler derives its general approach to the seed surt from Heritrix, but +differs in a few respects: + +1. Unlike Heritrix, brozzler does not strip the path segment after the last + slash. +2. Canonicalization does not attempt to match Heritrix exactly, though it + usually does match. +3. When generating a surt for an https url, Heritrix changes the scheme to + http. For example, the Heritrix surt for ``https://www.example.com/`` is + ``http://(com,example,www,)`` and this means that all of + ``http://www.example.com/*`` and ``https://www.example.com/*`` are in + scope. It also means that a manually specified surt with scheme "https" does + not match anything. Brozzler does no scheme munging. +4. Brozzler identifies seed "redirects" by retrieving the url from the + browser's location bar at the end of brozzling the seed page, whereas + Heritrix follows http 3xx redirects. +5. Brozzler uses ssurt instead of surt. +6. There is currently no brozzler option to disable the automatically generated + ``accept`` rules. + +Scope settings +-------------- ``accepts`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ ++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ +List of scope rules. If any of the rules match, and the url is within +``max_hops`` from seed, and none of the ``block`` rules apply, the url is in +scope.
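+For example, this sketch (the urls are illustrative only) accepts everything
+under one host by ssurt prefix, plus a single page on another host by regex::
+
+    scope:
+      accepts:
+      - ssurt: org,example,//http:/
+      - regex: ^https?://www\.example\.net/interesting/page\.html$
+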
``blocks`` ------------ -+-------------+------+----------+---------+ -| scope | type | required | default | -+=============+======+==========+=========+ -| scope-level | list | no | *none* | -+-------------+------+----------+---------+ +~~~~~~~~~~~ ++------+----------+---------+ +| type | required | default | ++======+==========+=========+ +| list | no | *none* | ++------+----------+---------+ +List of scope rules. If any of the rules match, the url is deemed out of scope. +``max_hops`` +~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | *none* | ++--------+----------+---------+ +Maximum number of hops from seed. +``max_hops_off`` +~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| number | no | 0 | ++--------+----------+---------+ +Expands the scope to include urls up to this many hops from the last page that +was in scope thanks to an ``accept`` rule. + +Scope rule conditions +--------------------- + +``domain`` +~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the host part of the canonicalized url is ``domain`` or a +subdomain. + +``substring`` +~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if ``substring`` is found anywhere in the canonicalized url. + +``regex`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the full canonicalized url matches ``regex``. + +``ssurt`` +~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the canonicalized url in SSURT [1]_ form starts with ``ssurt``. + +``surt`` +~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the canonicalized url in SURT [2]_ form starts with ``surt``. + +``parent_url_regex`` +~~~~~~~~~~~~~~~~~~~~ ++--------+----------+---------+ +| type | required | default | ++========+==========+=========+ +| string | no | *none* | ++--------+----------+---------+ +Matches if the full canonicalized parent url matches ``parent_url_regex``. The +parent url is the url of the page in which the link was found. + +.. [1] SSURT is described at https://github.com/iipc/urlcanon/blob/master/ssurt.rst +.. [2] SURT is described at http://crawler.archive.org/articles/user_manual/glossary.html diff --git a/setup.py b/setup.py index 289d224..e7f7da3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2017 Internet Archive +Copyright (C) 2014-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
@@ -69,8 +69,8 @@ setuptools.setup( 'requests', 'websocket-client!=0.39.0', 'pillow==3.3.0', - 'urlcanon>=0.1.dev16', - 'doublethink>=0.2.0.dev81', + 'urlcanon>=0.1.dev23', + 'doublethink>=0.2.0.dev88', 'rethinkdb>=2.3,<2.4', 'cerberus==1.0.1', 'jinja2', @@ -79,7 +79,7 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.4b1.dev145', + 'warcprox>=2.4b2.dev173', 'pywb<2', 'flask>=0.11', 'gunicorn' diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 2559e07..0ec5026 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -448,13 +448,13 @@ def test_login(httpd): assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url def test_seed_redirect(httpd): - test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() + test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port site = brozzler.Site(rr, { 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port + assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -478,7 +478,9 @@ def test_seed_redirect(httpd): assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port # check that scope has been updated properly - assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port + assert site.scope == {'accepts': [ + {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, + {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 9b80a4f..d66773e 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -73,9 +73,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'http://(com,example,)/' - }, + 'scope': {'accepts': [{'ssurt': 'com,example,//http:/'}]}, 'seed': 'http://example.com', 'starts_and_stops': [ { @@ -91,9 +89,7 @@ def test_basics(): 'job_id': job.id, 'last_claimed': brozzler.EPOCH_UTC, 'last_disclaimed': brozzler.EPOCH_UTC, - 'scope': { - 'surt': 'https://(org,example,)/', - }, + 'scope': {'accepts': [{'ssurt': 'org,example,//https:/'}]}, 'seed': 'https://example.org/', 'starts_and_stops': [ { @@ -110,7 +106,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), 'job_id': job.id, 'needs_robots_check': True, @@ -124,7 +120,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), 'job_id': job.id, 'needs_robots_check': True, @@ -443,8 +439,7 @@ def test_field_defaults(): brozzler.Site.table_ensure(rr) site = brozzler.Site(rr, {'seed': 'http://example.com/'}) assert site.id is None - assert site.scope - assert site.scope['surt'] == 'http://(com,example,)/' + 
assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/'}]} site.save() assert site.id assert site.scope @@ -638,11 +633,15 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/b/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/b/'} + assert site.scope == {'accepts': [ + {'ssurt': 'com,example,//http:/a/'}, + {'ssurt': 'com,example,//http:/b/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -661,11 +660,11 @@ def test_completed_page(): 'hops_from_seed': 0, 'redirect_url':'http://example.com/a/x/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -683,11 +682,11 @@ def test_completed_page(): 'hops_from_seed': 1, 'redirect_url':'http://example.com/d/', }) page.save() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} frontier.completed_page(site, page) - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} site.refresh() - assert site.scope == {'surt': 'http://(com,example,)/a/'} + assert site.scope == {'accepts': [{'ssurt': 'com,example,//http:/a/'}]} assert page.brozzle_count == 1 assert page.claimed == False page.refresh() @@ -727,7 +726,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -738,7 +737,7 @@ def test_hashtag_seed(): site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'}) brozzler.new_site(frontier, site) - assert site.scope['surt'] == 'http://(org,example,)/' + assert site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]} pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 @@ -908,7 +907,7 @@ def test_choose_warcprox(): svcreg = doublethink.ServiceRegistry(rr) frontier = brozzler.RethinkDbFrontier(rr) - # avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 + # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 rr.table('sites').wait().run() rr.table('services').wait().run() rr.table('sites').index_wait().run() @@ -978,3 +977,136 @@ def test_choose_warcprox(): # clean up rr.table('sites').delete().run() rr.table('services').delete().run() + +def test_max_hops_off(): + rr = doublethink.Rethinker('localhost', db='ignoreme') + frontier = brozzler.RethinkDbFrontier(rr) + site = brozzler.Site(rr, { + 'seed': 
'http://example.com/', + 'scope': { + 'max_hops_off_surt': 1, + 'blocks': [{'ssurt': 'domain,bad,'}]}}) + brozzler.new_site(frontier, site) + site.refresh() # get it back from the db + + # renamed this param + assert not 'max_hops_off_surt' in site.scope + assert site.scope['max_hops_off'] == 1 + + seed_page = frontier.seed_page(site.id) + + assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None + assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None + assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True + assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False + + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + # two of these are in scope because of max_hops_off + frontier.scope_and_schedule_outlinks(site, seed_page, [ + 'http://foo.org/', 'https://example.com/toot', + 'http://example.com/toot', 'https://some.bad.domain/something']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots + + pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + + assert len(pages) == 4 + assert pages[0].url == 'http://example.com/' + assert pages[0].hops_off == 0 + assert not 'hops_off_surt' in pages[0] + assert set(pages[0].outlinks['accepted']) == { + 'https://example.com/toot', 'http://foo.org/', + 'http://example.com/toot'} + assert pages[0].outlinks['blocked'] == [] + assert pages[0].outlinks['rejected'] == [ + 'https://some.bad.domain/something'] + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://example.com/toot', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id + } in pages + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'https://example.com/toot', + 'via_page_id': seed_page.id + } in pages + + # next hop is past max_hops_off, but normal in scope url is in scope + foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0] + orig_is_permitted_by_robots = brozzler.is_permitted_by_robots + brozzler.is_permitted_by_robots = lambda *args: True + try: + frontier.scope_and_schedule_outlinks(site, foo_page, [ + 'http://foo.org/bar', 'http://example.com/blah']) + finally: + brozzler.is_permitted_by_robots = orig_is_permitted_by_robots + assert foo_page == { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 1, + 'hops_off': 1, + 'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 12, + 'site_id': site.id, + 'url': 'http://foo.org/', + 'via_page_id': seed_page.id, + 'outlinks': { + 'accepted': ['http://example.com/blah'], + 'blocked': [], + 'rejected': ['http://foo.org/bar'], + } + } + pages = 
sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + assert len(pages) == 5 + assert { + 'brozzle_count': 0, + 'claimed': False, + 'hashtags': [], + 'hops_from_seed': 2, + 'hops_off': 0, + 'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'), + 'job_id': None, + 'needs_robots_check': False, + 'priority': 11, + 'site_id': site.id, + 'url': 'http://example.com/blah', + 'via_page_id': foo_page.id + } in pages + diff --git a/tests/test_units.py b/tests/test_units.py index ce5067c..eed034e 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -94,28 +94,28 @@ blocks: 'url': 'http://example.com/foo/bar?baz=quux#monkey', 'site_id': site.id}) - assert site.is_in_scope('http://example.com/foo/bar', page) - assert not site.is_in_scope('http://example.com/foo/baz', page) + assert site.accept_reject_or_neither('http://example.com/foo/bar', page) is True + assert site.accept_reject_or_neither('http://example.com/foo/baz', page) is None - assert not site.is_in_scope('http://foo.com/some.mp3', page) - assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page) + assert site.accept_reject_or_neither('http://foo.com/some.mp3', page) is None + assert site.accept_reject_or_neither('http://foo.com/blah/audio_file/some.mp3', page) is True - assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page) - assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page) + assert site.accept_reject_or_neither('http://a.b.vimeocdn.com/blahblah', page) is True + assert site.accept_reject_or_neither('https://a.b.vimeocdn.com/blahblah', page) is None - assert site.is_in_scope('https://twitter.com/twit', page) - assert site.is_in_scope('https://twitter.com/twit?lang=en', page) - assert not site.is_in_scope('https://twitter.com/twit?lang=es', page) + assert site.accept_reject_or_neither('https://twitter.com/twit', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=en', page) is True + assert site.accept_reject_or_neither('https://twitter.com/twit?lang=es', page) is False - assert site.is_in_scope('https://www.facebook.com/whatevz', page) + assert site.accept_reject_or_neither('https://www.facebook.com/whatevz', page) is True - assert not site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', page) is None yt_user_page = brozzler.Page(None, { 'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO', 'site_id': site.id, 'hops_from_seed': 10}) - assert site.is_in_scope( - 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) + assert site.accept_reject_or_neither( + 'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page) is True def test_proxy_down(): '''