diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 5291bc0..cde8cad 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -153,6 +153,11 @@ def _remove_query(url): site_surt_canon = urlcanon.Canonicalizer( urlcanon.semantic.steps + [_remove_query]) +import doublethink +import datetime +EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( + tzinfo=doublethink.UTC) + from brozzler.site import Page, Site from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 772fca3..215ee6c 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -337,11 +337,20 @@ class RethinkDbFrontier: return None return brozzler.Page(self.rr, pages[0]) - def site_pages(self, site_id, unbrozzled_only=False): + def site_pages(self, site_id, brozzled=None): + ''' + Args: + site_id (str or int): + brozzled (bool): if true, results include only pages that have + been brozzled at least once; if false, only pages that have + not been brozzled; and if None (the default), all pages + Returns: + iterator of brozzler.Page + ''' results = self.rr.table("pages").between( - [site_id, 0 if unbrozzled_only else r.minval, + [site_id, 1 if brozzled is True else 0, r.minval, r.minval], - [site_id, 0 if unbrozzled_only else r.maxval, + [site_id, 0 if brozzled is False else r.maxval, r.maxval, r.maxval], index="priority_by_site").run() for result in results: diff --git a/brozzler/job.py b/brozzler/job.py index 45ec31e..0120dcb 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -80,6 +80,7 @@ def new_job(frontier, job_conf): sites = [] for seed_conf in job_conf["seeds"]: merged_conf = merge(seed_conf, job_conf) + merged_conf.pop("seeds") merged_conf["job_id"] = job.id merged_conf["seed"] = merged_conf.pop("url") site = brozzler.Site(frontier.rr, merged_conf) diff --git a/brozzler/site.py b/brozzler/site.py index a61ec59..3d7d470 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -26,9 +26,6 @@ import doublethink import datetime import re -_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( - tzinfo=doublethink.UTC) - class Site(doublethink.Document): logger = logging.getLogger(__module__ + "." + __qualname__) table = 'sites' @@ -41,9 +38,9 @@ class Site(doublethink.Document): if not "claimed" in self: self.claimed = False if not "last_disclaimed" in self: - self.last_disclaimed = _EPOCH_UTC + self.last_disclaimed = brozzler.EPOCH_UTC if not "last_claimed" in self: - self.last_claimed = _EPOCH_UTC + self.last_claimed = brozzler.EPOCH_UTC if not "scope" in self: self.scope = {} if not "surt" in self.scope and self.seed: diff --git a/setup.py b/setup.py index 7ca6f6b..c943425 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev211', + version='1.1b9.dev212', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_frontier.py b/tests/test_frontier.py index d276439..7527d53 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -23,6 +23,7 @@ import logging import argparse import doublethink import time +import datetime args = argparse.Namespace() args.log_level = logging.INFO @@ -34,6 +35,119 @@ def test_rethinkdb_up(): tbls = rr.table_list().run() assert len(tbls) > 10 +def test_basics(): + rr = doublethink.Rethinker(db='ignoreme') + frontier = brozzler.RethinkDbFrontier(rr) + job_conf = {'seeds': [ + {'url': 'http://example.com'}, {'url': 'https://example.org/'}]} + job = brozzler.new_job(frontier, job_conf) + assert job.id + assert job.starts_and_stops + assert job.starts_and_stops[0]['start'] + assert job == { + 'id': job.id, + 'conf': { + 'seeds': [ + {'url': 'http://example.com'}, + {'url': 'https://example.org/'} + ] + }, + 'status': 'ACTIVE', + 'starts_and_stops': [ + { + 'start': job.starts_and_stops[0]['start'], + 'stop': None + } + ] + } + + sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed) + assert len(sites) == 2 + assert sites[0].starts_and_stops[0]['start'] + assert sites[1].starts_and_stops[0]['start'] + assert sites[0] == { + 'claimed': False, + 'enable_warcprox_features': False, + 'id': sites[0].id, + 'job_id': job.id, + 'last_claimed': brozzler.EPOCH_UTC, + 'last_disclaimed': brozzler.EPOCH_UTC, + 'scope': { + 'surt': 'http://(com,example,)/' + }, + 'seed': 'http://example.com', + 'starts_and_stops': [ + { + 'start': sites[0].starts_and_stops[0]['start'], + 'stop': None + } + ], + 'status': 'ACTIVE' + } + assert sites[1] == { + 'claimed': False, + 'enable_warcprox_features': False, + 'id': sites[1].id, + 'job_id': job.id, + 'last_claimed': brozzler.EPOCH_UTC, + 'last_disclaimed': brozzler.EPOCH_UTC, + 'scope': { + 'surt': 'https://(org,example,)/', + }, + 'seed': 'https://example.org/', + 'starts_and_stops': [ + { + 'start': sites[1].starts_and_stops[0]['start'], + 'stop': None, + }, + ], + 'status': 'ACTIVE', + } + + pages = list(frontier.site_pages(sites[0].id)) + assert len(pages) == 1 + assert pages[0] == { + 'brozzle_count': 0, + 'claimed': False, + 'hops_from_seed': 0, + 'hops_off_surt': 0, + 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), + 'job_id': job.id, + 'needs_robots_check': True, + 'priority': 1000, + 'site_id': sites[0].id, + 'url': 'http://example.com', + } + pages = list(frontier.site_pages(sites[1].id)) + assert len(pages) == 1 + assert pages[0] == { + 'brozzle_count': 0, + 'claimed': False, + 'hops_from_seed': 0, + 'hops_off_surt': 0, + 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), + 'job_id': job.id, + 'needs_robots_check': True, + 'priority': 1000, + 'site_id': sites[1].id, + 'url': 'https://example.org/', + } + + # test "brozzled" parameter of frontier.site_pages + assert len(list(frontier.site_pages(sites[1].id))) == 1 + assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0 + assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1 + pages[0].brozzle_count = 1 + pages[0].save() + assert len(list(frontier.site_pages(sites[1].id))) == 1 + assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1 + assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0 + pages[0].brozzle_count = 32819 + pages[0].save() + assert len(list(frontier.site_pages(sites[1].id))) == 1 + assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1 + assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0 + def test_resume_job(): ''' Tests that the right stuff gets twiddled in rethinkdb when we "start" and