From 01653c01d76e6fa9d486bc7d7952d09f84a9b94d Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 7 Mar 2017 13:19:56 -0800 Subject: [PATCH] use updated doublethink library populate_defaults() to avoid problem where under certain circumstances field values from the database would be overwritten by defaults --- brozzler/job.py | 29 +++++----------- brozzler/site.py | 69 +++++++++++++++++++++----------------- setup.py | 4 +-- tests/test_frontier.py | 75 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 54 deletions(-) diff --git a/brozzler/job.py b/brozzler/job.py index 16adb8e..45ec31e 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -71,8 +71,8 @@ def new_job(frontier, job_conf): '''Returns new Job.''' validate_conf(job_conf) job = Job(frontier.rr, { - "conf": job_conf, - "status": "ACTIVE", "started": doublethink.utcnow()}) + "conf": job_conf, "status": "ACTIVE", + "started": doublethink.utcnow()}) if "id" in job_conf: job.id = job_conf["id"] job.save() @@ -80,22 +80,9 @@ def new_job(frontier, job_conf): sites = [] for seed_conf in job_conf["seeds"]: merged_conf = merge(seed_conf, job_conf) - site = brozzler.Site(frontier.rr, { - "job_id": job.id, - "seed": merged_conf["url"], - "scope": merged_conf.get("scope"), - "time_limit": merged_conf.get("time_limit"), - "proxy": merged_conf.get("proxy"), - "ignore_robots": merged_conf.get("ignore_robots"), - "enable_warcprox_features": merged_conf.get( - "enable_warcprox_features"), - "warcprox_meta": merged_conf.get("warcprox_meta"), - "metadata": merged_conf.get("metadata"), - "remember_outlinks": merged_conf.get("remember_outlinks"), - "user_agent": merged_conf.get("user_agent"), - "behavior_parameters": merged_conf.get("behavior_parameters"), - "username": merged_conf.get("username"), - "password": merged_conf.get("password")}) + merged_conf["job_id"] = job.id + merged_conf["seed"] = merged_conf.pop("url") + site = brozzler.Site(frontier.rr, merged_conf) sites.append(site) for site in sites: @@ -127,9 +114,9 @@ class Job(doublethink.Document): logger = logging.getLogger(__module__ + "." + __qualname__) table = "jobs" - def __init__(self, rr, d={}): - doublethink.Document.__init__(self, rr, d) - self.status = self.get("status", "ACTIVE") + def populate_defaults(self): + if not "status" in self: + self.status = "ACTIVE" if not "starts_and_stops" in self: if self.get("started"): # backward compatibility self.starts_and_stops = [{ diff --git a/brozzler/site.py b/brozzler/site.py index 5406a70..d8bfbc7 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -92,28 +92,32 @@ class Site(doublethink.Document): logger = logging.getLogger(__module__ + "." + __qualname__) table = 'sites' - def __init__(self, rr, d={}): - doublethink.Document.__init__(self, rr, d) - if not self.get('status'): - self.status = 'ACTIVE' - self.enable_warcprox_features = bool(self.get('enable_warcprox_features')) - self.claimed = bool(self.get('claimed')) - self.last_disclaimed = self.get('last_disclaimed', _EPOCH_UTC) - self.last_claimed = self.get('last_claimed', _EPOCH_UTC) - if not self.get('starts_and_stops'): - if self.get('start_time'): # backward compatibility + def populate_defaults(self): + if not "status" in self: + self.status = "ACTIVE" + if not "enable_warcprox_features" in self: + self.enable_warcprox_features = False + if not "claimed" in self: + self.claimed = False + if not "last_disclaimed" in self: + self.last_disclaimed = _EPOCH_UTC + if not "last_claimed" in self: + self.last_claimed = _EPOCH_UTC + if not "scope" in self: + self.scope = {} + if not "surt" in self.scope: + self.scope["surt"] = Url(self.seed).surt + + if not "starts_and_stops" in self: + if self.get("start_time"): # backward compatibility self.starts_and_stops = [{ - 'start':self.get('start_time'),'stop':None}] - if self.get('status') != 'ACTIVE': - self.starts_and_stops[0]['stop'] = self.last_disclaimed - del self['start_time'] + "start":self.get("start_time"),"stop":None}] + if self.get("status") != "ACTIVE": + self.starts_and_stops[0]["stop"] = self.last_disclaimed + del self["start_time"] else: self.starts_and_stops = [ - {'start':doublethink.utcnow(),'stop':None}] - if not self.scope: - self.scope = {} - if not 'surt' in self.scope: - self.scope['surt'] = Url(self.seed).surt + {"start":doublethink.utcnow(),"stop":None}] def __str__(self): return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed) @@ -286,19 +290,22 @@ class Site(doublethink.Document): class Page(doublethink.Document): logger = logging.getLogger(__module__ + "." + __qualname__) - table = 'pages' + table = "pages" - def __init__(self, rr, d={}): - doublethink.Document.__init__(self, rr, d) - self.hops_from_seed = self.get('hops_from_seed', 0) - self.brozzle_count = self.get('brozzle_count', 0) - self.claimed = self.get('claimed', False) - self.hops_off_surt = self.get('hops_off_surt', 0) - self.needs_robots_check = self.get('needs_robots_check', False) - self._canon_hurl = None - - self.priority = self.get('priority', self._calc_priority()) - if self.get('id') is None: + def populate_defaults(self): + if not "hops_from_seed" in self: + self.hops_from_seed = 0 + if not "brozzle_count" in self: + self.brozzle_count = 0 + if not "claimed" in self: + self.claimed = False + if not "hops_off_surt" in self: + self.hops_off_surt = 0 + if not "needs_robots_check" in self: + self.needs_robots_check = False + if not "priority" in self: + self.priority = self._calc_priority() + if not "id" in self: digest_this = "site_id:%s,url:%s" % (self.site_id, self.url) self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest() diff --git a/setup.py b/setup.py index 71408b6..6a1887e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev199', + version='1.1b9.dev200', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -69,7 +69,7 @@ setuptools.setup( 'websocket-client!=0.39.0', 'pillow==3.3.0', 'surt>=0.3.0', - 'doublethink>=0.2.0.dev66', + 'doublethink>=0.2.0.dev70', 'rethinkdb>=2.3,<2.4', 'cerberus==1.0.1', 'jinja2', diff --git a/tests/test_frontier.py b/tests/test_frontier.py index cab85c0..23ea4f5 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -172,3 +172,78 @@ def test_time_limit(): assert site.starts_and_stops[1]['start'] assert site.starts_and_stops[1]['stop'] assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start'] + +def test_field_defaults(): + rr = doublethink.Rethinker('localhost', db='ignoreme') + + # page + brozzler.Page.table_ensure(rr) + page = brozzler.Page(rr, {'hops_from_seed': 3}) + assert page.hops_from_seed == 3 + assert page.id is None + assert page.brozzle_count is None + page.save() + assert page.hops_from_seed == 3 + assert page.id + assert page.brozzle_count == 0 + + qage = brozzler.Page.load(rr, page.id) + assert qage.hops_from_seed == 3 + assert qage.id == page.id + assert qage.brozzle_count == 0 + qage.save() + assert qage.hops_from_seed == 3 + assert qage.id == page.id + assert qage.brozzle_count == 0 + qage.refresh() + assert qage.hops_from_seed == 3 + assert qage.id == page.id + assert qage.brozzle_count == 0 + + # site + brozzler.Site.table_ensure(rr) + site = brozzler.Site(rr, {'enable_warcprox_features': True}) + assert site.enable_warcprox_features is True + assert site.id is None + assert site.scope is None + site.save() + assert site.id + assert site.scope + + tite = brozzler.Site.load(rr, site.id) + assert tite.enable_warcprox_features is True + assert tite.id == site.id + assert tite.scope == site.scope + tite.save() + assert tite.enable_warcprox_features is True + assert tite.id == site.id + assert tite.scope == site.scope + tite.refresh() + assert tite.enable_warcprox_features is True + assert tite.id == site.id + assert tite.scope == site.scope + + # job + brozzler.Job.table_ensure(rr) + job = brozzler.Job(rr, {'status': 'WHUUUT'}) + assert job.status == 'WHUUUT' + assert job.id is None + assert job.starts_and_stops is None + job.save() + assert job.status == 'WHUUUT' + assert job.id + assert job.starts_and_stops + + kob = brozzler.Job.load(rr, job.id) + assert kob.status == 'WHUUUT' + assert kob.id + assert kob.starts_and_stops + kob.save() + assert kob.status == 'WHUUUT' + assert kob.id + assert kob.starts_and_stops + kob.refresh() + assert kob.status == 'WHUUUT' + assert kob.id + assert kob.starts_and_stops +