From ca0053e3be28704008b8c98af4ce4f0027f9a73b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 14 Nov 2015 03:10:58 +0000 Subject: [PATCH] also when adding new job, insert all sites before the job, to prevent brozzler workers thinking the job is finished before all the sites are in the db --- brozzler/frontier.py | 4 +++- brozzler/job.py | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 9a95a53..dd3ceb3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -63,7 +63,9 @@ class RethinkDbFrontier: self.logger.info("inserting into 'sites' table %s", site) result = self.r.table("sites").insert(site.to_dict()).run() self._vet_result(result, inserted=1) - site.id = result["generated_keys"][0] + if not site.id: + # only if "id" has not already been set + site.id = result["generated_keys"][0] def update_job(self, job): self.logger.debug("updating 'jobs' table entry %s", job) diff --git a/brozzler/job.py b/brozzler/job.py index e36f401..276f924 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -25,8 +25,8 @@ def new_job_file(frontier, job_conf_file): new_job(frontier, job_conf) def new_job(frontier, job_conf): - job = Job(id=job_conf.get("id"), conf=job_conf, status="ACTIVE", started=datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) - frontier.new_job(job) + job = Job(id=job_conf.get("id"), conf=job_conf, status="ACTIVE", + started=datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")) sites = [] for seed_conf in job_conf["seeds"]: @@ -48,9 +48,12 @@ def new_job(frontier, job_conf): extra_headers=extra_headers) sites.append(site) + # insert all the sites into database before the job for site in sites: new_site(frontier, site) + frontier.new_job(job) + def new_site(frontier, site): site.id = str(uuid.uuid4()) logging.info("new site {}".format(site))