mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Also, when adding a new job, insert all of its sites before inserting the job itself, to prevent brozzler workers from thinking the job is finished before all of its sites are in the db.
This commit is contained in:
parent
3260fe4e9e
commit
ca0053e3be
@ -63,7 +63,9 @@ class RethinkDbFrontier:
|
||||
self.logger.info("inserting into 'sites' table %s", site)
|
||||
result = self.r.table("sites").insert(site.to_dict()).run()
|
||||
self._vet_result(result, inserted=1)
|
||||
site.id = result["generated_keys"][0]
|
||||
if not site.id:
|
||||
# only if "id" has not already been set
|
||||
site.id = result["generated_keys"][0]
|
||||
|
||||
def update_job(self, job):
|
||||
self.logger.debug("updating 'jobs' table entry %s", job)
|
||||
|
@ -25,8 +25,8 @@ def new_job_file(frontier, job_conf_file):
|
||||
new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
job = Job(id=job_conf.get("id"), conf=job_conf, status="ACTIVE", started=datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
|
||||
frontier.new_job(job)
|
||||
job = Job(id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||
started=datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"))
|
||||
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
@ -48,9 +48,12 @@ def new_job(frontier, job_conf):
|
||||
extra_headers=extra_headers)
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
|
||||
frontier.new_job(job)
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site {}".format(site))
|
||||
|
Loading…
x
Reference in New Issue
Block a user