When adding a new job, insert the seed URL Page document into the database before the Site, to avoid the situation where a brozzler worker claims the site, finds no pages to crawl, and decides the site is finished.

This commit is contained in:
Noah Levitt 2015-11-13 23:47:51 +00:00
parent 21906f8cad
commit 3260fe4e9e

View File

@ -3,6 +3,7 @@ import brozzler
import yaml
import json
import datetime
import uuid
def merge(a, b):
if isinstance(a, dict) and isinstance(b, dict):
@ -51,15 +52,23 @@ def new_job(frontier, job_conf):
new_site(frontier, site)
def new_site(frontier, site):
    """Give `site` a fresh id, queue its seed page, then insert the site.

    The seed Page is handed to `frontier.new_page` before the Site is handed
    to `frontier.new_site`. Inserting the Site first leaves a window in which
    a brozzler worker can claim the site, find no pages to crawl, and decide
    the site is finished.

    Args:
        frontier: object providing `new_page`, `new_site` and `reached_limit`.
        site: site to register; its `id` attribute is overwritten with a new
            UUID4 string, and its `seed` and `job_id` attributes are read.

    `brozzler.ReachedLimit` raised while queueing the seed is passed to
    `frontier.reached_limit(site, e)` rather than propagated. Any other
    exception propagates, but only after the Site has been inserted.
    """
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    try:
        # insert the Page into the database before the Site, to avoid the
        # situation where a brozzler worker immediately claims the site,
        # finds no pages to crawl, and decides the site is finished
        try:
            if brozzler.is_permitted_by_robots(site, site.seed):
                page = brozzler.Page(
                    site.seed, site_id=site.id, job_id=site.job_id,
                    hops_from_seed=0, priority=1000)
                frontier.new_page(page)
                logging.info("queued page %s", page)
            else:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(
                    "seed url {} is blocked by robots.txt".format(site.seed))
        finally:
            # finally block because we want to insert the Site no matter
            # what; this is the only place the Site is inserted
            frontier.new_site(site)
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)