mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-10 16:49:41 -04:00
When adding a new job, insert the seed URL Page document into the database before the Site, to avoid the situation where a brozzler worker claims the site, finds no pages to crawl, and decides the site is finished.
This commit is contained in:
parent
21906f8cad
commit
3260fe4e9e
1 changed file with 16 additions and 7 deletions
|
@ -3,6 +3,7 @@ import brozzler
|
|||
import yaml
|
||||
import json
|
||||
import datetime
|
||||
import uuid
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
|
@ -51,15 +52,23 @@ def new_job(frontier, job_conf):
|
|||
new_site(frontier, site)
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site {}".format(site))
|
||||
frontier.new_site(site)
|
||||
try:
|
||||
# insert the Page into the database before the Site, to avoid situation
|
||||
# where a brozzler worker immediately claims the site, finds no pages
|
||||
# to crawl, and decides the site is finished
|
||||
try:
|
||||
if brozzler.is_permitted_by_robots(site, site.seed):
|
||||
page = brozzler.Page(site.seed, site_id=site.id, job_id=site.job_id, hops_from_seed=0, priority=1000)
|
||||
page = brozzler.Page(site.seed, site_id=site.id,
|
||||
job_id=site.job_id, hops_from_seed=0, priority=1000)
|
||||
frontier.new_page(page)
|
||||
logging.info("queued page %s", page)
|
||||
else:
|
||||
logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||
finally:
|
||||
# finally block because we want to insert the Site no matter what
|
||||
frontier.new_site(site)
|
||||
except brozzler.ReachedLimit as e:
|
||||
frontier.reached_limit(site, e)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue