mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #122 from nlevitt/new-job-bulk-inserts
improve performance of brozzler-new-job
This commit is contained in:
commit
d2b1843a6d
@ -84,35 +84,49 @@ def new_job(frontier, job_conf):
|
||||
job.save()
|
||||
|
||||
sites = []
|
||||
pages = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
merged_conf.pop("seeds")
|
||||
merged_conf["job_id"] = job.id
|
||||
merged_conf["seed"] = merged_conf.pop("url")
|
||||
site = brozzler.Site(frontier.rr, merged_conf)
|
||||
site.id = str(uuid.uuid4())
|
||||
sites.append(site)
|
||||
pages.append(new_seed_page(frontier, site))
|
||||
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
# insert in batches to avoid this error
|
||||
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
|
||||
for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
|
||||
logging.info('inserting batch of %s pages', len(batch))
|
||||
result = frontier.rr.table('pages').insert(batch).run()
|
||||
for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
|
||||
logging.info('inserting batch of %s sites', len(batch))
|
||||
result = frontier.rr.table('sites').insert(batch).run()
|
||||
logging.info('job %s fully started', job.id)
|
||||
|
||||
return job
|
||||
|
||||
def new_seed_page(frontier, site):
    """Build (but do not save) a brozzler.Page for *site*'s seed URL.

    The seed URL is canonicalized with its fragment removed; if a
    fragment was present, it is kept on ``page.hashtags`` so it is not
    lost to canonicalization.
    """
    seed_url = urlcanon.parse_url(site.seed)
    # capture the "#fragment" text before canonicalization strips it
    fragment = (seed_url.hash_sign + seed_url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(seed_url)
    fields = {
        "url": str(seed_url),
        "site_id": site.get("id"),
        "job_id": site.get("job_id"),
        "hops_from_seed": 0,
        "priority": 1000,
        "needs_robots_check": True,
    }
    page = brozzler.Page(frontier.rr, fields)
    if fragment:
        page.hashtags = [fragment]
    return page
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site %s", site)
|
||||
site.id = site.id or str(uuid.uuid4())
|
||||
# insert the Page into the database before the Site, to avoid situation
|
||||
# where a brozzler worker immediately claims the site, finds no pages
|
||||
# to crawl, and decides the site is finished
|
||||
try:
|
||||
url = urlcanon.parse_url(site.seed)
|
||||
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
|
||||
urlcanon.canon.remove_fragment(url)
|
||||
page = brozzler.Page(frontier.rr, {
|
||||
"url": str(url), "site_id": site.get("id"),
|
||||
"job_id": site.get("job_id"), "hops_from_seed": 0,
|
||||
"priority": 1000, "needs_robots_check": True})
|
||||
if hashtag:
|
||||
page.hashtags = [hashtag,]
|
||||
page = new_seed_page(frontier, site)
|
||||
page.save()
|
||||
logging.info("queued page %s", page)
|
||||
finally:
|
||||
|
Loading…
x
Reference in New Issue
Block a user