Merge branch 'master' into qa

* master:
  bump version after merge
  fix another oversight
  ugh. oops
  Revert "add a github PR template for this repo"
  improve performance of brozzler-new-job
This commit is contained in:
Noah Levitt 2018-09-28 15:27:51 -07:00
commit a75632bd95
3 changed files with 27 additions and 26 deletions

View File

@ -1,13 +0,0 @@
## Motivation
<!-- How does this code change improve the world? -->
<!-- Could be a reference to an issue/ticket tracker (like JIRA) if both author and reviewer have access permissions -->
## Description
<!-- What exactly does this do? Could be the git commit message -->
## Testing and Deployment Plan
<!-- Are there automated tests? How can a reviewer verify the change, if applicable? -->
<!-- Any issues foreseen deploying this to production? -->
<!-- Don't forget to cc: any person or group who should be aware of this PR, and to assign a reviewer -->

View File

@ -84,35 +84,49 @@ def new_job(frontier, job_conf):
job.save()
sites = []
pages = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
merged_conf.pop("seeds")
merged_conf["job_id"] = job.id
merged_conf["seed"] = merged_conf.pop("url")
site = brozzler.Site(frontier.rr, merged_conf)
site.id = str(uuid.uuid4())
sites.append(site)
pages.append(new_seed_page(frontier, site))
for site in sites:
new_site(frontier, site)
# insert in batches to avoid this error
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
logging.info('inserting batch of %s pages', len(batch))
result = frontier.rr.table('pages').insert(batch).run()
for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
logging.info('inserting batch of %s sites', len(batch))
result = frontier.rr.table('sites').insert(batch).run()
logging.info('job %s fully started', job.id)
return job
def new_seed_page(frontier, site):
    """Build the seed page for `site` without saving it.

    The site's seed url is parsed and its fragment (hash sign included) is
    stripped from the page url; a non-empty fragment is kept on the page as
    its only hashtag.

    Args:
        frontier: object whose `rr` attribute is handed to `brozzler.Page`
        site: site providing `seed`, and `id` / `job_id` via `get()`

    Returns:
        the newly constructed `brozzler.Page`; this function does not call
        `save()` on it
    """
    seed_url = urlcanon.parse_url(site.seed)
    # grab the fragment before remove_fragment() strips it from the url
    fragment = (seed_url.hash_sign + seed_url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(seed_url)
    seed_page = brozzler.Page(
        frontier.rr,
        {
            "url": str(seed_url),
            "site_id": site.get("id"),
            "job_id": site.get("job_id"),
            "hops_from_seed": 0,
            "priority": 1000,
            "needs_robots_check": True,
        },
    )
    if fragment:
        seed_page.hashtags = [fragment]
    return seed_page
def new_site(frontier, site):
site.id = str(uuid.uuid4())
logging.info("new site %s", site)
site.id = site.id or str(uuid.uuid4())
# insert the Page into the database before the Site, to avoid situation
# where a brozzler worker immediately claims the site, finds no pages
# to crawl, and decides the site is finished
try:
url = urlcanon.parse_url(site.seed)
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
urlcanon.canon.remove_fragment(url)
page = brozzler.Page(frontier.rr, {
"url": str(url), "site_id": site.get("id"),
"job_id": site.get("job_id"), "hops_from_seed": 0,
"priority": 1000, "needs_robots_check": True})
if hashtag:
page.hashtags = [hashtag,]
page = new_seed_page(frontier, site)
page.save()
logging.info("queued page %s", page)
finally:

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.dev305',
version='1.5.dev306',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',