mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: bump version after merge fix another oversight ugh. oops Revert "add a github PR template for this repo" improve performance of brozzler-new-job
This commit is contained in:
commit
a75632bd95
13
.github/PULL_REQUEST_TEMPLATE.md
vendored
13
.github/PULL_REQUEST_TEMPLATE.md
vendored
@ -1,13 +0,0 @@
|
||||
## Motivation
|
||||
<!-- How does this code change improve the world? -->
|
||||
<!-- Could be a reference to an issue/ticket tracker (like JIRA) if both author and reviewer have access permissions -->
|
||||
|
||||
## Description
|
||||
<!-- What exactly does this do? Could be the git commit message -->
|
||||
|
||||
## Testing and Deployment Plan
|
||||
<!-- Are there automated tests? How can a reviewer verify the change, if applicable? -->
|
||||
<!-- Any issues foreseen deploying this to production? -->
|
||||
|
||||
|
||||
<!-- Don't forget to cc: any person or group who should be aware of this PR, and to assign a reviewer -->
|
@ -84,35 +84,49 @@ def new_job(frontier, job_conf):
|
||||
job.save()
|
||||
|
||||
sites = []
|
||||
pages = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
merged_conf.pop("seeds")
|
||||
merged_conf["job_id"] = job.id
|
||||
merged_conf["seed"] = merged_conf.pop("url")
|
||||
site = brozzler.Site(frontier.rr, merged_conf)
|
||||
site.id = str(uuid.uuid4())
|
||||
sites.append(site)
|
||||
pages.append(new_seed_page(frontier, site))
|
||||
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
# insert in batches to avoid this error
|
||||
# rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
|
||||
for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
|
||||
logging.info('inserting batch of %s pages', len(batch))
|
||||
result = frontier.rr.table('pages').insert(batch).run()
|
||||
for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
|
||||
logging.info('inserting batch of %s sites', len(batch))
|
||||
result = frontier.rr.table('sites').insert(batch).run()
|
||||
logging.info('job %s fully started', job.id)
|
||||
|
||||
return job
|
||||
|
||||
def new_seed_page(frontier, site):
    """Build the seed page for ``site`` without saving it.

    The site's seed URL is parsed and any ``#fragment`` is split off: the
    page's ``url`` is the seed with the fragment removed, and the fragment
    (including the hash sign) is recorded as the page's only hashtag.

    Args:
        frontier: object whose ``rr`` attribute is handed to
            ``brozzler.Page`` as its first argument.
        site: site object providing ``seed`` plus ``get("id")`` and
            ``get("job_id")``.

    Returns:
        A new ``brozzler.Page`` with ``hops_from_seed`` 0, ``priority``
        1000 and ``needs_robots_check`` True. This function does not call
        ``save()`` on it; persisting the page is left to the caller.
    """
    parsed = urlcanon.parse_url(site.seed)
    # Capture the fragment before it is stripped from the parsed url.
    fragment = (parsed.hash_sign + parsed.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(parsed)

    attrs = {
        "url": str(parsed),
        "site_id": site.get("id"),
        "job_id": site.get("job_id"),
        "hops_from_seed": 0,
        "priority": 1000,
        "needs_robots_check": True,
    }
    seed_page = brozzler.Page(frontier.rr, attrs)

    # Only set hashtags when the seed actually carried a fragment.
    if fragment:
        seed_page.hashtags = [fragment]
    return seed_page
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site %s", site)
|
||||
site.id = site.id or str(uuid.uuid4())
|
||||
# insert the Page into the database before the Site, to avoid situation
|
||||
# where a brozzler worker immediately claims the site, finds no pages
|
||||
# to crawl, and decides the site is finished
|
||||
try:
|
||||
url = urlcanon.parse_url(site.seed)
|
||||
hashtag = (url.hash_sign + url.fragment).decode("utf-8")
|
||||
urlcanon.canon.remove_fragment(url)
|
||||
page = brozzler.Page(frontier.rr, {
|
||||
"url": str(url), "site_id": site.get("id"),
|
||||
"job_id": site.get("job_id"), "hops_from_seed": 0,
|
||||
"priority": 1000, "needs_robots_check": True})
|
||||
if hashtag:
|
||||
page.hashtags = [hashtag,]
|
||||
page = new_seed_page(frontier, site)
|
||||
page.save()
|
||||
logging.info("queued page %s", page)
|
||||
finally:
|
||||
|
Loading…
x
Reference in New Issue
Block a user