honor crawl job stop requests

This commit is contained in:
Noah Levitt 2016-03-08 00:18:54 +00:00
parent d2567f4a13
commit b06381790c
7 changed files with 43 additions and 15 deletions

View file

@ -10,6 +10,9 @@ class ShutdownRequested(Exception):
# Marker exception with no payload of its own.
# NOTE(review): presumably raised when a worker finds no site/page available
# to claim -- the raise and catch sites are not visible in this diff; confirm.
class NothingToClaim(Exception):
pass
# Signals that a stop has been requested for a crawl job.
# Raised by RethinkDbFrontier.honor_stop_request() when the job's
# stop_requested attribute is truthy; BrozzlerWorker._brozzle_site() catches
# it and finishes the site with status "FINISHED_STOP_REQUESTED".
class CrawlJobStopped(Exception):
pass
class ReachedLimit(Exception):
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
if http_error:

View file

@ -115,7 +115,7 @@ class RethinkDbFrontier:
if (site.time_limit and site.time_limit > 0
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
site.time_limit, site.start_time, time.time() - site.start_time, site)
site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
self.finished(site, "FINISHED_TIME_LIMIT")
return True
else:
@ -164,9 +164,18 @@ class RethinkDbFrontier:
else:
return None
# Checks whether a stop has been requested for the job with id `job_id`.
# Returns None (implicitly) when no stop is pending; otherwise logs at info
# level and raises brozzler.CrawlJobStopped for the caller to handle.
def honor_stop_request(self, job_id):
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
# look the job up again on every call so the current stop_requested value
# is the one that gets checked
job = self.job(job_id)
# a falsy result from self.job() (no job found) is treated the same as
# "no stop requested": nothing is raised
if job and job.stop_requested:
self.logger.info("stop requested for job %s", job_id)
raise brozzler.CrawlJobStopped
def _maybe_finish_job(self, job_id):
"""Returns True if job is finished."""
job = self.job(job_id)
if not job:
return False
if job.status.startswith("FINISH"):
self.logger.warn("%s is already %s", job, job.status)
return True
@ -182,12 +191,12 @@ class RethinkDbFrontier:
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
job.status = "FINISHED"
job.finished = rethinkdb.utcnow()
job.finished = rethinkstuff.utcnow()
self.update_job(job)
return True
def finished(self, site, status):
self.logger.info("%s %s", site, status)
self.logger.info("%s %s", status, site)
site.status = status
self.update_site(site)
if site.job_id:
@ -211,7 +220,10 @@ class RethinkDbFrontier:
for url in outlinks:
if site.is_in_scope(url, parent_page):
if brozzler.is_permitted_by_robots(site, url):
new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id)
new_child_page = brozzler.Page(
url, site_id=site.id, job_id=site.job_id,
hops_from_seed=parent_page.hops_from_seed+1,
via_page_id=parent_page.id)
existing_child_page = self.page(new_child_page.id)
if existing_child_page:
existing_child_page.priority += new_child_page.priority

View file

@ -79,10 +79,12 @@ def new_site(frontier, site):
class Job(brozzler.BaseDictable):
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, id=None, conf=None, status="ACTIVE", started=None, finished=None):
def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
finished=None, stop_requested=None):
self.id = id
self.conf = conf
self.status = status
self.started = started
self.finished = finished
self.stop_requested = stop_requested

View file

@ -109,8 +109,10 @@ class BrozzlerWorker:
def brozzle_page(self, browser, ydl, site, page):
def on_screenshot(screenshot_png):
if site.proxy and site.enable_warcprox_features:
self.logger.info("sending WARCPROX_WRITE_RECORD request to warcprox with screenshot for %s", page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(screenshot_png)
self.logger.info("sending WARCPROX_WRITE_RECORD request "
"to warcprox with screenshot for %s", page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png)
self._warcprox_write_record(warcprox_address=site.proxy,
url="screenshot:{}".format(page.url),
warc_type="resource", content_type="image/jpeg",
@ -128,20 +130,23 @@ class BrozzlerWorker:
except brozzler.ReachedLimit as e:
raise
except:
self.logger.error("youtube_dl raised exception on {}".format(page), exc_info=True)
self.logger.error("youtube_dl raised exception on %s",
page, exc_info=True)
if not browser.is_running():
browser.start(proxy=site.proxy)
outlinks = browser.browse_page(page.url,
extra_headers=site.extra_headers, on_screenshot=on_screenshot,
on_url_change=page.note_redirect)
outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers,
on_screenshot=on_screenshot, on_url_change=page.note_redirect)
return outlinks
def _brozzle_site(self, browser, ydl, site):
start = time.time()
page = None
try:
while not self._shutdown_requested.is_set() and time.time() - start < 7 * 60:
while (not self._shutdown_requested.is_set()
and time.time() - start < 7 * 60):
self._frontier.honor_stop_request(site.job_id)
page = self._frontier.claim_page(site, self._id)
outlinks = self.brozzle_page(browser, ydl, site, page)
self._frontier.completed_page(site, page)
@ -151,12 +156,15 @@ class BrozzlerWorker:
self.logger.info("no pages left for site %s", site)
except brozzler.ReachedLimit as e:
self._frontier.reached_limit(site, e)
except brozzler.CrawlJobStopped:
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
except brozzler.browser.BrowsingAborted:
self.logger.info("{} shut down".format(browser))
except:
self.logger.critical("unexpected exception", exc_info=True)
finally:
self.logger.info("finished session brozzling site, stopping browser and disclaiming site")
self.logger.info("finished session brozzling site, stopping "
"browser and disclaiming site")
browser.stop()
self._frontier.disclaim_site(site, page)
self._browser_pool.release(browser)

View file

@ -22,5 +22,5 @@ ADD vnc-websock.sh /etc/service/vnc-websock/run
EXPOSE 5901 8901
EXPOSE 8080
RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git

View file

@ -2,3 +2,6 @@ Chromium seemed to be dying more often when running in a docker container.
To start the services brozzler-worker depends on:
/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
Prerequisites:
apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev

View file

@ -2,7 +2,7 @@ import setuptools
import glob
setuptools.setup(name='brozzler',
version='1.0.1',
version='1.0.2',
description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler',
author='Noah Levitt',