mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-07 22:12:15 -04:00
honor crawl job stop requests
This commit is contained in:
parent
d2567f4a13
commit
b06381790c
7 changed files with 43 additions and 15 deletions
|
@ -10,6 +10,9 @@ class ShutdownRequested(Exception):
|
||||||
class NothingToClaim(Exception):
|
class NothingToClaim(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class CrawlJobStopped(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
class ReachedLimit(Exception):
|
class ReachedLimit(Exception):
|
||||||
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
||||||
if http_error:
|
if http_error:
|
||||||
|
|
|
@ -115,7 +115,7 @@ class RethinkDbFrontier:
|
||||||
if (site.time_limit and site.time_limit > 0
|
if (site.time_limit and site.time_limit > 0
|
||||||
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
|
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
|
||||||
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
|
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
|
||||||
site.time_limit, site.start_time, time.time() - site.start_time, site)
|
site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
|
||||||
self.finished(site, "FINISHED_TIME_LIMIT")
|
self.finished(site, "FINISHED_TIME_LIMIT")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
@ -164,9 +164,18 @@ class RethinkDbFrontier:
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def honor_stop_request(self, job_id):
|
||||||
|
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
||||||
|
job = self.job(job_id)
|
||||||
|
if job and job.stop_requested:
|
||||||
|
self.logger.info("stop requested for job %s", job_id)
|
||||||
|
raise brozzler.CrawlJobStopped
|
||||||
|
|
||||||
def _maybe_finish_job(self, job_id):
|
def _maybe_finish_job(self, job_id):
|
||||||
"""Returns True if job is finished."""
|
"""Returns True if job is finished."""
|
||||||
job = self.job(job_id)
|
job = self.job(job_id)
|
||||||
|
if not job:
|
||||||
|
return False
|
||||||
if job.status.startswith("FINISH"):
|
if job.status.startswith("FINISH"):
|
||||||
self.logger.warn("%s is already %s", job, job.status)
|
self.logger.warn("%s is already %s", job, job.status)
|
||||||
return True
|
return True
|
||||||
|
@ -182,12 +191,12 @@ class RethinkDbFrontier:
|
||||||
|
|
||||||
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
|
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
|
||||||
job.status = "FINISHED"
|
job.status = "FINISHED"
|
||||||
job.finished = rethinkdb.utcnow()
|
job.finished = rethinkstuff.utcnow()
|
||||||
self.update_job(job)
|
self.update_job(job)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def finished(self, site, status):
|
def finished(self, site, status):
|
||||||
self.logger.info("%s %s", site, status)
|
self.logger.info("%s %s", status, site)
|
||||||
site.status = status
|
site.status = status
|
||||||
self.update_site(site)
|
self.update_site(site)
|
||||||
if site.job_id:
|
if site.job_id:
|
||||||
|
@ -211,7 +220,10 @@ class RethinkDbFrontier:
|
||||||
for url in outlinks:
|
for url in outlinks:
|
||||||
if site.is_in_scope(url, parent_page):
|
if site.is_in_scope(url, parent_page):
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id)
|
new_child_page = brozzler.Page(
|
||||||
|
url, site_id=site.id, job_id=site.job_id,
|
||||||
|
hops_from_seed=parent_page.hops_from_seed+1,
|
||||||
|
via_page_id=parent_page.id)
|
||||||
existing_child_page = self.page(new_child_page.id)
|
existing_child_page = self.page(new_child_page.id)
|
||||||
if existing_child_page:
|
if existing_child_page:
|
||||||
existing_child_page.priority += new_child_page.priority
|
existing_child_page.priority += new_child_page.priority
|
||||||
|
|
|
@ -79,10 +79,12 @@ def new_site(frontier, site):
|
||||||
class Job(brozzler.BaseDictable):
|
class Job(brozzler.BaseDictable):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, id=None, conf=None, status="ACTIVE", started=None, finished=None):
|
def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
|
||||||
|
finished=None, stop_requested=None):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.conf = conf
|
self.conf = conf
|
||||||
self.status = status
|
self.status = status
|
||||||
self.started = started
|
self.started = started
|
||||||
self.finished = finished
|
self.finished = finished
|
||||||
|
self.stop_requested = stop_requested
|
||||||
|
|
||||||
|
|
|
@ -109,8 +109,10 @@ class BrozzlerWorker:
|
||||||
def brozzle_page(self, browser, ydl, site, page):
|
def brozzle_page(self, browser, ydl, site, page):
|
||||||
def on_screenshot(screenshot_png):
|
def on_screenshot(screenshot_png):
|
||||||
if site.proxy and site.enable_warcprox_features:
|
if site.proxy and site.enable_warcprox_features:
|
||||||
self.logger.info("sending WARCPROX_WRITE_RECORD request to warcprox with screenshot for %s", page)
|
self.logger.info("sending WARCPROX_WRITE_RECORD request "
|
||||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(screenshot_png)
|
"to warcprox with screenshot for %s", page)
|
||||||
|
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||||
|
screenshot_png)
|
||||||
self._warcprox_write_record(warcprox_address=site.proxy,
|
self._warcprox_write_record(warcprox_address=site.proxy,
|
||||||
url="screenshot:{}".format(page.url),
|
url="screenshot:{}".format(page.url),
|
||||||
warc_type="resource", content_type="image/jpeg",
|
warc_type="resource", content_type="image/jpeg",
|
||||||
|
@ -128,20 +130,23 @@ class BrozzlerWorker:
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
raise
|
raise
|
||||||
except:
|
except:
|
||||||
self.logger.error("youtube_dl raised exception on {}".format(page), exc_info=True)
|
self.logger.error("youtube_dl raised exception on %s",
|
||||||
|
page, exc_info=True)
|
||||||
|
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
browser.start(proxy=site.proxy)
|
browser.start(proxy=site.proxy)
|
||||||
outlinks = browser.browse_page(page.url,
|
outlinks = browser.browse_page(
|
||||||
extra_headers=site.extra_headers, on_screenshot=on_screenshot,
|
page.url, extra_headers=site.extra_headers,
|
||||||
on_url_change=page.note_redirect)
|
on_screenshot=on_screenshot, on_url_change=page.note_redirect)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _brozzle_site(self, browser, ydl, site):
|
def _brozzle_site(self, browser, ydl, site):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
page = None
|
page = None
|
||||||
try:
|
try:
|
||||||
while not self._shutdown_requested.is_set() and time.time() - start < 7 * 60:
|
while (not self._shutdown_requested.is_set()
|
||||||
|
and time.time() - start < 7 * 60):
|
||||||
|
self._frontier.honor_stop_request(site.job_id)
|
||||||
page = self._frontier.claim_page(site, self._id)
|
page = self._frontier.claim_page(site, self._id)
|
||||||
outlinks = self.brozzle_page(browser, ydl, site, page)
|
outlinks = self.brozzle_page(browser, ydl, site, page)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
|
@ -151,12 +156,15 @@ class BrozzlerWorker:
|
||||||
self.logger.info("no pages left for site %s", site)
|
self.logger.info("no pages left for site %s", site)
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
self._frontier.reached_limit(site, e)
|
self._frontier.reached_limit(site, e)
|
||||||
|
except brozzler.CrawlJobStopped:
|
||||||
|
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
|
||||||
except brozzler.browser.BrowsingAborted:
|
except brozzler.browser.BrowsingAborted:
|
||||||
self.logger.info("{} shut down".format(browser))
|
self.logger.info("{} shut down".format(browser))
|
||||||
except:
|
except:
|
||||||
self.logger.critical("unexpected exception", exc_info=True)
|
self.logger.critical("unexpected exception", exc_info=True)
|
||||||
finally:
|
finally:
|
||||||
self.logger.info("finished session brozzling site, stopping browser and disclaiming site")
|
self.logger.info("finished session brozzling site, stopping "
|
||||||
|
"browser and disclaiming site")
|
||||||
browser.stop()
|
browser.stop()
|
||||||
self._frontier.disclaim_site(site, page)
|
self._frontier.disclaim_site(site, page)
|
||||||
self._browser_pool.release(browser)
|
self._browser_pool.release(browser)
|
||||||
|
|
|
@ -22,5 +22,5 @@ ADD vnc-websock.sh /etc/service/vnc-websock/run
|
||||||
EXPOSE 5901 8901
|
EXPOSE 5901 8901
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|
||||||
RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
|
# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
|
||||||
|
|
||||||
|
|
|
@ -2,3 +2,6 @@ Chromium seemed to be dying more often when running in a docker container.
|
||||||
|
|
||||||
To start the services brozzler-worker depends on:
|
To start the services brozzler-worker depends on:
|
||||||
/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
|
/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -2,7 +2,7 @@ import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.0.1',
|
version='1.0.2',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue