mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-07 14:02:24 -04:00
honor crawl job stop requests
This commit is contained in:
parent
d2567f4a13
commit
b06381790c
7 changed files with 43 additions and 15 deletions
|
@ -10,6 +10,9 @@ class ShutdownRequested(Exception):
|
|||
class NothingToClaim(Exception):
|
||||
pass
|
||||
|
||||
class CrawlJobStopped(Exception):
|
||||
pass
|
||||
|
||||
class ReachedLimit(Exception):
|
||||
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
||||
if http_error:
|
||||
|
|
|
@ -115,7 +115,7 @@ class RethinkDbFrontier:
|
|||
if (site.time_limit and site.time_limit > 0
|
||||
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
|
||||
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
|
||||
site.time_limit, site.start_time, time.time() - site.start_time, site)
|
||||
site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
|
||||
self.finished(site, "FINISHED_TIME_LIMIT")
|
||||
return True
|
||||
else:
|
||||
|
@ -164,9 +164,18 @@ class RethinkDbFrontier:
|
|||
else:
|
||||
return None
|
||||
|
||||
def honor_stop_request(self, job_id):
|
||||
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
||||
job = self.job(job_id)
|
||||
if job and job.stop_requested:
|
||||
self.logger.info("stop requested for job %s", job_id)
|
||||
raise brozzler.CrawlJobStopped
|
||||
|
||||
def _maybe_finish_job(self, job_id):
|
||||
"""Returns True if job is finished."""
|
||||
job = self.job(job_id)
|
||||
if not job:
|
||||
return False
|
||||
if job.status.startswith("FINISH"):
|
||||
self.logger.warn("%s is already %s", job, job.status)
|
||||
return True
|
||||
|
@ -182,12 +191,12 @@ class RethinkDbFrontier:
|
|||
|
||||
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
|
||||
job.status = "FINISHED"
|
||||
job.finished = rethinkdb.utcnow()
|
||||
job.finished = rethinkstuff.utcnow()
|
||||
self.update_job(job)
|
||||
return True
|
||||
|
||||
def finished(self, site, status):
|
||||
self.logger.info("%s %s", site, status)
|
||||
self.logger.info("%s %s", status, site)
|
||||
site.status = status
|
||||
self.update_site(site)
|
||||
if site.job_id:
|
||||
|
@ -211,7 +220,10 @@ class RethinkDbFrontier:
|
|||
for url in outlinks:
|
||||
if site.is_in_scope(url, parent_page):
|
||||
if brozzler.is_permitted_by_robots(site, url):
|
||||
new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id)
|
||||
new_child_page = brozzler.Page(
|
||||
url, site_id=site.id, job_id=site.job_id,
|
||||
hops_from_seed=parent_page.hops_from_seed+1,
|
||||
via_page_id=parent_page.id)
|
||||
existing_child_page = self.page(new_child_page.id)
|
||||
if existing_child_page:
|
||||
existing_child_page.priority += new_child_page.priority
|
||||
|
|
|
@ -79,10 +79,12 @@ def new_site(frontier, site):
|
|||
class Job(brozzler.BaseDictable):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, id=None, conf=None, status="ACTIVE", started=None, finished=None):
|
||||
def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
|
||||
finished=None, stop_requested=None):
|
||||
self.id = id
|
||||
self.conf = conf
|
||||
self.status = status
|
||||
self.started = started
|
||||
self.finished = finished
|
||||
self.stop_requested = stop_requested
|
||||
|
||||
|
|
|
@ -109,8 +109,10 @@ class BrozzlerWorker:
|
|||
def brozzle_page(self, browser, ydl, site, page):
|
||||
def on_screenshot(screenshot_png):
|
||||
if site.proxy and site.enable_warcprox_features:
|
||||
self.logger.info("sending WARCPROX_WRITE_RECORD request to warcprox with screenshot for %s", page)
|
||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(screenshot_png)
|
||||
self.logger.info("sending WARCPROX_WRITE_RECORD request "
|
||||
"to warcprox with screenshot for %s", page)
|
||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||
screenshot_png)
|
||||
self._warcprox_write_record(warcprox_address=site.proxy,
|
||||
url="screenshot:{}".format(page.url),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
|
@ -128,20 +130,23 @@ class BrozzlerWorker:
|
|||
except brozzler.ReachedLimit as e:
|
||||
raise
|
||||
except:
|
||||
self.logger.error("youtube_dl raised exception on {}".format(page), exc_info=True)
|
||||
self.logger.error("youtube_dl raised exception on %s",
|
||||
page, exc_info=True)
|
||||
|
||||
if not browser.is_running():
|
||||
browser.start(proxy=site.proxy)
|
||||
outlinks = browser.browse_page(page.url,
|
||||
extra_headers=site.extra_headers, on_screenshot=on_screenshot,
|
||||
on_url_change=page.note_redirect)
|
||||
outlinks = browser.browse_page(
|
||||
page.url, extra_headers=site.extra_headers,
|
||||
on_screenshot=on_screenshot, on_url_change=page.note_redirect)
|
||||
return outlinks
|
||||
|
||||
def _brozzle_site(self, browser, ydl, site):
|
||||
start = time.time()
|
||||
page = None
|
||||
try:
|
||||
while not self._shutdown_requested.is_set() and time.time() - start < 7 * 60:
|
||||
while (not self._shutdown_requested.is_set()
|
||||
and time.time() - start < 7 * 60):
|
||||
self._frontier.honor_stop_request(site.job_id)
|
||||
page = self._frontier.claim_page(site, self._id)
|
||||
outlinks = self.brozzle_page(browser, ydl, site, page)
|
||||
self._frontier.completed_page(site, page)
|
||||
|
@ -151,12 +156,15 @@ class BrozzlerWorker:
|
|||
self.logger.info("no pages left for site %s", site)
|
||||
except brozzler.ReachedLimit as e:
|
||||
self._frontier.reached_limit(site, e)
|
||||
except brozzler.CrawlJobStopped:
|
||||
self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
|
||||
except brozzler.browser.BrowsingAborted:
|
||||
self.logger.info("{} shut down".format(browser))
|
||||
except:
|
||||
self.logger.critical("unexpected exception", exc_info=True)
|
||||
finally:
|
||||
self.logger.info("finished session brozzling site, stopping browser and disclaiming site")
|
||||
self.logger.info("finished session brozzling site, stopping "
|
||||
"browser and disclaiming site")
|
||||
browser.stop()
|
||||
self._frontier.disclaim_site(site, page)
|
||||
self._browser_pool.release(browser)
|
||||
|
|
|
@ -22,5 +22,5 @@ ADD vnc-websock.sh /etc/service/vnc-websock/run
|
|||
EXPOSE 5901 8901
|
||||
EXPOSE 8080
|
||||
|
||||
RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
|
||||
# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
|
||||
|
||||
|
|
|
@ -2,3 +2,6 @@ Chromium seemed to be dying more often when running in a docker container.
|
|||
|
||||
To start the services brozzler-worker depends on:
|
||||
/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
|
||||
|
||||
Prerequisites:
|
||||
apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev
|
||||
|
|
2
setup.py
2
setup.py
|
@ -2,7 +2,7 @@ import setuptools
|
|||
import glob
|
||||
|
||||
setuptools.setup(name='brozzler',
|
||||
version='1.0.1',
|
||||
version='1.0.2',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/nlevitt/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue