diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index d79328f..2990f1c 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -10,6 +10,9 @@ class ShutdownRequested(Exception):
 class NothingToClaim(Exception):
     pass
 
+class CrawlJobStopped(Exception):
+    pass
+
 class ReachedLimit(Exception):
     def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         if http_error:
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 3b3c8f9..3608d06 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -115,7 +115,7 @@ class RethinkDbFrontier:
         if (site.time_limit and site.time_limit > 0
                 and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
             self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
-                    site.time_limit, site.start_time, time.time() - site.start_time, site)
+                    site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
             self.finished(site, "FINISHED_TIME_LIMIT")
             return True
         else:
@@ -164,9 +164,18 @@ class RethinkDbFrontier:
         else:
             return None
 
+    def honor_stop_request(self, job_id):
+        """Raises brozzler.CrawlJobStopped if stop has been requested."""
+        job = self.job(job_id)
+        if job and job.stop_requested:
+            self.logger.info("stop requested for job %s", job_id)
+            raise brozzler.CrawlJobStopped
+
     def _maybe_finish_job(self, job_id):
         """Returns True if job is finished."""
         job = self.job(job_id)
+        if not job:
+            return False
         if job.status.startswith("FINISH"):
             self.logger.warn("%s is already %s", job, job.status)
             return True
@@ -182,12 +191,12 @@ class RethinkDbFrontier:
         self.logger.info("all %s sites finished, job %s is FINISHED!", n,
                 job.id)
         job.status = "FINISHED"
-        job.finished = rethinkdb.utcnow()
+        job.finished = rethinkstuff.utcnow()
         self.update_job(job)
         return True
 
     def finished(self, site, status):
-        self.logger.info("%s %s", site, status)
+        self.logger.info("%s %s", status, site)
         site.status = status
         self.update_site(site)
         if site.job_id:
@@ -211,7 +220,10 @@ class RethinkDbFrontier:
         for url in outlinks:
             if site.is_in_scope(url, parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
-                    new_child_page = brozzler.Page(url, site_id=site.id, job_id=site.job_id, hops_from_seed=parent_page.hops_from_seed+1, via_page_id=parent_page.id)
+                    new_child_page = brozzler.Page(
+                            url, site_id=site.id, job_id=site.job_id,
+                            hops_from_seed=parent_page.hops_from_seed+1,
+                            via_page_id=parent_page.id)
                     existing_child_page = self.page(new_child_page.id)
                     if existing_child_page:
                         existing_child_page.priority += new_child_page.priority
diff --git a/brozzler/job.py b/brozzler/job.py
index ff6858a..4f67232 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -79,10 +79,12 @@ def new_site(frontier, site):
 class Job(brozzler.BaseDictable):
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    def __init__(self, id=None, conf=None, status="ACTIVE", started=None, finished=None):
+    def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
+                 finished=None, stop_requested=None):
         self.id = id
         self.conf = conf
         self.status = status
         self.started = started
         self.finished = finished
+        self.stop_requested = stop_requested
 
diff --git a/brozzler/worker.py b/brozzler/worker.py
index a0f765d..2ef0f4a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -109,8 +109,10 @@ class BrozzlerWorker:
     def brozzle_page(self, browser, ydl, site, page):
         def on_screenshot(screenshot_png):
             if site.proxy and site.enable_warcprox_features:
-                self.logger.info("sending WARCPROX_WRITE_RECORD request to warcprox with screenshot for %s", page)
-                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(screenshot_png)
+                self.logger.info("sending WARCPROX_WRITE_RECORD request "
+                                 "to warcprox with screenshot for %s", page)
+                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
+                        screenshot_png)
                 self._warcprox_write_record(warcprox_address=site.proxy,
                         url="screenshot:{}".format(page.url),
                         warc_type="resource", content_type="image/jpeg",
@@ -128,20 +130,23 @@ class BrozzlerWorker:
         except brozzler.ReachedLimit as e:
             raise
         except:
-            self.logger.error("youtube_dl raised exception on {}".format(page), exc_info=True)
+            self.logger.error("youtube_dl raised exception on %s",
+                              page, exc_info=True)
 
         if not browser.is_running():
             browser.start(proxy=site.proxy)
-        outlinks = browser.browse_page(page.url,
-                extra_headers=site.extra_headers, on_screenshot=on_screenshot,
-                on_url_change=page.note_redirect)
+        outlinks = browser.browse_page(
+                page.url, extra_headers=site.extra_headers,
+                on_screenshot=on_screenshot, on_url_change=page.note_redirect)
         return outlinks
 
     def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         page = None
         try:
-            while not self._shutdown_requested.is_set() and time.time() - start < 7 * 60:
+            while (not self._shutdown_requested.is_set()
+                   and time.time() - start < 7 * 60):
+                self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, self._id)
                 outlinks = self.brozzle_page(browser, ydl, site, page)
                 self._frontier.completed_page(site, page)
@@ -151,12 +156,15 @@ class BrozzlerWorker:
             self.logger.info("no pages left for site %s", site)
         except brozzler.ReachedLimit as e:
             self._frontier.reached_limit(site, e)
+        except brozzler.CrawlJobStopped:
+            self._frontier.finished(site, "FINISHED_STOP_REQUESTED")
         except brozzler.browser.BrowsingAborted:
             self.logger.info("{} shut down".format(browser))
         except:
             self.logger.critical("unexpected exception", exc_info=True)
         finally:
-            self.logger.info("finished session brozzling site, stopping browser and disclaiming site")
+            self.logger.info("finished session brozzling site, stopping "
+                             "browser and disclaiming site")
             browser.stop()
             self._frontier.disclaim_site(site, page)
             self._browser_pool.release(browser)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 421090b..a8ab831 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -22,5 +22,5 @@ ADD vnc-websock.sh /etc/service/vnc-websock/run
 EXPOSE 5901 8901
 EXPOSE 8080
 
-RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
+# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git
 
diff --git a/no-docker/README.rst b/no-docker/README.rst
index dd8c174..b863649 100644
--- a/no-docker/README.rst
+++ b/no-docker/README.rst
@@ -2,3 +2,6 @@ Chromium seemed to be dying more often when running in a docker container.
 To start the services brozzler-worker depends on:
 /home/nlevitt/workspace/brozzler/no-docker/vncserver.sh &
 /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
+
+Prerequisites:
+apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev
diff --git a/setup.py b/setup.py
index 8ad280f..dd5cecc 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ import setuptools
 import glob
 
 setuptools.setup(name='brozzler',
-        version='1.0.1',
+        version='1.0.2',
         description='Distributed web crawling with browsers',
         url='https://github.com/nlevitt/brozzler',
         author='Noah Levitt',