diff --git a/README.rst b/README.rst index 94f3262..7d43a64 100644 --- a/README.rst +++ b/README.rst @@ -70,6 +70,14 @@ must be specified, everything else is optional. scope: surt: http://(org,example, +Submit a Site to Crawl Without Configuring a Job +------------------------------------------------ + +:: + + brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \ + --time-limit=600 http://example.com/ + Brozzler Web Console -------------------- diff --git a/brozzler/__init__.py b/brozzler/__init__.py index b0890e0..22a4a6e 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -64,6 +64,9 @@ class BaseDictable: def __repr__(self): return "{}(**{})".format(self.__class__.__name__, self.to_dict()) +# logging level more fine-grained than logging.DEBUG==10 +TRACE = 5 + from brozzler.site import Page, Site from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots diff --git a/brozzler/browser.py b/brozzler/browser.py index 02b7f3c..bde1376 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -55,8 +55,6 @@ class BrowserPool: self._lock = threading.Lock() - self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available])) - def acquire(self): """ Returns browser from pool if available, raises NoBrowsersAvailable @@ -133,13 +131,18 @@ class Browser: # these can raise exceptions self.chrome_port = self._find_available_port() self._work_dir = tempfile.TemporaryDirectory() - self._chrome_instance = Chrome(port=self.chrome_port, - executable=self.chrome_exe, + self._chrome_instance = Chrome( + port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, - user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]), + user_data_dir=os.sep.join([ + self._work_dir.name, "chrome-user-data"]), ignore_cert_errors=self.ignore_cert_errors, proxy=proxy or self.proxy) - self._websocket_url = self._chrome_instance.start() + try: + self._websocket_url = self._chrome_instance.start() + except: + self._chrome_instance = None + raise def stop(self): try: @@ -201,6 +204,7 @@ class Browser: self.behavior_parameters = behavior_parameters self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None self._waiting_on_screenshot_msg_id = None self._waiting_on_document_url_msg_id = None self._waiting_on_outlinks_msg_id = None @@ -267,7 +271,19 @@ class Browser: self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome( method="Runtime.evaluate", params={"expression":"window.scrollTo(0, 0);"}) + self._waiting_on_scroll_to_top_start = time.time() return False + elif (self._waiting_on_scroll_to_top_msg_id + and time.time() - self._waiting_on_scroll_to_top_start > 30): + # chromium bug? occasionally we get no scroll-to-top result message + self.logger.warn( + "timed out after %.1fs waiting for scroll-to-top result " + "message, requesting screenshot now", + time.time() - self._waiting_on_scroll_to_top_start) + self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None + self._waiting_on_screenshot_msg_id = self.send_to_chrome( + method="Page.captureScreenshot") elif not self._has_screenshot and ( self._waiting_on_scroll_to_top_msg_id or self._waiting_on_screenshot_msg_id): @@ -406,8 +422,10 @@ compileOutlinks(window).join(' '); self._has_screenshot = True self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url)) elif message["id"] == self._waiting_on_scroll_to_top_msg_id: - self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot") self._waiting_on_scroll_to_top_msg_id = None + self._waiting_on_scroll_to_top_start = None + self._waiting_on_screenshot_msg_id = self.send_to_chrome( + method="Page.captureScreenshot") elif message["id"] == self._waiting_on_outlinks_msg_id: self.logger.debug("got outlinks message=%s", message) self._outlinks = frozenset( diff --git a/brozzler/cli.py b/brozzler/cli.py index 86d1a65..9d4cca4 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -41,6 +41,9 @@ def _add_common_options(arg_parser): arg_parser.add_argument( '-v', '--verbose', dest='log_level', action='store_const', default=logging.INFO, const=logging.DEBUG) + arg_parser.add_argument( + '--trace', dest='log_level', + action='store_const', default=logging.INFO, const=brozzler.TRACE) arg_parser.add_argument( '--version', action='version', version='brozzler %s - %s' % ( diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 7df5092..c6185b8 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -197,12 +197,12 @@ class RethinkDbFrontier: 1).update({ "claimed":True, "last_claimed_by":worker_id}, - return_changes=True).run() - self._vet_result(result, replaced=[0,1]) - if result["replaced"] == 1: - return brozzler.Page(**result["changes"][0]["new_val"]) - else: + return_changes="always").run() + self._vet_result(result, unchanged=[0,1], replaced=[0,1]) + if result["unchanged"] == 0 and result["replaced"] == 0: raise brozzler.NothingToClaim + else: + return brozzler.Page(**result["changes"][0]["new_val"]) def has_outstanding_pages(self, site): results_iter = self.r.table("pages").between( @@ -233,6 +233,8 @@ class RethinkDbFrontier: yield brozzler.Job(**result) def job(self, id): + if id is None: + return None result = self.r.table("jobs").get(id).run() if result: return brozzler.Job(**result) diff --git a/brozzler/worker.py b/brozzler/worker.py index dc1fc7b..c1415b7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -225,6 +225,7 @@ class BrozzlerWorker: raise except Exception as e: if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 + and hasattr(e.exc_info[1], 'code') and e.exc_info[1].code == 430): self.logger.info( 'youtube-dl got %s %s processing %s', @@ -288,9 +289,8 @@ class BrozzlerWorker: while (not self._shutdown_requested.is_set() and time.time() - start < 7 * 60): self._frontier.honor_stop_request(site.job_id) - page = self._frontier.claim_page(site, - "{}:{}".format( - socket.gethostname(), browser.chrome_port)) + page = self._frontier.claim_page(site, "%s:%s" % ( + socket.gethostname(), browser.chrome_port)) outlinks = self.brozzle_page(browser, site, page) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks(site, page, outlinks) @@ -326,8 +326,9 @@ class BrozzlerWorker: try: self.status_info = self._service_registry.heartbeat(status_info) - self.logger.debug( - "status in service registry: %s", self.status_info) + self.logger.log( + brozzler.TRACE, "status in service registry: %s", + self.status_info) except rethinkdb.ReqlError as e: self.logger.error( "failed to send heartbeat and update service registry " @@ -337,7 +338,11 @@ class BrozzlerWorker: try: latest_state = None while not self._shutdown_requested.is_set(): - if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL): + if self._service_registry and ( + not hasattr(self, "status_info") + or (rethinkstuff.utcnow() - + self.status_info["last_heartbeat"]).total_seconds() + > self.HEARTBEAT_INTERVAL): self._service_heartbeat() try: @@ -357,7 +362,8 @@ class BrozzlerWorker: raise except brozzler.browser.NoBrowsersAvailable: if latest_state != "browsers-busy": - self.logger.info("all %s browsers are busy", self._max_browsers) + self.logger.info( + "all %s browsers are busy", self._max_browsers) latest_state = "browsers-busy" except brozzler.NothingToClaim: if latest_state != "no-unclaimed-sites": diff --git a/setup.py b/setup.py index dc2ac7d..93d3cfb 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev31', + version='1.1.dev38', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',