Raise PageInterstitialShown when Chrome reports Page.interstitialShown.

Review notes (text before the first "diff --git" line is ignored by
patch tools):
  * Line breaks and indentation were reconstructed from a
    whitespace-collapsed copy of this patch; verify context lines against
    the target tree before applying.
  * Added lines changed relative to that copy: the log message typo
    "interstialShown" is corrected, and the /401 test handler now uses
    headers.get() and writes bytes, as Python 3's http.server requires.
  * The "index" blob hashes are those of the original patch and do not
    reflect the changes above.

diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index bce1dee..181b797 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -30,6 +30,9 @@ class NothingToClaim(Exception):
 class CrawlStopped(Exception):
     pass
 
+class PageInterstitialShown(Exception):
+    pass
+
 class ProxyError(Exception):
     pass
 
diff --git a/brozzler/browser.py b/brozzler/browser.py
index a7d39d1..1ef9332 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -241,10 +241,12 @@ class WebsockReceiverThread(threading.Thread):
                 if self.on_request:
                     self.on_request(message)
             elif message['method'] == 'Page.interstitialShown':
-                # for AITFIVE-1529: handle http auth
-                # for now, we should consider killing the browser when we receive Page.interstitialShown and
-                # consider the page finished—-first we should figure out when else that event might happen
-                self.logger.info('Page.interstitialShown received')
+                # AITFIVE-1529: handle http auth
+                # we should kill the browser when we receive Page.interstitialShown and
+                # consider the page finished, until this is fixed:
+                # https://bugs.chromium.org/p/chromium/issues/detail?id=764505
+                self.logger.info('Page.interstitialShown (likely unsupported http auth request)')
+                brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown)
             elif message['method'] == 'Inspector.targetCrashed':
                 self.logger.error(
                         '''chrome tab went "aw snap" or "he's dead jim"!''')
diff --git a/brozzler/cli.py b/brozzler/cli.py
index 85f0fc4..188d591 100644
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -198,6 +198,8 @@ def brozzle_page(argv=None):
         logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
     except brozzler.ReachedLimit as e:
         logging.error('reached limit %s', e)
+    except brozzler.PageInterstitialShown as e:
+        logging.error('page interstitial shown %s', e)
     finally:
         browser.stop()
 
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 4aad2a3..b5f38ce 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -210,9 +210,12 @@ class BrozzlerWorker:
 
         if self._needs_browsing(page, ydl_fetches):
             self.logger.info('needs browsing: %s', page)
-            browser_outlinks = self._browse_page(
+            try:
+                browser_outlinks = self._browse_page(
                     browser, site, page, on_screenshot, on_request)
-            outlinks.update(browser_outlinks)
+                outlinks.update(browser_outlinks)
+            except brozzler.PageInterstitialShown:
+                self.logger.info('page interstitial shown (http auth): %s', page)
         else:
             if not self._already_fetched(page, ydl_fetches):
                 self.logger.info('needs fetch: %s', page)
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 686c8f4..0efe5a3 100644
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -62,6 +62,13 @@ def httpd(request):
                 self.send_header('Content-Length', len(payload))
                 self.end_headers()
                 self.wfile.write(payload)
+            elif self.path == '/401':
+                self.send_response(401)
+                self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
+                self.send_header('Content-type', 'text/html')
+                self.end_headers()
+                self.wfile.write((self.headers.get('Authorization') or '').encode('utf-8'))
+                self.wfile.write(b'not authenticated')
             else:
                 super().do_GET()
 
@@ -111,6 +118,13 @@ def test_aw_snap_hes_dead_jim():
         with pytest.raises(brozzler.BrowsingException):
             browser.browse_page('chrome://crash')
 
+def test_page_interstitial_exception(httpd):
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    url = 'http://localhost:%s/401' % httpd.server_port
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        with pytest.raises(brozzler.PageInterstitialShown):
+            browser.browse_page(url)
+
 def test_on_response(httpd):
     response_urls = []
     def on_response(msg):