diff --git a/brozzler/browser.py b/brozzler/browser.py index 981b0e4..f9093cb 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -229,6 +229,11 @@ class WebsockReceiverThread(threading.Thread): self.on_request(message) elif message['method'] == 'Debugger.paused': self._debugger_paused(message) + elif message['method'] == 'Page.interstitialShown': + # for AITFIVE-1529: handle http auth + # for now, we should consider killing the browser when we receive Page.interstitialShown and + # consider the page finished—-first we should figure out when else that event might happen + self.logger.info('Page.interstitialShown received') elif message['method'] == 'Inspector.targetCrashed': self.logger.error( '''chrome tab went "aw snap" or "he's dead jim"!''') @@ -494,13 +499,16 @@ class Browser: def configure_browser(self, extra_headers=None, user_agent=None): headers = extra_headers or {} - headers['Accept-Encoding'] = 'identity' - self.send_to_chrome( + headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch + self.websock_thread.expect_result(self._command_id.peek()) + msg_id = self.send_to_chrome( method='Network.setExtraHTTPHeaders', params={'headers': headers}) - + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=10) if user_agent: - self.send_to_chrome( + msg_id = self.send_to_chrome( method='Network.setUserAgentOverride', params={'userAgent': user_agent}) diff --git a/brozzler/cli.py b/brozzler/cli.py index c2ca647..b4f9417 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -500,6 +500,8 @@ def brozzler_list_sites(argv=None): reql = reql.between( ['ACTIVE', r.minval], ['ACTIVE', r.maxval], index='sites_last_disclaimed') + elif args.site: + reql = reql.get_all(args.site) logging.debug('querying rethinkdb: %s', reql) results = reql.run() if args.yaml: diff --git a/brozzler/worker.py b/brozzler/worker.py index 5d29eee..aec4a12 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -226,7 +226,7 @@ class BrozzlerWorker: request.set_proxy(warcprox_address, "http") try: - with urllib.request.urlopen(request) as response: + with urllib.request.urlopen(request, timeout=600) as response: if response.getcode() != 204: self.logger.warn( 'got "%s %s" response on warcprox ' diff --git a/setup.py b/setup.py index 760572e..99d524c 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev272', + version='1.1b12.dev276', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',