diff --git a/brozzler/browser.py b/brozzler/browser.py index fa31b2e..000ed8c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -152,6 +152,7 @@ class WebsockReceiverThread(threading.Thread): self.on_request = None self.on_response = None + self.on_service_worker_version_updated = None self._result_messages = {} @@ -261,6 +262,9 @@ class WebsockReceiverThread(threading.Thread): and 'params' in message and 'errorText' in message['params'] and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'): brozzler.thread_raise(self.calling_thread, brozzler.ProxyError) + elif message['method'] == 'ServiceWorker.workerVersionUpdated': + if self.on_service_worker_version_updated: + self.on_service_worker_version_updated(message) # else: # self.logger.debug("%s %s", message["method"], json_message) elif 'result' in message: @@ -345,13 +349,14 @@ class Browser: self.send_to_chrome(method='Page.enable') self.send_to_chrome(method='Console.enable') self.send_to_chrome(method='Runtime.enable') + self.send_to_chrome(method='ServiceWorker.enable') + self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad') # disable google analytics self.send_to_chrome( method='Network.setBlockedURLs', params={'urls': ['*google-analytics.com/analytics.js', - '*google-analytics.com/ga.js']} - ) + '*google-analytics.com/ga.js']}) def stop(self): ''' @@ -395,7 +400,8 @@ class Browser: def browse_page( self, page_url, extra_headers=None, user_agent=None, behavior_parameters=None, behaviors_dir=None, - on_request=None, on_response=None, on_screenshot=None, + on_request=None, on_response=None, + on_service_worker_version_updated=None, on_screenshot=None, username=None, password=None, hashtags=None, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): @@ -422,10 +428,18 @@ class Browser: on_response: callback to invoke on every Network.responseReceived event, takes one argument, the json-decoded message (default None) + on_service_worker_version_updated: callback to invoke on every + ServiceWorker.workerVersionUpdated event, takes one argument, + the json-decoded message (default None) on_screenshot: callback to invoke when screenshot is obtained, takes one argument, the the raw jpeg bytes (default None) # XXX takes two arguments, the url of the page at the time the # screenshot was taken, and the raw jpeg bytes (default None) + username: username string to use to try logging in if a login form + is found in the page (default None) + password: password string to use to try logging in if a login form + is found in the page (default None) + ... (there are more) Returns: A tuple (final_page_url, outlinks). @@ -448,6 +462,9 @@ class Browser: self.websock_thread.on_request = on_request if on_response: self.websock_thread.on_response = on_response + if on_service_worker_version_updated: + self.websock_thread.on_service_worker_version_updated = \ + on_service_worker_version_updated try: with brozzler.thread_accept_exceptions(): self.configure_browser( diff --git a/brozzler/worker.py b/brozzler/worker.py index b5f38ce..05d260b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -219,7 +219,7 @@ class BrozzlerWorker: else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) - self._fetch_url(site, page) + self._fetch_url(site, page.url) else: self.logger.info('already fetched: %s', page) @@ -274,6 +274,16 @@ class BrozzlerWorker: page.videos = [] page.videos.append(video) + sw_fetched = set() + def _on_service_worker_version_updated(chrome_msg): + # https://github.com/internetarchive/brozzler/issues/140 + self.logger.trace('%r', chrome_msg) + url = chrome_msg.get('params', {}).get('versions', [{}])[0].get('scriptURL') + if url not in sw_fetched: + self.logger.info('fetching service worker script %s', url) + self._fetch_url(site, url) + sw_fetched.add(url) + if not browser.is_running(): browser.start( proxy=self._proxy_for(site), @@ -284,7 +294,9 @@ class BrozzlerWorker: username=site.get('username'), password=site.get('password'), user_agent=site.get('user_agent'), on_screenshot=_on_screenshot, on_response=_on_response, - on_request=on_request, hashtags=page.hashtags, + on_request=on_request, + on_service_worker_version_updated=_on_service_worker_version_updated, + hashtags=page.hashtags, skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, @@ -294,7 +306,7 @@ class BrozzlerWorker: page.note_redirect(final_page_url) return outlinks - def _fetch_url(self, site, page): + def _fetch_url(self, site, url): proxies = None if self._proxy_for(site): proxies = { @@ -302,11 +314,11 @@ class BrozzlerWorker: 'https': 'http://%s' % self._proxy_for(site), } - self.logger.info('fetching %s', page) + self.logger.info('fetching %s', url) try: # response is ignored requests.get( - page.url, proxies=proxies, headers=site.extra_headers(), + url, proxies=proxies, headers=site.extra_headers(), verify=False) except requests.exceptions.ProxyError as e: raise brozzler.ProxyError( diff --git a/setup.py b/setup.py index acebd76..d873804 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev313', + version='1.5.dev314', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',