mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-08 06:22:23 -04:00
Merge branch 'service-worker' into qa
This commit is contained in:
commit
b204e9aec1
3 changed files with 38 additions and 9 deletions
|
@ -152,6 +152,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
|
|
||||||
self.on_request = None
|
self.on_request = None
|
||||||
self.on_response = None
|
self.on_response = None
|
||||||
|
self.on_service_worker_version_updated = None
|
||||||
|
|
||||||
self._result_messages = {}
|
self._result_messages = {}
|
||||||
|
|
||||||
|
@ -261,6 +262,9 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
and 'params' in message and 'errorText' in message['params']
|
and 'params' in message and 'errorText' in message['params']
|
||||||
and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
|
and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
|
||||||
brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
|
brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
|
||||||
|
elif message['method'] == 'ServiceWorker.workerVersionUpdated':
|
||||||
|
if self.on_service_worker_version_updated:
|
||||||
|
self.on_service_worker_version_updated(message)
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("%s %s", message["method"], json_message)
|
# self.logger.debug("%s %s", message["method"], json_message)
|
||||||
elif 'result' in message:
|
elif 'result' in message:
|
||||||
|
@ -345,13 +349,14 @@ class Browser:
|
||||||
self.send_to_chrome(method='Page.enable')
|
self.send_to_chrome(method='Page.enable')
|
||||||
self.send_to_chrome(method='Console.enable')
|
self.send_to_chrome(method='Console.enable')
|
||||||
self.send_to_chrome(method='Runtime.enable')
|
self.send_to_chrome(method='Runtime.enable')
|
||||||
|
self.send_to_chrome(method='ServiceWorker.enable')
|
||||||
|
self.send_to_chrome(method='ServiceWorker.setForceUpdateOnPageLoad')
|
||||||
|
|
||||||
# disable google analytics
|
# disable google analytics
|
||||||
self.send_to_chrome(
|
self.send_to_chrome(
|
||||||
method='Network.setBlockedURLs',
|
method='Network.setBlockedURLs',
|
||||||
params={'urls': ['*google-analytics.com/analytics.js',
|
params={'urls': ['*google-analytics.com/analytics.js',
|
||||||
'*google-analytics.com/ga.js']}
|
'*google-analytics.com/ga.js']})
|
||||||
)
|
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
'''
|
'''
|
||||||
|
@ -395,7 +400,8 @@ class Browser:
|
||||||
def browse_page(
|
def browse_page(
|
||||||
self, page_url, extra_headers=None,
|
self, page_url, extra_headers=None,
|
||||||
user_agent=None, behavior_parameters=None, behaviors_dir=None,
|
user_agent=None, behavior_parameters=None, behaviors_dir=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None,
|
||||||
|
on_service_worker_version_updated=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
||||||
|
@ -422,10 +428,18 @@ class Browser:
|
||||||
on_response: callback to invoke on every Network.responseReceived
|
on_response: callback to invoke on every Network.responseReceived
|
||||||
event, takes one argument, the json-decoded message (default
|
event, takes one argument, the json-decoded message (default
|
||||||
None)
|
None)
|
||||||
|
on_service_worker_version_updated: callback to invoke on every
|
||||||
|
ServiceWorker.workerVersionUpdated event, takes one argument,
|
||||||
|
the json-decoded message (default None)
|
||||||
on_screenshot: callback to invoke when screenshot is obtained,
|
on_screenshot: callback to invoke when screenshot is obtained,
|
||||||
takes one argument, the raw jpeg bytes (default None)
|
takes one argument, the raw jpeg bytes (default None)
|
||||||
# XXX takes two arguments, the url of the page at the time the
|
# XXX takes two arguments, the url of the page at the time the
|
||||||
# screenshot was taken, and the raw jpeg bytes (default None)
|
# screenshot was taken, and the raw jpeg bytes (default None)
|
||||||
|
username: username string to use to try logging in if a login form
|
||||||
|
is found in the page (default None)
|
||||||
|
password: password string to use to try logging in if a login form
|
||||||
|
is found in the page (default None)
|
||||||
|
... (there are more)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A tuple (final_page_url, outlinks).
|
A tuple (final_page_url, outlinks).
|
||||||
|
@ -448,6 +462,9 @@ class Browser:
|
||||||
self.websock_thread.on_request = on_request
|
self.websock_thread.on_request = on_request
|
||||||
if on_response:
|
if on_response:
|
||||||
self.websock_thread.on_response = on_response
|
self.websock_thread.on_response = on_response
|
||||||
|
if on_service_worker_version_updated:
|
||||||
|
self.websock_thread.on_service_worker_version_updated = \
|
||||||
|
on_service_worker_version_updated
|
||||||
try:
|
try:
|
||||||
with brozzler.thread_accept_exceptions():
|
with brozzler.thread_accept_exceptions():
|
||||||
self.configure_browser(
|
self.configure_browser(
|
||||||
|
|
|
@ -219,7 +219,7 @@ class BrozzlerWorker:
|
||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl_fetches):
|
if not self._already_fetched(page, ydl_fetches):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
self._fetch_url(site, page)
|
self._fetch_url(site, page.url)
|
||||||
else:
|
else:
|
||||||
self.logger.info('already fetched: %s', page)
|
self.logger.info('already fetched: %s', page)
|
||||||
|
|
||||||
|
@ -274,6 +274,16 @@ class BrozzlerWorker:
|
||||||
page.videos = []
|
page.videos = []
|
||||||
page.videos.append(video)
|
page.videos.append(video)
|
||||||
|
|
||||||
|
sw_fetched = set()
|
||||||
|
def _on_service_worker_version_updated(chrome_msg):
    '''
    Callback for ServiceWorker.workerVersionUpdated events. Fetches the
    service worker script named in the message, at most once per url.

    See https://github.com/internetarchive/brozzler/issues/140

    Only the first entry of params.versions is examined. Messages with
    no versions, or whose first version has no scriptURL, are ignored.

    Args:
        chrome_msg: the json-decoded message
    '''
    self.logger.trace('%r', chrome_msg)

    # A `[{}]` default would only cover a missing 'versions' key; a
    # present-but-empty list would still raise IndexError on [0], so
    # check truthiness explicitly.
    versions = chrome_msg.get('params', {}).get('versions')
    if not versions:
        return

    url = versions[0].get('scriptURL')
    # Without this guard a missing scriptURL would hand url=None to
    # _fetch_url and record None in sw_fetched.
    if not url or url in sw_fetched:
        return

    self.logger.info('fetching service worker script %s', url)
    self._fetch_url(site, url)
    sw_fetched.add(url)
|
||||||
|
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
browser.start(
|
browser.start(
|
||||||
proxy=self._proxy_for(site),
|
proxy=self._proxy_for(site),
|
||||||
|
@ -284,7 +294,9 @@ class BrozzlerWorker:
|
||||||
username=site.get('username'), password=site.get('password'),
|
username=site.get('username'), password=site.get('password'),
|
||||||
user_agent=site.get('user_agent'),
|
user_agent=site.get('user_agent'),
|
||||||
on_screenshot=_on_screenshot, on_response=_on_response,
|
on_screenshot=_on_screenshot, on_response=_on_response,
|
||||||
on_request=on_request, hashtags=page.hashtags,
|
on_request=on_request,
|
||||||
|
on_service_worker_version_updated=_on_service_worker_version_updated,
|
||||||
|
hashtags=page.hashtags,
|
||||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||||
skip_youtube_dl=self._skip_youtube_dl,
|
skip_youtube_dl=self._skip_youtube_dl,
|
||||||
|
@ -294,7 +306,7 @@ class BrozzlerWorker:
|
||||||
page.note_redirect(final_page_url)
|
page.note_redirect(final_page_url)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _fetch_url(self, site, page):
|
def _fetch_url(self, site, url):
|
||||||
proxies = None
|
proxies = None
|
||||||
if self._proxy_for(site):
|
if self._proxy_for(site):
|
||||||
proxies = {
|
proxies = {
|
||||||
|
@ -302,11 +314,11 @@ class BrozzlerWorker:
|
||||||
'https': 'http://%s' % self._proxy_for(site),
|
'https': 'http://%s' % self._proxy_for(site),
|
||||||
}
|
}
|
||||||
|
|
||||||
self.logger.info('fetching %s', page)
|
self.logger.info('fetching %s', url)
|
||||||
try:
|
try:
|
||||||
# response is ignored
|
# response is ignored
|
||||||
requests.get(
|
requests.get(
|
||||||
page.url, proxies=proxies, headers=site.extra_headers(),
|
url, proxies=proxies, headers=site.extra_headers(),
|
||||||
verify=False)
|
verify=False)
|
||||||
except requests.exceptions.ProxyError as e:
|
except requests.exceptions.ProxyError as e:
|
||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.5.dev313',
|
version='1.5.dev314',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue