mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-08 06:22:23 -04:00
Merge branch 'pageInterstitialShown' into qa
This commit is contained in:
commit
5901434c2b
4 changed files with 16 additions and 6 deletions
|
@ -229,6 +229,11 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
self.on_request(message)
|
self.on_request(message)
|
||||||
elif message['method'] == 'Debugger.paused':
|
elif message['method'] == 'Debugger.paused':
|
||||||
self._debugger_paused(message)
|
self._debugger_paused(message)
|
||||||
|
elif message['method'] == 'Page.interstitialShown':
|
||||||
|
# for AITFIVE-1529: handle http auth
|
||||||
|
# for now, we should consider killing the browser when we receive Page.interstitialShown and
|
||||||
|
# consider the page finished — first we should figure out when else that event might happen
|
||||||
|
self.logger.info('Page.interstitialShown received')
|
||||||
elif message['method'] == 'Inspector.targetCrashed':
|
elif message['method'] == 'Inspector.targetCrashed':
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
'''chrome tab went "aw snap" or "he's dead jim"!''')
|
'''chrome tab went "aw snap" or "he's dead jim"!''')
|
||||||
|
@ -494,13 +499,16 @@ class Browser:
|
||||||
|
|
||||||
def configure_browser(self, extra_headers=None, user_agent=None):
|
def configure_browser(self, extra_headers=None, user_agent=None):
|
||||||
headers = extra_headers or {}
|
headers = extra_headers or {}
|
||||||
headers['Accept-Encoding'] = 'identity'
|
headers['Accept-Encoding'] = 'gzip' # avoid encodings br, sdch
|
||||||
self.send_to_chrome(
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
|
msg_id = self.send_to_chrome(
|
||||||
method='Network.setExtraHTTPHeaders',
|
method='Network.setExtraHTTPHeaders',
|
||||||
params={'headers': headers})
|
params={'headers': headers})
|
||||||
|
self._wait_for(
|
||||||
|
lambda: self.websock_thread.received_result(msg_id),
|
||||||
|
timeout=10)
|
||||||
if user_agent:
|
if user_agent:
|
||||||
self.send_to_chrome(
|
msg_id = self.send_to_chrome(
|
||||||
method='Network.setUserAgentOverride',
|
method='Network.setUserAgentOverride',
|
||||||
params={'userAgent': user_agent})
|
params={'userAgent': user_agent})
|
||||||
|
|
||||||
|
|
|
@ -500,6 +500,8 @@ def brozzler_list_sites(argv=None):
|
||||||
reql = reql.between(
|
reql = reql.between(
|
||||||
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||||
index='sites_last_disclaimed')
|
index='sites_last_disclaimed')
|
||||||
|
elif args.site:
|
||||||
|
reql = reql.get_all(args.site)
|
||||||
logging.debug('querying rethinkdb: %s', reql)
|
logging.debug('querying rethinkdb: %s', reql)
|
||||||
results = reql.run()
|
results = reql.run()
|
||||||
if args.yaml:
|
if args.yaml:
|
||||||
|
|
|
@ -226,7 +226,7 @@ class BrozzlerWorker:
|
||||||
request.set_proxy(warcprox_address, "http")
|
request.set_proxy(warcprox_address, "http")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with urllib.request.urlopen(request) as response:
|
with urllib.request.urlopen(request, timeout=600) as response:
|
||||||
if response.getcode() != 204:
|
if response.getcode() != 204:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
'got "%s %s" response on warcprox '
|
'got "%s %s" response on warcprox '
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b12.dev272',
|
version='1.1b12.dev276',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue