Merge branch 'master' into qa

* master:
  bump dev version number after some PR merges
  bugfix for BrozzlerWorker._needs_browsing
  Remove redundant method parameter.
  bugfix
  Make youtube-dl optional in BrozzlerWorker.brozzle_page
This commit is contained in:
Noah Levitt 2017-08-01 12:05:07 -07:00
commit 5be7dd4407
3 changed files with 43 additions and 35 deletions

View file

@@ -376,7 +376,7 @@ class Browser:
return self.websock_url is not None return self.websock_url is not None
def browse_page( def browse_page(
self, page_url, ignore_cert_errors=False, extra_headers=None, self, page_url, extra_headers=None,
user_agent=None, behavior_parameters=None, user_agent=None, behavior_parameters=None,
on_request=None, on_response=None, on_screenshot=None, on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None, username=None, password=None, hashtags=None,

View file

@@ -316,30 +316,34 @@ class BrozzlerWorker:
return full_jpeg, thumb_jpeg return full_jpeg, thumb_jpeg
def brozzle_page(self, browser, site, page, on_screenshot=None): def brozzle_page(self, browser, site, page, on_screenshot=None,
enable_youtube_dl=True):
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
try: if enable_youtube_dl:
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: try:
ydl = self._youtube_dl(tempdir, site) with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl_spy = ydl.brozzler_spy # remember for later ydl = self._youtube_dl(tempdir, site)
self._try_youtube_dl(ydl, site, page) ydl_spy = ydl.brozzler_spy # remember for later
except brozzler.ReachedLimit as e: self._try_youtube_dl(ydl, site, page)
raise except brozzler.ReachedLimit as e:
except brozzler.ShutdownRequested: raise
raise except brozzler.ShutdownRequested:
except brozzler.ProxyError: raise
raise except brozzler.ProxyError:
except Exception as e: raise
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 except Exception as e:
and hasattr(e.exc_info[1], 'code') if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and e.exc_info[1].code == 430): and hasattr(e.exc_info[1], 'code')
self.logger.info( and e.exc_info[1].code == 430):
'youtube-dl got %s %s processing %s', self.logger.info(
e.exc_info[1].code, e.exc_info[1].msg, page.url) 'youtube-dl got %s %s processing %s',
else: e.exc_info[1].code, e.exc_info[1].msg, page.url)
self.logger.error( else:
'youtube_dl raised exception on %s', page, self.logger.error(
exc_info=True) 'youtube_dl raised exception on %s', page,
exc_info=True)
else:
ydl_spy = False
if self._needs_browsing(page, ydl_spy): if self._needs_browsing(page, ydl_spy):
self.logger.info('needs browsing: %s', page) self.logger.info('needs browsing: %s', page)
@@ -435,19 +439,23 @@ class BrozzlerWorker:
'proxy error fetching %s' % page.url) from e 'proxy error fetching %s' % page.url) from e
def _needs_browsing(self, page, brozzler_spy): def _needs_browsing(self, page, brozzler_spy):
final_bounces = brozzler_spy.final_bounces(page.url) if brozzler_spy:
if not final_bounces: final_bounces = brozzler_spy.final_bounces(page.url)
return True if not final_bounces:
for txn in final_bounces:
if txn['response_headers'].get_content_type() in [
'text/html', 'application/xhtml+xml']:
return True return True
return False for txn in final_bounces:
if txn['response_headers'].get_content_type() in [
'text/html', 'application/xhtml+xml']:
return True
return False
else:
return True
def _already_fetched(self, page, brozzler_spy): def _already_fetched(self, page, brozzler_spy):
for txn in brozzler_spy.final_bounces(page.url): if brozzler_spy:
if (txn['method'] == 'GET' and txn['status_code'] == 200): for txn in brozzler_spy.final_bounces(page.url):
return True if (txn['method'] == 'GET' and txn['status_code'] == 200):
return True
return False return False
def brozzle_site(self, browser, site): def brozzle_site(self, browser, site):

View file

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b12.dev265', version='1.1b12.dev266',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',