mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-09 06:52:46 -04:00
Merge branch 'master' into qa
* master:
  bump dev version number after some PR merges
  bugfix for BrozzlerWorker._needs_browsing
  Remove redundant method parameter.
  bugfix
  Make youtube-dl optional in BrozzlerWorker.brozzle_page
This commit is contained in:
commit
5be7dd4407
3 changed files with 43 additions and 35 deletions
|
@@ -376,7 +376,7 @@ class Browser:
|
||||||
return self.websock_url is not None
|
return self.websock_url is not None
|
||||||
|
|
||||||
def browse_page(
|
def browse_page(
|
||||||
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
self, page_url, extra_headers=None,
|
||||||
user_agent=None, behavior_parameters=None,
|
user_agent=None, behavior_parameters=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
|
|
|
@@ -316,30 +316,34 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
return full_jpeg, thumb_jpeg
|
return full_jpeg, thumb_jpeg
|
||||||
|
|
||||||
def brozzle_page(self, browser, site, page, on_screenshot=None):
|
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
||||||
|
enable_youtube_dl=True):
|
||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
try:
|
if enable_youtube_dl:
|
||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
try:
|
||||||
ydl = self._youtube_dl(tempdir, site)
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
ydl_spy = ydl.brozzler_spy # remember for later
|
ydl = self._youtube_dl(tempdir, site)
|
||||||
self._try_youtube_dl(ydl, site, page)
|
ydl_spy = ydl.brozzler_spy # remember for later
|
||||||
except brozzler.ReachedLimit as e:
|
self._try_youtube_dl(ydl, site, page)
|
||||||
raise
|
except brozzler.ReachedLimit as e:
|
||||||
except brozzler.ShutdownRequested:
|
raise
|
||||||
raise
|
except brozzler.ShutdownRequested:
|
||||||
except brozzler.ProxyError:
|
raise
|
||||||
raise
|
except brozzler.ProxyError:
|
||||||
except Exception as e:
|
raise
|
||||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
except Exception as e:
|
||||||
and hasattr(e.exc_info[1], 'code')
|
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||||
and e.exc_info[1].code == 430):
|
and hasattr(e.exc_info[1], 'code')
|
||||||
self.logger.info(
|
and e.exc_info[1].code == 430):
|
||||||
'youtube-dl got %s %s processing %s',
|
self.logger.info(
|
||||||
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
'youtube-dl got %s %s processing %s',
|
||||||
else:
|
e.exc_info[1].code, e.exc_info[1].msg, page.url)
|
||||||
self.logger.error(
|
else:
|
||||||
'youtube_dl raised exception on %s', page,
|
self.logger.error(
|
||||||
exc_info=True)
|
'youtube_dl raised exception on %s', page,
|
||||||
|
exc_info=True)
|
||||||
|
else:
|
||||||
|
ydl_spy = False
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl_spy):
|
if self._needs_browsing(page, ydl_spy):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
|
@@ -435,19 +439,23 @@ class BrozzlerWorker:
|
||||||
'proxy error fetching %s' % page.url) from e
|
'proxy error fetching %s' % page.url) from e
|
||||||
|
|
||||||
def _needs_browsing(self, page, brozzler_spy):
|
def _needs_browsing(self, page, brozzler_spy):
|
||||||
final_bounces = brozzler_spy.final_bounces(page.url)
|
if brozzler_spy:
|
||||||
if not final_bounces:
|
final_bounces = brozzler_spy.final_bounces(page.url)
|
||||||
return True
|
if not final_bounces:
|
||||||
for txn in final_bounces:
|
|
||||||
if txn['response_headers'].get_content_type() in [
|
|
||||||
'text/html', 'application/xhtml+xml']:
|
|
||||||
return True
|
return True
|
||||||
return False
|
for txn in final_bounces:
|
||||||
|
if txn['response_headers'].get_content_type() in [
|
||||||
|
'text/html', 'application/xhtml+xml']:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
def _already_fetched(self, page, brozzler_spy):
|
def _already_fetched(self, page, brozzler_spy):
|
||||||
for txn in brozzler_spy.final_bounces(page.url):
|
if brozzler_spy:
|
||||||
if (txn['method'] == 'GET' and txn['status_code'] == 200):
|
for txn in brozzler_spy.final_bounces(page.url):
|
||||||
return True
|
if (txn['method'] == 'GET' and txn['status_code'] == 200):
|
||||||
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def brozzle_site(self, browser, site):
|
def brozzle_site(self, browser, site):
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b12.dev265',
|
version='1.1b12.dev266',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue