diff --git a/brozzler/browser.py b/brozzler/browser.py index 9442d92..5c06359 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -376,7 +376,7 @@ class Browser: return self.websock_url is not None def browse_page( - self, page_url, ignore_cert_errors=False, extra_headers=None, + self, page_url, extra_headers=None, user_agent=None, behavior_parameters=None, on_request=None, on_response=None, on_screenshot=None, username=None, password=None, hashtags=None, diff --git a/brozzler/worker.py b/brozzler/worker.py index 3169e47..56cf93b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -316,30 +316,34 @@ class BrozzlerWorker: return full_jpeg, thumb_jpeg - def brozzle_page(self, browser, site, page, on_screenshot=None): + def brozzle_page(self, browser, site, page, on_screenshot=None, + enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) - try: - with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: - ydl = self._youtube_dl(tempdir, site) - ydl_spy = ydl.brozzler_spy # remember for later - self._try_youtube_dl(ydl, site, page) - except brozzler.ReachedLimit as e: - raise - except brozzler.ShutdownRequested: - raise - except brozzler.ProxyError: - raise - except Exception as e: - if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 - and hasattr(e.exc_info[1], 'code') - and e.exc_info[1].code == 430): - self.logger.info( - 'youtube-dl got %s %s processing %s', - e.exc_info[1].code, e.exc_info[1].msg, page.url) - else: - self.logger.error( - 'youtube_dl raised exception on %s', page, - exc_info=True) + if enable_youtube_dl: + try: + with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: + ydl = self._youtube_dl(tempdir, site) + ydl_spy = ydl.brozzler_spy # remember for later + self._try_youtube_dl(ydl, site, page) + except brozzler.ReachedLimit as e: + raise + except brozzler.ShutdownRequested: + raise + except brozzler.ProxyError: + raise + except Exception as e: + if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2 + and hasattr(e.exc_info[1], 'code') + and e.exc_info[1].code == 430): + self.logger.info( + 'youtube-dl got %s %s processing %s', + e.exc_info[1].code, e.exc_info[1].msg, page.url) + else: + self.logger.error( + 'youtube_dl raised exception on %s', page, + exc_info=True) + else: + ydl_spy = False if self._needs_browsing(page, ydl_spy): self.logger.info('needs browsing: %s', page) @@ -435,19 +439,23 @@ class BrozzlerWorker: 'proxy error fetching %s' % page.url) from e def _needs_browsing(self, page, brozzler_spy): - final_bounces = brozzler_spy.final_bounces(page.url) - if not final_bounces: - return True - for txn in final_bounces: - if txn['response_headers'].get_content_type() in [ - 'text/html', 'application/xhtml+xml']: + if brozzler_spy: + final_bounces = brozzler_spy.final_bounces(page.url) + if not final_bounces: return True - return False + for txn in final_bounces: + if txn['response_headers'].get_content_type() in [ + 'text/html', 'application/xhtml+xml']: + return True + return False + else: + return True def _already_fetched(self, page, brozzler_spy): - for txn in brozzler_spy.final_bounces(page.url): - if (txn['method'] == 'GET' and txn['status_code'] == 200): - return True + if brozzler_spy: + for txn in brozzler_spy.final_bounces(page.url): + if (txn['method'] == 'GET' and txn['status_code'] == 200): + return True return False def brozzle_site(self, browser, site): diff --git a/setup.py b/setup.py index 743bccb..f9ab4e2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev265', + version='1.1b12.dev266', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',