diff --git a/brozzler/__init__.py b/brozzler/__init__.py index b3fa183..6321203 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -30,6 +30,9 @@ class NothingToClaim(Exception): class CrawlStopped(Exception): pass +class PageInterstitialShown(Exception): + pass + class ProxyError(Exception): pass @@ -58,28 +61,6 @@ class ReachedLimit(Exception): def __str__(self): return self.__repr__() -class PageInterstitialShown(Exception): - def __init__(self, http_error=None, warcprox_meta=None, http_payload=None): - import json - if http_error: - if "warcprox-meta" in http_error.headers: - self.warcprox_meta = json.loads( - http_error.headers["warcprox-meta"]) - else: - self.warcprox_meta = None - self.http_payload = http_error.read() - elif warcprox_meta: - self.warcprox_meta = warcprox_meta - self.http_payload = http_payload - - def __repr__(self): - return "PageInterstitialShown(warcprox_meta=%r,http_payload=%r)" % ( - self.warcprox_meta if hasattr(self, 'warcprox_meta') else None, - self.http_payload if hasattr(self, 'http_payload') else None) - - def __str__(self): - return self.__repr__() - # monkey-patch log levels TRACE and NOTICE logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2 def _logger_trace(self, msg, *args, **kwargs): diff --git a/brozzler/worker.py b/brozzler/worker.py index 7e7d974..3bffdfd 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -212,10 +212,10 @@ class BrozzlerWorker: try: outlinks = self._browse_page(browser, site, page, on_screenshot, on_request) - return outlinks except brozzler.PageInterstitialShown: + outlinks = [] self.logger.info('page interstitial shown (http auth): %s', page) - return [] + return outlinks else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page)