mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-20 20:54:45 -04:00
Merge branch 'pageInterstitialShown' into qa
This commit is contained in:
commit
24fcca4919
2 changed files with 5 additions and 24 deletions
|
@@ -30,6 +30,9 @@ class NothingToClaim(Exception):
|
||||||
class CrawlStopped(Exception):
|
class CrawlStopped(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class PageInterstitialShown(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
class ProxyError(Exception):
|
class ProxyError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@@ -58,28 +61,6 @@ class ReachedLimit(Exception):
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__repr__()
|
return self.__repr__()
|
||||||
|
|
||||||
class PageInterstitialShown(Exception):
|
|
||||||
def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
|
|
||||||
import json
|
|
||||||
if http_error:
|
|
||||||
if "warcprox-meta" in http_error.headers:
|
|
||||||
self.warcprox_meta = json.loads(
|
|
||||||
http_error.headers["warcprox-meta"])
|
|
||||||
else:
|
|
||||||
self.warcprox_meta = None
|
|
||||||
self.http_payload = http_error.read()
|
|
||||||
elif warcprox_meta:
|
|
||||||
self.warcprox_meta = warcprox_meta
|
|
||||||
self.http_payload = http_payload
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return "PageInterstitialShown(warcprox_meta=%r,http_payload=%r)" % (
|
|
||||||
self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
|
|
||||||
self.http_payload if hasattr(self, 'http_payload') else None)
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.__repr__()
|
|
||||||
|
|
||||||
# monkey-patch log levels TRACE and NOTICE
|
# monkey-patch log levels TRACE and NOTICE
|
||||||
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
|
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
|
||||||
def _logger_trace(self, msg, *args, **kwargs):
|
def _logger_trace(self, msg, *args, **kwargs):
|
||||||
|
|
|
@@ -212,10 +212,10 @@ class BrozzlerWorker:
|
||||||
try:
|
try:
|
||||||
outlinks = self._browse_page(browser, site, page, on_screenshot,
|
outlinks = self._browse_page(browser, site, page, on_screenshot,
|
||||||
on_request)
|
on_request)
|
||||||
return outlinks
|
|
||||||
except brozzler.PageInterstitialShown:
|
except brozzler.PageInterstitialShown:
|
||||||
|
outlinks = []
|
||||||
self.logger.info('page interstitial shown (http auth): %s', page)
|
self.logger.info('page interstitial shown (http auth): %s', page)
|
||||||
return []
|
return outlinks
|
||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl_fetches):
|
if not self._already_fetched(page, ydl_fetches):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue