handle 420 "Limit reached" when browser receives it

This commit is contained in:
Noah Levitt 2015-08-01 01:26:59 +00:00
parent f5acb6c34b
commit 511e19ff4d
3 changed files with 88 additions and 52 deletions

View File

@ -42,8 +42,11 @@ page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker() worker = brozzler.BrozzlerWorker()
ydl = worker._youtube_dl(site) ydl = worker._youtube_dl(site)
with brozzler.Browser(chrome_exe=args.chrome_exe) as browser: browser = brozzler.Browser(chrome_exe=args.chrome_exe)
try: browser.start(proxy=site.proxy)
worker.brozzle_page(browser, ydl, site, page) try:
except brozzler.ReachedLimit as e: worker.brozzle_page(browser, ydl, site, page)
logging.error("reached limit %s", e) except brozzler.ReachedLimit as e:
logging.error("reached limit %s", e)
finally:
browser.stop()

View File

@ -17,12 +17,16 @@ class ShutdownRequested(Exception):
pass pass
class ReachedLimit(Exception): class ReachedLimit(Exception):
def __init__(self, http_error): def __init__(self, http_error=None, warcprox_meta=None):
if "warcprox-meta" in http_error.headers: if http_error:
self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"]) if "warcprox-meta" in http_error.headers:
else: self.warcprox_meta = _json.loads(http_error.headers["warcprox-meta"])
self.warcprox_meta = None else:
self.http_payload = http_error.read() self.warcprox_meta = None
self.http_payload = http_error.read()
elif warcprox_meta:
self.warcprox_meta = warcprox_meta
self.http_payload = None
def __repr__(self): def __repr__(self):
return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload)) return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))

View File

@ -15,7 +15,9 @@ import os
import socket import socket
import base64 import base64
import random import random
import brozzler
from brozzler.behaviors import Behavior from brozzler.behaviors import Behavior
from requests.structures import CaseInsensitiveDict
class BrowserPool: class BrowserPool:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
@ -129,6 +131,7 @@ class Browser:
self._waiting_on_document_url_msg_id = None self._waiting_on_document_url_msg_id = None
self._waiting_on_outlinks_msg_id = None self._waiting_on_outlinks_msg_id = None
self._outlinks = None self._outlinks = None
self._reached_limit = None
self._websock = websocket.WebSocketApp(self._websocket_url, self._websock = websocket.WebSocketApp(self._websocket_url,
on_open=self._visit_page, on_message=self._wrap_handle_message) on_open=self._visit_page, on_message=self._wrap_handle_message)
@ -182,6 +185,8 @@ class Browser:
elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS: elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url)) self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
return True return True
elif self._reached_limit:
raise self._reached_limit
elif self._abort_browse_page: elif self._abort_browse_page:
raise BrowsingAborted("browsing page aborted") raise BrowsingAborted("browsing page aborted")
@ -221,56 +226,80 @@ class Browser:
except: except:
self.logger.error("uncaught exception in _handle_message", exc_info=True) self.logger.error("uncaught exception in _handle_message", exc_info=True)
def _network_request_will_be_sent(self, message):
if self._behavior:
self._behavior.notify_of_activity()
if message["params"]["request"]["url"].lower().startswith("data:"):
self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80]))
elif self.on_request:
self.on_request(message)
def _network_response_received(self, message):
if (not self._reached_limit
and message["params"]["response"]["status"] == 420
and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
warcprox_meta = json.loads(message["params"]["response"]["headers"]["Warcprox-Meta"])
self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
self.logger.info("reached limit %s", self._reached_limit)
def _page_load_event_fired(self, message):
self.logger.info("Page.loadEventFired, requesting screenshot url={} message={}".format(self.url, message))
self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})
def _console_message_added(self, message):
self.logger.debug("%s console.%s %s", self._websock.url,
message["params"]["message"]["level"],
message["params"]["message"]["text"])
def _debugger_paused(self, message):
# We hit the breakpoint set in visit_page. Get rid of google
# analytics script!
self.logger.debug("debugger paused! message={}".format(message))
scriptId = message['params']['callFrames'][0]['location']['scriptId']
# replace script
self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": scriptId, "scriptSource":"console.log('google analytics is no more!');"})
# resume execution
self.send_to_chrome(method="Debugger.resume")
def _handle_result_message(self, message):
if message["id"] == self._waiting_on_screenshot_msg_id:
if self.on_screenshot:
self.on_screenshot(base64.b64decode(message["result"]["data"]))
self._waiting_on_screenshot_msg_id = None
self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
self._behavior = Behavior(self.url, self)
self._behavior.start()
elif message["id"] == self._waiting_on_outlinks_msg_id:
self.logger.debug("got outlinks message=%s", message)
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
elif message["id"] == self._waiting_on_document_url_msg_id:
if message["result"]["result"]["value"] != self.url:
if self.on_url_change:
self.on_url_change(message["result"]["result"]["value"])
self._waiting_on_document_url_msg_id = None
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
self._behavior.notify_of_result(message)
def _handle_message(self, websock, message): def _handle_message(self, websock, message):
# self.logger.debug("message from {} - {}".format(websock.url, message[:95])) # self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
# self.logger.debug("message from {} - {}".format(websock.url, message)) # self.logger.debug("message from {} - {}".format(websock.url, message))
message = json.loads(message) message = json.loads(message)
if "method" in message and message["method"] == "Network.requestWillBeSent": if "method" in message and message["method"] == "Network.requestWillBeSent":
if self._behavior: self._network_request_will_be_sent(message)
self._behavior.notify_of_activity() elif "method" in message and message["method"] == "Network.responseReceived":
if message["params"]["request"]["url"].lower().startswith("data:"): self._network_response_received(message)
self.logger.debug("ignoring data url {}".format(message["params"]["request"]["url"][:80]))
elif self.on_request:
self.on_request(message)
elif "method" in message and message["method"] == "Page.loadEventFired": elif "method" in message and message["method"] == "Page.loadEventFired":
self.logger.info("Page.loadEventFired, requesting screenshot url={} message={}".format(self.url, message)) self._page_load_event_fired(message)
self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
self._waiting_on_document_url_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"document.URL"})
elif "method" in message and message["method"] == "Console.messageAdded": elif "method" in message and message["method"] == "Console.messageAdded":
self.logger.debug("{} console.{} {}".format(websock.url, self._console_message_added(message)
message["params"]["message"]["level"],
message["params"]["message"]["text"]))
elif "method" in message and message["method"] == "Debugger.paused": elif "method" in message and message["method"] == "Debugger.paused":
# We hit the breakpoint set in visit_page. Get rid of google self._debugger_paused(message)
# analytics script!
self.logger.debug("debugger paused! message={}".format(message))
scriptId = message['params']['callFrames'][0]['location']['scriptId']
# replace script
self.send_to_chrome(method="Debugger.setScriptSource", params={"scriptId": scriptId, "scriptSource":"console.log('google analytics is no more!');"})
# resume execution
self.send_to_chrome(method="Debugger.resume")
elif "result" in message: elif "result" in message:
if message["id"] == self._waiting_on_screenshot_msg_id: self._handle_result_message(message)
if self.on_screenshot:
self.on_screenshot(base64.b64decode(message["result"]["data"]))
self._waiting_on_screenshot_msg_id = None
self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
self._behavior = Behavior(self.url, self)
self._behavior.start()
elif message["id"] == self._waiting_on_outlinks_msg_id:
self.logger.debug("got outlinks message={}".format(message))
# {'result': {'wasThrown': False, 'result': {'value': 'https://archive-it.org/cgi-bin/dedup-test/change_every_second https://archive-it.org/cgi-bin/dedup-test/change_every_minute https://archive-it.org/cgi-bin/dedup-test/change_every_10minutes https://archive-it.org/cgi-bin/dedup-test/change_every_hour https://archive-it.org/cgi-bin/dedup-test/change_every_day https://archive-it.org/cgi-bin/dedup-test/change_every_month https://archive-it.org/cgi-bin/dedup-test/change_every_year https://archive-it.org/cgi-bin/dedup-test/change_never http://validator.w3.org/check?uri=referer', 'type': 'string'}}, 'id': 32}
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
elif message["id"] == self._waiting_on_document_url_msg_id:
if message["result"]["result"]["value"] != self.url:
if self.on_url_change:
self.on_url_change(message["result"]["result"]["value"])
self._waiting_on_document_url_msg_id = None
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
self._behavior.notify_of_result(message)
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"): # elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
# pass # pass
# elif "method" in message: # elif "method" in message: