From b5cb94fc8b37f49c30d4f0b92cbe91ebdf1b8add Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 21 Jul 2015 06:33:02 +0000 Subject: [PATCH] some additional logging and error handling to avoid mysterious messages --- bin/brozzler-hq | 2 +- brozzler/browser.py | 8 +++++++- brozzler/hq.py | 2 +- brozzler/site.py | 8 ++++---- brozzler/worker.py | 11 +++++++---- 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/bin/brozzler-hq b/bin/brozzler-hq index 3c4e421..e91efc3 100755 --- a/bin/brozzler-hq +++ b/bin/brozzler-hq @@ -12,7 +12,7 @@ import signal arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), description="brozzler-hq - headquarters of distributed brozzler crawl", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq-0.db", +arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq.db", help="sqlite3 database filename; if the file does not exist, it will be created") arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f', help='URL identifying the amqp server to talk to') diff --git a/brozzler/browser.py b/brozzler/browser.py index 68318b2..4cf24b9 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -131,7 +131,7 @@ class Browser: self._outlinks = None self._websock = websocket.WebSocketApp(self._websocket_url, - on_open=self._visit_page, on_message=self._handle_message) + on_open=self._visit_page, on_message=self._wrap_handle_message) threadName = "WebsockThread{}-{}".format(self.chrome_port, ''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6)))) @@ -206,6 +206,12 @@ class Browser: # navigate to the page! self.send_to_chrome(method="Page.navigate", params={"url": self.url}) + def _wrap_handle_message(self, websock, message): + try: + self._handle_message(websock, message) + except: + self.logger.error("uncaught exception in _handle_message", exc_info=True) + def _handle_message(self, websock, message): # self.logger.debug("message from {} - {}".format(websock.url, message[:95])) # self.logger.debug("message from {} - {}".format(websock.url, message)) diff --git a/brozzler/hq.py b/brozzler/hq.py index 09eeb30..12fcfa8 100644 --- a/brozzler/hq.py +++ b/brozzler/hq.py @@ -11,7 +11,7 @@ import kombu.simple class BrozzlerHQDb: logger = logging.getLogger(__module__ + "." + __qualname__) - def __init__(self, db_file="./brozzler-hq-0.db"): + def __init__(self, db_file="./brozzler-hq.db"): self._conn = sqlite3.connect(db_file) self._create_tables() diff --git a/brozzler/site.py b/brozzler/site.py index c16cf03..babd89c 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -36,8 +36,8 @@ class Site: self._robots_cache = reppy.cache.RobotsCache(session=req_sesh) def __repr__(self): - return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format( - self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers) + return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format( + repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers) def note_seed_redirect(self, url): new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True) @@ -77,8 +77,8 @@ class Page: self.redirect_url = redirect_url def __repr__(self): - return """Page(url="{}",site_id={},hops_from_seed={})""".format( - self.url, self.site_id, self.hops_from_seed) + return """Page(url={},site_id={},hops_from_seed={})""".format( + repr(self.url), self.site_id, self.hops_from_seed) def note_redirect(self, url): self.redirect_url = url diff --git a/brozzler/worker.py b/brozzler/worker.py index 721929d..f780ec7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -26,9 +26,9 @@ class BrozzlerWorker: def _youtube_dl(self, site): ydl_opts = { "outtmpl": "/dev/null", - "verbose": False, + "verbose": True, "retries": 1, - "logger": self.logger, + "logger": logging.getLogger("youtube_dl"), "nocheckcertificate": True, "hls_prefer_native": True, "noprogress": True, @@ -102,7 +102,7 @@ class BrozzlerWorker: payload=info_json.encode("utf-8"), extra_headers=site.extra_headers) except BaseException as e: - if youtube_dl.utils.UnsupportedError in e.exc_info: + if e.exc_info and youtube_dl.utils.UnsupportedError in e.exc_info: pass else: raise @@ -116,7 +116,10 @@ class BrozzlerWorker: extra_headers=site.extra_headers) self.logger.info("brozzling {}".format(page)) - self._try_youtube_dl(ydl, site, page) + try: + self._try_youtube_dl(ydl, site, page) + except: + self.logger.error("youtube_dl raised unexpected exception on {}".format(page), exc_info=True) page.outlinks = browser.browse_page(page.url, extra_headers=site.extra_headers,