some additional logging and error handling to avoid mysterious messages

This commit is contained in:
Noah Levitt 2015-07-21 06:33:02 +00:00
parent 1e56bc8686
commit b5cb94fc8b
5 changed files with 20 additions and 11 deletions

View File

@ -12,7 +12,7 @@ import signal
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzler-hq - headquarters of distributed brozzler crawl",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq-0.db",
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq.db",
help="sqlite3 database filename; if the file does not exist, it will be created")
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
help='URL identifying the amqp server to talk to')

View File

@ -131,7 +131,7 @@ class Browser:
self._outlinks = None
self._websock = websocket.WebSocketApp(self._websocket_url,
on_open=self._visit_page, on_message=self._handle_message)
on_open=self._visit_page, on_message=self._wrap_handle_message)
threadName = "WebsockThread{}-{}".format(self.chrome_port,
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
@ -206,6 +206,12 @@ class Browser:
# navigate to the page!
self.send_to_chrome(method="Page.navigate", params={"url": self.url})
def _wrap_handle_message(self, websock, message):
try:
self._handle_message(websock, message)
except:
self.logger.error("uncaught exception in _handle_message", exc_info=True)
def _handle_message(self, websock, message):
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
# self.logger.debug("message from {} - {}".format(websock.url, message))

View File

@ -11,7 +11,7 @@ import kombu.simple
class BrozzlerHQDb:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, db_file="./brozzler-hq-0.db"):
def __init__(self, db_file="./brozzler-hq.db"):
self._conn = sqlite3.connect(db_file)
self._create_tables()

View File

@ -36,8 +36,8 @@ class Site:
self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
def __repr__(self):
return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
def note_seed_redirect(self, url):
new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
@ -77,8 +77,8 @@ class Page:
self.redirect_url = redirect_url
def __repr__(self):
return """Page(url="{}",site_id={},hops_from_seed={})""".format(
self.url, self.site_id, self.hops_from_seed)
return """Page(url={},site_id={},hops_from_seed={})""".format(
repr(self.url), self.site_id, self.hops_from_seed)
def note_redirect(self, url):
self.redirect_url = url

View File

@ -26,9 +26,9 @@ class BrozzlerWorker:
def _youtube_dl(self, site):
ydl_opts = {
"outtmpl": "/dev/null",
"verbose": False,
"verbose": True,
"retries": 1,
"logger": self.logger,
"logger": logging.getLogger("youtube_dl"),
"nocheckcertificate": True,
"hls_prefer_native": True,
"noprogress": True,
@ -102,7 +102,7 @@ class BrozzlerWorker:
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers)
except BaseException as e:
if youtube_dl.utils.UnsupportedError in e.exc_info:
if e.exc_info and youtube_dl.utils.UnsupportedError in e.exc_info:
pass
else:
raise
@ -116,7 +116,10 @@ class BrozzlerWorker:
extra_headers=site.extra_headers)
self.logger.info("brozzling {}".format(page))
try:
self._try_youtube_dl(ydl, site, page)
except:
self.logger.error("youtube_dl raised unexpected exception on {}".format(page), exc_info=True)
page.outlinks = browser.browse_page(page.url,
extra_headers=site.extra_headers,