mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
some additional logging and error handling to avoid mysterious messages
This commit is contained in:
parent
1e56bc8686
commit
b5cb94fc8b
@ -12,7 +12,7 @@ import signal
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||
description="brozzler-hq - headquarters of distributed brozzler crawl",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq-0.db",
|
||||
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq.db",
|
||||
help="sqlite3 database filename; if the file does not exist, it will be created")
|
||||
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
||||
help='URL identifying the amqp server to talk to')
|
||||
|
@ -131,7 +131,7 @@ class Browser:
|
||||
self._outlinks = None
|
||||
|
||||
self._websock = websocket.WebSocketApp(self._websocket_url,
|
||||
on_open=self._visit_page, on_message=self._handle_message)
|
||||
on_open=self._visit_page, on_message=self._wrap_handle_message)
|
||||
|
||||
threadName = "WebsockThread{}-{}".format(self.chrome_port,
|
||||
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
||||
@ -206,6 +206,12 @@ class Browser:
|
||||
# navigate to the page!
|
||||
self.send_to_chrome(method="Page.navigate", params={"url": self.url})
|
||||
|
||||
def _wrap_handle_message(self, websock, message):
|
||||
try:
|
||||
self._handle_message(websock, message)
|
||||
except:
|
||||
self.logger.error("uncaught exception in _handle_message", exc_info=True)
|
||||
|
||||
def _handle_message(self, websock, message):
|
||||
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
|
||||
# self.logger.debug("message from {} - {}".format(websock.url, message))
|
||||
|
@ -11,7 +11,7 @@ import kombu.simple
|
||||
class BrozzlerHQDb:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, db_file="./brozzler-hq-0.db"):
|
||||
def __init__(self, db_file="./brozzler-hq.db"):
|
||||
self._conn = sqlite3.connect(db_file)
|
||||
self._create_tables()
|
||||
|
||||
|
@ -36,8 +36,8 @@ class Site:
|
||||
self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
|
||||
|
||||
def __repr__(self):
|
||||
return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
||||
self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
||||
return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
||||
repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
||||
|
||||
def note_seed_redirect(self, url):
|
||||
new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
|
||||
@ -77,8 +77,8 @@ class Page:
|
||||
self.redirect_url = redirect_url
|
||||
|
||||
def __repr__(self):
|
||||
return """Page(url="{}",site_id={},hops_from_seed={})""".format(
|
||||
self.url, self.site_id, self.hops_from_seed)
|
||||
return """Page(url={},site_id={},hops_from_seed={})""".format(
|
||||
repr(self.url), self.site_id, self.hops_from_seed)
|
||||
|
||||
def note_redirect(self, url):
|
||||
self.redirect_url = url
|
||||
|
@ -26,9 +26,9 @@ class BrozzlerWorker:
|
||||
def _youtube_dl(self, site):
|
||||
ydl_opts = {
|
||||
"outtmpl": "/dev/null",
|
||||
"verbose": False,
|
||||
"verbose": True,
|
||||
"retries": 1,
|
||||
"logger": self.logger,
|
||||
"logger": logging.getLogger("youtube_dl"),
|
||||
"nocheckcertificate": True,
|
||||
"hls_prefer_native": True,
|
||||
"noprogress": True,
|
||||
@ -102,7 +102,7 @@ class BrozzlerWorker:
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers)
|
||||
except BaseException as e:
|
||||
if youtube_dl.utils.UnsupportedError in e.exc_info:
|
||||
if e.exc_info and youtube_dl.utils.UnsupportedError in e.exc_info:
|
||||
pass
|
||||
else:
|
||||
raise
|
||||
@ -116,7 +116,10 @@ class BrozzlerWorker:
|
||||
extra_headers=site.extra_headers)
|
||||
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
try:
|
||||
self._try_youtube_dl(ydl, site, page)
|
||||
except:
|
||||
self.logger.error("youtube_dl raised unexpected exception on {}".format(page), exc_info=True)
|
||||
|
||||
page.outlinks = browser.browse_page(page.url,
|
||||
extra_headers=site.extra_headers,
|
||||
|
Loading…
x
Reference in New Issue
Block a user