mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 04:44:12 -04:00
some additional logging and error handling to avoid mysterious messages
This commit is contained in:
parent
1e56bc8686
commit
b5cb94fc8b
5 changed files with 20 additions and 11 deletions
|
@ -12,7 +12,7 @@ import signal
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
description="brozzler-hq - headquarters of distributed brozzler crawl",
|
description="brozzler-hq - headquarters of distributed brozzler crawl",
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq-0.db",
|
arg_parser.add_argument("-d", "--db", dest="db_file", default="./brozzler-hq.db",
|
||||||
help="sqlite3 database filename; if the file does not exist, it will be created")
|
help="sqlite3 database filename; if the file does not exist, it will be created")
|
||||||
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
||||||
help='URL identifying the amqp server to talk to')
|
help='URL identifying the amqp server to talk to')
|
||||||
|
|
|
@ -131,7 +131,7 @@ class Browser:
|
||||||
self._outlinks = None
|
self._outlinks = None
|
||||||
|
|
||||||
self._websock = websocket.WebSocketApp(self._websocket_url,
|
self._websock = websocket.WebSocketApp(self._websocket_url,
|
||||||
on_open=self._visit_page, on_message=self._handle_message)
|
on_open=self._visit_page, on_message=self._wrap_handle_message)
|
||||||
|
|
||||||
threadName = "WebsockThread{}-{}".format(self.chrome_port,
|
threadName = "WebsockThread{}-{}".format(self.chrome_port,
|
||||||
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
||||||
|
@ -206,6 +206,12 @@ class Browser:
|
||||||
# navigate to the page!
|
# navigate to the page!
|
||||||
self.send_to_chrome(method="Page.navigate", params={"url": self.url})
|
self.send_to_chrome(method="Page.navigate", params={"url": self.url})
|
||||||
|
|
||||||
|
def _wrap_handle_message(self, websock, message):
|
||||||
|
try:
|
||||||
|
self._handle_message(websock, message)
|
||||||
|
except:
|
||||||
|
self.logger.error("uncaught exception in _handle_message", exc_info=True)
|
||||||
|
|
||||||
def _handle_message(self, websock, message):
|
def _handle_message(self, websock, message):
|
||||||
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
|
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
|
||||||
# self.logger.debug("message from {} - {}".format(websock.url, message))
|
# self.logger.debug("message from {} - {}".format(websock.url, message))
|
||||||
|
|
|
@ -11,7 +11,7 @@ import kombu.simple
|
||||||
class BrozzlerHQDb:
|
class BrozzlerHQDb:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, db_file="./brozzler-hq-0.db"):
|
def __init__(self, db_file="./brozzler-hq.db"):
|
||||||
self._conn = sqlite3.connect(db_file)
|
self._conn = sqlite3.connect(db_file)
|
||||||
self._create_tables()
|
self._create_tables()
|
||||||
|
|
||||||
|
|
|
@ -36,8 +36,8 @@ class Site:
|
||||||
self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
|
self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
||||||
self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
||||||
|
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
|
new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
|
||||||
|
@ -77,8 +77,8 @@ class Page:
|
||||||
self.redirect_url = redirect_url
|
self.redirect_url = redirect_url
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return """Page(url="{}",site_id={},hops_from_seed={})""".format(
|
return """Page(url={},site_id={},hops_from_seed={})""".format(
|
||||||
self.url, self.site_id, self.hops_from_seed)
|
repr(self.url), self.site_id, self.hops_from_seed)
|
||||||
|
|
||||||
def note_redirect(self, url):
|
def note_redirect(self, url):
|
||||||
self.redirect_url = url
|
self.redirect_url = url
|
||||||
|
|
|
@ -26,9 +26,9 @@ class BrozzlerWorker:
|
||||||
def _youtube_dl(self, site):
|
def _youtube_dl(self, site):
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"outtmpl": "/dev/null",
|
"outtmpl": "/dev/null",
|
||||||
"verbose": False,
|
"verbose": True,
|
||||||
"retries": 1,
|
"retries": 1,
|
||||||
"logger": self.logger,
|
"logger": logging.getLogger("youtube_dl"),
|
||||||
"nocheckcertificate": True,
|
"nocheckcertificate": True,
|
||||||
"hls_prefer_native": True,
|
"hls_prefer_native": True,
|
||||||
"noprogress": True,
|
"noprogress": True,
|
||||||
|
@ -102,7 +102,7 @@ class BrozzlerWorker:
|
||||||
payload=info_json.encode("utf-8"),
|
payload=info_json.encode("utf-8"),
|
||||||
extra_headers=site.extra_headers)
|
extra_headers=site.extra_headers)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
if youtube_dl.utils.UnsupportedError in e.exc_info:
|
if e.exc_info and youtube_dl.utils.UnsupportedError in e.exc_info:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
@ -116,7 +116,10 @@ class BrozzlerWorker:
|
||||||
extra_headers=site.extra_headers)
|
extra_headers=site.extra_headers)
|
||||||
|
|
||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
self._try_youtube_dl(ydl, site, page)
|
try:
|
||||||
|
self._try_youtube_dl(ydl, site, page)
|
||||||
|
except:
|
||||||
|
self.logger.error("youtube_dl raised unexpected exception on {}".format(page), exc_info=True)
|
||||||
|
|
||||||
page.outlinks = browser.browse_page(page.url,
|
page.outlinks = browser.browse_page(page.url,
|
||||||
extra_headers=site.extra_headers,
|
extra_headers=site.extra_headers,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue