From 11fbbc9d492fcc935229693e2fe5490c1bfbf717 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 31 Jul 2015 00:03:13 +0000 Subject: [PATCH] change browse-url command to brozzle-page, which does some more stuff as if it were in brozzler, like youtube_dl, warcprox features, etc --- bin/browse-url | 40 --------------------------------------- bin/brozzle-page | 46 +++++++++++++++++++++++++++++++++++++++++++++ brozzler/browser.py | 9 +++++++-- brozzler/site.py | 4 ++-- brozzler/worker.py | 5 +++-- 5 files changed, 58 insertions(+), 46 deletions(-) delete mode 100755 bin/browse-url create mode 100755 bin/brozzle-page diff --git a/bin/browse-url b/bin/browse-url deleted file mode 100755 index 47e106c..0000000 --- a/bin/browse-url +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# vim: set sw=4 et: - -import argparse -import os -import sys -import logging -import brozzler -import re -import datetime - -arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), - description='browse-url - open urls in chrome/chromium and run behaviors', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument('urls', metavar='URL', nargs='+', help='URL(s) to browse') -arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60', - help='seconds to wait for browser initialization') -arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', - help='executable to use to invoke chrome') -arg_parser.add_argument('-v', '--verbose', dest='log_level', - action="store_const", default=logging.INFO, const=logging.DEBUG) -arg_parser.add_argument('--version', action='version', - version="brozzler {} - {}".format(brozzler.version, os.path.basename(__file__))) -args = arg_parser.parse_args(args=sys.argv[1:]) - -logging.basicConfig(stream=sys.stdout, level=args.log_level, - format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') - - -with brozzler.Browser(chrome_exe=args.chrome_exe) as browser: - for url in args.urls: - - def on_screenshot(screenshot_png): - filename = "{}-{:%Y%m%d%H%M%S}.png".format(re.sub(r"\W", "_", url), datetime.datetime.now()) - with open(filename, mode='wb') as png_out: - png_out.write(screenshot_png) - logging.info("wrote screenshot to {}".format(filename)) - - browser.browse_page(url, on_screenshot=on_screenshot) - diff --git a/bin/brozzle-page b/bin/brozzle-page new file mode 100755 index 0000000..dd637d8 --- /dev/null +++ b/bin/brozzle-page @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# vim: set sw=4 et: + +import argparse +import os +import sys +import logging +import brozzler +import kombu +import re + +arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), + description="brozzle-page - brozzle a single page", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +arg_parser.add_argument('url', metavar='URL', help='page url') +arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', + help='executable to use to invoke chrome') +arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site") +arg_parser.add_argument("-H", "--extra-header", action="append", + dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)") +arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', + action='store_true', help='enable special features for this site that assume the configured proxy is warcprox') +arg_parser.add_argument("-v", "--verbose", dest="log_level", + action="store_const", default=logging.INFO, const=logging.DEBUG) +arg_parser.add_argument("--version", action="version", + version="brozzler {} - {}".format(brozzler.version, os.path.basename(__file__))) +args = arg_parser.parse_args(args=sys.argv[1:]) + +logging.basicConfig(stream=sys.stdout, level=args.log_level, + format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") + +extra_headers = {} +if args.extra_headers: + for hh in args.extra_headers: + [k,v] = re.split(r":\s*", hh, 1) + extra_headers[k] = v + +site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy, + enable_warcprox_features=args.enable_warcprox_features, + extra_headers=extra_headers) +page = brozzler.Page(url=args.url, site_id=site.id) +worker = brozzler.BrozzlerWorker() +ydl = worker._youtube_dl(site) + +with brozzler.Browser(chrome_exe=args.chrome_exe) as browser: + worker.brozzle_page(browser, ydl, site, page) diff --git a/brozzler/browser.py b/brozzler/browser.py index 0432eeb..79d7a95 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -142,7 +142,11 @@ class Browser: try: while True: + before_sleep = time.time() time.sleep(0.5) + after_sleep = time.time() + if after_sleep - before_sleep > 1: + self.logger.warn("slept for %d seconds?!?!?! (should have been ~0.5)", (after_sleep - before_sleep)) if self._browse_interval_func(): return self._outlinks finally: @@ -320,7 +324,7 @@ class Chrome: while True: try: - raw_json = urllib.request.urlopen(json_url).read() + raw_json = urllib.request.urlopen(json_url, timeout=30).read() all_debug_info = json.loads(raw_json.decode('utf-8')) debug_info = [x for x in all_debug_info if x['url'] == 'about:blank'] @@ -329,7 +333,8 @@ class Chrome: url = debug_info[0]['webSocketDebuggerUrl'] self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url)) return url - except: + except BaseException as e: + self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e) pass finally: if time.time() - self._start > timeout_sec: diff --git a/brozzler/site.py b/brozzler/site.py index 8a18537..016237e 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -24,8 +24,8 @@ class Site: self.scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True) def __repr__(self): - return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format( - repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers) + return """Site(id={},seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format( + self.id, repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers) def note_seed_redirect(self, url): new_scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(url)).getURLString(surt=True, trailing_comma=True) diff --git a/brozzler/worker.py b/brozzler/worker.py index 95b6cda..826dc63 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -104,10 +104,11 @@ class BrozzlerWorker: except BaseException as e: if hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info: pass + # elif hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info: else: raise - def _brozzle_page(self, browser, ydl, site, page): + def brozzle_page(self, browser, ydl, site, page): def on_screenshot(screenshot_png): if site.proxy and site.enable_warcprox_features: self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page)) @@ -134,7 +135,7 @@ class BrozzlerWorker: while not self._shutdown_requested.is_set() and time.time() - start < 60: try: page = self._next_page(site) - self._brozzle_page(browser, ydl, site, page) + self.brozzle_page(browser, ydl, site, page) self._completed_page(site, page) page = None except kombu.simple.Empty: