Replace the browse-url command with brozzle-page, which brozzles a single page the same way the brozzler worker does (youtube_dl, warcprox features, etc.)

This commit is contained in:
Noah Levitt 2015-07-31 00:03:13 +00:00
parent 8366bd2d66
commit 11fbbc9d49
5 changed files with 58 additions and 46 deletions

View File

@ -1,40 +0,0 @@
#!/usr/bin/env python
# vim: set sw=4 et:
import argparse
import os
import sys
import logging
import brozzler
import re
import datetime
# browse-url: open every URL given on the command line in chrome/chromium
# through brozzler.Browser. The on_screenshot callback passed to browse_page
# writes the PNG bytes it receives to a file in the current directory.
script_name = os.path.basename(__file__)

arg_parser = argparse.ArgumentParser(
        prog=script_name,
        description='browse-url - open urls in chrome/chromium and run behaviors',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
        'urls', metavar='URL', nargs='+', help='URL(s) to browse')
# NOTE(review): browser_wait is parsed here but nothing below reads it, and
# its default is the string '60' rather than a number -- confirm intent.
arg_parser.add_argument(
        '-w', '--browser-wait', dest='browser_wait', default='60',
        help='seconds to wait for browser initialization')
arg_parser.add_argument(
        '-e', '--executable', dest='chrome_exe', default='chromium-browser',
        help='executable to use to invoke chrome')
# -v switches the log level from INFO to DEBUG.
arg_parser.add_argument(
        '-v', '--verbose', dest='log_level', action="store_const",
        default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument(
        '--version', action='version',
        version="brozzler {} - {}".format(brozzler.version, script_name))
args = arg_parser.parse_args(args=sys.argv[1:])

logging.basicConfig(
        stream=sys.stdout, level=args.log_level,
        format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

with brozzler.Browser(chrome_exe=args.chrome_exe) as browser:
    for url in args.urls:

        def on_screenshot(screenshot_png):
            # File name is the url with every non-word character replaced
            # by "_", followed by the current local timestamp.
            safe_url = re.sub(r"\W", "_", url)
            png_path = "{}-{:%Y%m%d%H%M%S}.png".format(
                    safe_url, datetime.datetime.now())
            with open(png_path, mode='wb') as out:
                out.write(screenshot_png)
                logging.info("wrote screenshot to {}".format(png_path))

        browser.browse_page(url, on_screenshot=on_screenshot)

46
bin/brozzle-page Executable file
View File

@ -0,0 +1,46 @@
#!/usr/bin/env python
# vim: set sw=4 et:
import argparse
import os
import sys
import logging
import brozzler
import kombu
import re
# brozzle-page: brozzle a single page from the command line. Builds a
# throwaway brozzler.Site (id=-1) and brozzler.Page from the arguments and
# hands them to BrozzlerWorker.brozzle_page together with a browser and a
# youtube_dl instance.
arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(__file__),
        description="brozzle-page - brozzle a single page",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument(
        '-e', '--executable', dest='chrome_exe', default='chromium-browser',
        help='executable to use to invoke chrome')
arg_parser.add_argument(
        "--proxy", dest="proxy", default=None,
        help="http proxy for this site")
arg_parser.add_argument(
        "-H", "--extra-header", action="append", dest="extra_headers",
        default=None,
        help="extra http header to send with every request for this site (may be used multiple times)")
arg_parser.add_argument(
        '--enable-warcprox-features', dest='enable_warcprox_features',
        action='store_true',
        help='enable special features for this site that assume the configured proxy is warcprox')
# -v switches the log level from INFO to DEBUG.
arg_parser.add_argument(
        "-v", "--verbose", dest="log_level", action="store_const",
        default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument(
        "--version", action="version",
        version="brozzler {} - {}".format(brozzler.version, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])

logging.basicConfig(
        stream=sys.stdout, level=args.log_level,
        format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")

# Turn every "Name: value" given with -H into an entry of extra_headers.
extra_headers = {}
for header_line in args.extra_headers or []:
    # maxsplit is passed by keyword: passing it positionally to re.split is
    # deprecated since Python 3.13. Only the first colon separates name from
    # value, so values may themselves contain colons.
    parts = re.split(r":\s*", header_line, maxsplit=1)
    if len(parts) != 2 or not parts[0]:
        # Exit with a usage message instead of an unpacking traceback when
        # the header has no colon (or no name before it).
        arg_parser.error(
                'invalid --extra-header {!r}, expected "Name: value"'.format(
                    header_line))
    header_name, header_value = parts
    extra_headers[header_name] = header_value

site = brozzler.Site(
        id=-1, seed=args.url, proxy=args.proxy,
        enable_warcprox_features=args.enable_warcprox_features,
        extra_headers=extra_headers)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker()
# NOTE(review): _youtube_dl is an underscore-prefixed (non-public) method of
# BrozzlerWorker; consider exposing it publicly, as was done for brozzle_page.
ydl = worker._youtube_dl(site)
with brozzler.Browser(chrome_exe=args.chrome_exe) as browser:
    worker.brozzle_page(browser, ydl, site, page)

View File

@ -142,7 +142,11 @@ class Browser:
try:
while True:
before_sleep = time.time()
time.sleep(0.5)
after_sleep = time.time()
if after_sleep - before_sleep > 1:
self.logger.warn("slept for %d seconds?!?!?! (should have been ~0.5)", (after_sleep - before_sleep))
if self._browse_interval_func():
return self._outlinks
finally:
@ -320,7 +324,7 @@ class Chrome:
while True:
try:
raw_json = urllib.request.urlopen(json_url).read()
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
all_debug_info = json.loads(raw_json.decode('utf-8'))
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
@ -329,7 +333,8 @@ class Chrome:
url = debug_info[0]['webSocketDebuggerUrl']
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
return url
except:
except BaseException as e:
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
pass
finally:
if time.time() - self._start > timeout_sec:

View File

@ -24,8 +24,8 @@ class Site:
self.scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
def __repr__(self):
return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
return """Site(id={},seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
self.id, repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
def note_seed_redirect(self, url):
new_scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(url)).getURLString(surt=True, trailing_comma=True)

View File

@ -104,10 +104,11 @@ class BrozzlerWorker:
except BaseException as e:
if hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info:
pass
# elif hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info:
else:
raise
def _brozzle_page(self, browser, ydl, site, page):
def brozzle_page(self, browser, ydl, site, page):
def on_screenshot(screenshot_png):
if site.proxy and site.enable_warcprox_features:
self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page))
@ -134,7 +135,7 @@ class BrozzlerWorker:
while not self._shutdown_requested.is_set() and time.time() - start < 60:
try:
page = self._next_page(site)
self._brozzle_page(browser, ydl, site, page)
self.brozzle_page(browser, ydl, site, page)
self._completed_page(site, page)
page = None
except kombu.simple.Empty: