mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Rename the browse-url command to brozzle-page and have it process the page the way a brozzler worker would, including youtube_dl and the warcprox features.
This commit is contained in:
parent
8366bd2d66
commit
11fbbc9d49
@ -1,40 +0,0 @@
|
|||||||
#!/usr/bin/env python
# vim: set sw=4 et:
#
# browse-url: open each URL given on the command line in chrome/chromium
# through brozzler.Browser, and write every screenshot handed to the
# on_screenshot callback to a PNG file in the current directory.

import argparse
import os
import sys
import logging
import brozzler
import re
import datetime

script_name = os.path.basename(__file__)

# Command line definition. Argument order is kept as-is because it determines
# the layout of the generated --help text.
arg_parser = argparse.ArgumentParser(
        prog=script_name,
        description='browse-url - open urls in chrome/chromium and run behaviors',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument(
        'urls', metavar='URL', nargs='+',
        help='URL(s) to browse')
# NOTE(review): browser_wait is parsed but nothing in this script reads it.
arg_parser.add_argument(
        '-w', '--browser-wait', dest='browser_wait', default='60',
        help='seconds to wait for browser initialization')
arg_parser.add_argument(
        '-e', '--executable', dest='chrome_exe', default='chromium-browser',
        help='executable to use to invoke chrome')
arg_parser.add_argument(
        '-v', '--verbose', dest='log_level', action="store_const",
        default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument(
        '--version', action='version',
        version="brozzler {} - {}".format(brozzler.version, script_name))
args = arg_parser.parse_args(args=sys.argv[1:])

# Log to stdout at INFO by default, DEBUG when -v is given.
logging.basicConfig(
        stream=sys.stdout, level=args.log_level,
        format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

with brozzler.Browser(chrome_exe=args.chrome_exe) as browser:
    for url in args.urls:

        def on_screenshot(screenshot_png):
            # File name is the url with every non-word character replaced by
            # "_", followed by the current local time to the second.
            safe_url = re.sub(r"\W", "_", url)
            taken_at = datetime.datetime.now()
            filename = "{}-{:%Y%m%d%H%M%S}.png".format(safe_url, taken_at)
            with open(filename, mode='wb') as png_out:
                png_out.write(screenshot_png)
            logging.info("wrote screenshot to {}".format(filename))

        browser.browse_page(url, on_screenshot=on_screenshot)
|
|
||||||
|
|
46
bin/brozzle-page
Executable file
46
bin/brozzle-page
Executable file
@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env python
# vim: set sw=4 et:
#
# brozzle-page: brozzle a single page. Builds a throwaway brozzler.Site and
# brozzler.Page for the url given on the command line, starts a browser, and
# hands everything to BrozzlerWorker.brozzle_page().

import argparse
import os
import sys
import logging
import brozzler
import kombu
import re


def parse_extra_headers(header_args):
    """Turn a list of "Name: value" strings into a {name: value} dict.

    header_args may be None or empty (no -H option given), in which case an
    empty dict is returned. Each entry is split on the first colon; whitespace
    directly after that colon is dropped. If the same name appears more than
    once, the last value wins.

    Raises ValueError for an entry that has no colon or an empty name, instead
    of failing later with an opaque unpacking error.
    """
    headers = {}
    for raw in header_args or []:
        # maxsplit is passed by keyword: the positional form is deprecated
        # as of Python 3.13.
        parts = re.split(r":\s*", raw, maxsplit=1)
        if len(parts) != 2 or not parts[0]:
            raise ValueError(
                    "invalid extra header {!r} (expected 'Name: value')".format(raw))
        name, value = parts
        headers[name] = value
    return headers


arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
        description="brozzle-page - brozzle a single page",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
        help='executable to use to invoke chrome')
arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
arg_parser.add_argument("-H", "--extra-header", action="append",
        dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)")
arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
        action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
arg_parser.add_argument("-v", "--verbose", dest="log_level",
        action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version",
        version="brozzler {} - {}".format(brozzler.version, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])

# Log to stdout at INFO by default, DEBUG when -v is given.
logging.basicConfig(stream=sys.stdout, level=args.log_level,
        format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")

try:
    extra_headers = parse_extra_headers(args.extra_headers)
except ValueError as e:
    # Report a malformed -H value like any other bad argument: print usage
    # and the message, then exit with status 2.
    arg_parser.error(str(e))

# id=-1 is a placeholder id for this one-off site.
site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy,
        enable_warcprox_features=args.enable_warcprox_features,
        extra_headers=extra_headers)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker()
ydl = worker._youtube_dl(site)

with brozzler.Browser(chrome_exe=args.chrome_exe) as browser:
    worker.brozzle_page(browser, ydl, site, page)
|
@ -142,7 +142,11 @@ class Browser:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
|
before_sleep = time.time()
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
after_sleep = time.time()
|
||||||
|
if after_sleep - before_sleep > 1:
|
||||||
|
self.logger.warn("slept for %d seconds?!?!?! (should have been ~0.5)", (after_sleep - before_sleep))
|
||||||
if self._browse_interval_func():
|
if self._browse_interval_func():
|
||||||
return self._outlinks
|
return self._outlinks
|
||||||
finally:
|
finally:
|
||||||
@ -320,7 +324,7 @@ class Chrome:
|
|||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
raw_json = urllib.request.urlopen(json_url).read()
|
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
|
||||||
all_debug_info = json.loads(raw_json.decode('utf-8'))
|
all_debug_info = json.loads(raw_json.decode('utf-8'))
|
||||||
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
|
debug_info = [x for x in all_debug_info if x['url'] == 'about:blank']
|
||||||
|
|
||||||
@ -329,7 +333,8 @@ class Chrome:
|
|||||||
url = debug_info[0]['webSocketDebuggerUrl']
|
url = debug_info[0]['webSocketDebuggerUrl']
|
||||||
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
|
self.logger.info('got chrome window websocket debug url {} from {}'.format(url, json_url))
|
||||||
return url
|
return url
|
||||||
except:
|
except BaseException as e:
|
||||||
|
self.logger.warn("problem with %s (will keep trying until timeout of %d seconds): %s", json_url, timeout_sec, e)
|
||||||
pass
|
pass
|
||||||
finally:
|
finally:
|
||||||
if time.time() - self._start > timeout_sec:
|
if time.time() - self._start > timeout_sec:
|
||||||
|
@ -24,8 +24,8 @@ class Site:
|
|||||||
self.scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
|
self.scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(seed)).getURLString(surt=True, trailing_comma=True)
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
return """Site(id={},seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
|
||||||
repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
self.id, repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
|
||||||
|
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
new_scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(url)).getURLString(surt=True, trailing_comma=True)
|
new_scope_surt = surt.GoogleURLCanonicalizer.canonicalize(surt.handyurl.parse(url)).getURLString(surt=True, trailing_comma=True)
|
||||||
|
@ -104,10 +104,11 @@ class BrozzlerWorker:
|
|||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
if hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info:
|
if hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info:
|
||||||
pass
|
pass
|
||||||
|
# elif hasattr(e, "exc_info") and youtube_dl.utils.UnsupportedError in e.exc_info:
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
def _brozzle_page(self, browser, ydl, site, page):
|
def brozzle_page(self, browser, ydl, site, page):
|
||||||
def on_screenshot(screenshot_png):
|
def on_screenshot(screenshot_png):
|
||||||
if site.proxy and site.enable_warcprox_features:
|
if site.proxy and site.enable_warcprox_features:
|
||||||
self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page))
|
self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page))
|
||||||
@ -134,7 +135,7 @@ class BrozzlerWorker:
|
|||||||
while not self._shutdown_requested.is_set() and time.time() - start < 60:
|
while not self._shutdown_requested.is_set() and time.time() - start < 60:
|
||||||
try:
|
try:
|
||||||
page = self._next_page(site)
|
page = self._next_page(site)
|
||||||
self._brozzle_page(browser, ydl, site, page)
|
self.brozzle_page(browser, ydl, site, page)
|
||||||
self._completed_page(site, page)
|
self._completed_page(site, page)
|
||||||
page = None
|
page = None
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user