brozzle-page --screenshot-full-page option

This commit is contained in:
Noah Levitt 2019-10-09 16:28:26 -07:00
parent e5a3ada349
commit 65c7ccdcff
2 changed files with 17 additions and 10 deletions

View File

@@ -153,6 +153,9 @@ def brozzle_page(argv=None):
help='use this password to try to log in if a login form is found')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None, help='http proxy')
arg_parser.add_argument(
'--screenshot-full-page', dest='screenshot_full_page',
action='store_true')
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true')
@@ -174,19 +177,20 @@ def brozzle_page(argv=None):
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
'username': args.username, 'password': args.password})
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
worker = brozzler.BrozzlerWorker(
frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl,
screenshot_full_page=args.screenshot_full_page)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
def on_screenshot(screenshot_jpeg):
OK_CHARS = string.ascii_letters + string.digits
filename = '/tmp/{}-{:%Y%m%d%H%M%S}.jpg'.format(
''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
datetime.datetime.now())
# logging.info('len(screenshot_png)=%s', len(screenshot_png))
with open(filename, 'wb') as f:
f.write(screenshot_png)
f.write(screenshot_jpeg)
logging.info('wrote screenshot to %s', filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)

View File

@@ -50,7 +50,8 @@ class BrozzlerWorker:
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
skip_youtube_dl=False, screenshot_full_page=False,
page_timeout=300, behavior_timeout=900):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@@ -62,6 +63,7 @@ class BrozzlerWorker:
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
self._screenshot_full_page = screenshot_full_page
self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout
@@ -295,6 +297,7 @@ class BrozzlerWorker:
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
screenshot_full_page=self._screenshot_full_page,
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout)
if final_page_url != page.url: