diff --git a/brozzler/cli.py b/brozzler/cli.py index 2db020c..1cb5912 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -153,6 +153,9 @@ def brozzle_page(argv=None): help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--screenshot-full-page', dest='screenshot_full_page', + action='store_true') arg_parser.add_argument( '--skip-extract-outlinks', dest='skip_extract_outlinks', action='store_true') @@ -174,19 +177,20 @@ def brozzle_page(argv=None): 'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters, 'username': args.username, 'password': args.password}) page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) - worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy, - skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags, - skip_youtube_dl=args.skip_youtube_dl) + worker = brozzler.BrozzlerWorker( + frontier=None, proxy=args.proxy, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl, + screenshot_full_page=args.screenshot_full_page) - def on_screenshot(screenshot_png): - OK_CHARS = (string.ascii_letters + string.digits) - filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format( + def on_screenshot(screenshot_jpeg): + OK_CHARS = string.ascii_letters + string.digits + filename = '/tmp/{}-{:%Y%m%d%H%M%S}.jpg'.format( ''.join(ch if ch in OK_CHARS else '_' for ch in args.url), datetime.datetime.now()) - # logging.info('len(screenshot_png)=%s', len(screenshot_png)) with open(filename, 'wb') as f: - f.write(screenshot_png) + f.write(screenshot_jpeg) logging.info('wrote screenshot to %s', filename) browser = brozzler.Browser(chrome_exe=args.chrome_exe) diff --git a/brozzler/worker.py b/brozzler/worker.py index 6022dcb..4ef3121 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -50,7 +50,8 @@ class BrozzlerWorker: self, frontier, service_registry=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, - skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): + skip_youtube_dl=False, screenshot_full_page=False, + page_timeout=300, behavior_timeout=900): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -62,6 +63,7 @@ class BrozzlerWorker: self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl + self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout self._behavior_timeout = behavior_timeout @@ -295,6 +297,7 @@ class BrozzlerWorker: skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, skip_youtube_dl=self._skip_youtube_dl, + screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout) if final_page_url != page.url: