mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-26 00:05:42 -04:00
brozzle-page --screenshot-full-page option
This commit is contained in:
parent
e5a3ada349
commit
65c7ccdcff
2 changed files with 17 additions and 10 deletions
|
@@ -153,6 +153,9 @@ def brozzle_page(argv=None):
|
||||||
help='use this password to try to log in if a login form is found')
|
help='use this password to try to log in if a login form is found')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--screenshot-full-page', dest='screenshot_full_page',
|
||||||
|
action='store_true')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||||
action='store_true')
|
action='store_true')
|
||||||
|
@@ -174,19 +177,20 @@ def brozzle_page(argv=None):
|
||||||
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
|
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
|
||||||
'username': args.username, 'password': args.password})
|
'username': args.username, 'password': args.password})
|
||||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
worker = brozzler.BrozzlerWorker(
|
||||||
|
frontier=None, proxy=args.proxy,
|
||||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
skip_youtube_dl=args.skip_youtube_dl)
|
skip_youtube_dl=args.skip_youtube_dl,
|
||||||
|
screenshot_full_page=args.screenshot_full_page)
|
||||||
|
|
||||||
def on_screenshot(screenshot_png):
|
def on_screenshot(screenshot_jpeg):
|
||||||
OK_CHARS = (string.ascii_letters + string.digits)
|
OK_CHARS = string.ascii_letters + string.digits
|
||||||
filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
|
filename = '/tmp/{}-{:%Y%m%d%H%M%S}.jpg'.format(
|
||||||
''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
|
''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
|
||||||
datetime.datetime.now())
|
datetime.datetime.now())
|
||||||
# logging.info('len(screenshot_png)=%s', len(screenshot_png))
|
|
||||||
with open(filename, 'wb') as f:
|
with open(filename, 'wb') as f:
|
||||||
f.write(screenshot_png)
|
f.write(screenshot_jpeg)
|
||||||
logging.info('wrote screenshot to %s', filename)
|
logging.info('wrote screenshot to %s', filename)
|
||||||
|
|
||||||
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
|
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
|
||||||
|
|
|
@@ -50,7 +50,8 @@ class BrozzlerWorker:
|
||||||
self, frontier, service_registry=None, max_browsers=1,
|
self, frontier, service_registry=None, max_browsers=1,
|
||||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
skip_youtube_dl=False, screenshot_full_page=False,
|
||||||
|
page_timeout=300, behavior_timeout=900):
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
|
@@ -62,6 +63,7 @@ class BrozzlerWorker:
|
||||||
self._skip_extract_outlinks = skip_extract_outlinks
|
self._skip_extract_outlinks = skip_extract_outlinks
|
||||||
self._skip_visit_hashtags = skip_visit_hashtags
|
self._skip_visit_hashtags = skip_visit_hashtags
|
||||||
self._skip_youtube_dl = skip_youtube_dl
|
self._skip_youtube_dl = skip_youtube_dl
|
||||||
|
self._screenshot_full_page = screenshot_full_page
|
||||||
self._page_timeout = page_timeout
|
self._page_timeout = page_timeout
|
||||||
self._behavior_timeout = behavior_timeout
|
self._behavior_timeout = behavior_timeout
|
||||||
|
|
||||||
|
@@ -295,6 +297,7 @@ class BrozzlerWorker:
|
||||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||||
skip_youtube_dl=self._skip_youtube_dl,
|
skip_youtube_dl=self._skip_youtube_dl,
|
||||||
|
screenshot_full_page=self._screenshot_full_page,
|
||||||
page_timeout=self._page_timeout,
|
page_timeout=self._page_timeout,
|
||||||
behavior_timeout=self._behavior_timeout)
|
behavior_timeout=self._behavior_timeout)
|
||||||
if final_page_url != page.url:
|
if final_page_url != page.url:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue