make brozzle-page utility save the screenshot to a file

This commit is contained in:
Noah Levitt 2016-05-04 11:53:45 -07:00
parent 87af7eaa73
commit 5a2ea2cea4
2 changed files with 20 additions and 5 deletions

View File

@ -26,6 +26,8 @@ import brozzler
import re
import warnings
import requests
import string
import datetime
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzle-page - brozzle a single page",
@ -63,10 +65,21 @@ page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None)
ydl = worker._youtube_dl(site)
def on_screenshot(screenshot_png):
    """Save the screenshot data to a timestamped PNG file under /tmp.

    The file name is built from ``args.url`` with every character that is
    not an ASCII letter or digit replaced by an underscore, followed by the
    current time (as returned by ``datetime.datetime.now()``) formatted as
    YYYYmmddHHMMSS. The destination path is logged once the data is written.

    Args:
        screenshot_png: the screenshot content; written to the file as-is
            in binary mode.
    """
    allowed_chars = string.ascii_letters + string.digits
    # Anything outside [A-Za-z0-9] becomes "_" so the url is usable as a
    # single path component.
    sanitized_url = "".join(
        [ch if ch in allowed_chars else "_" for ch in args.url])
    timestamp = datetime.datetime.now()
    filename = "/tmp/{}-{:%Y%m%d%H%M%S}.png".format(sanitized_url, timestamp)

    with open(filename, 'wb') as out:
        out.write(screenshot_png)
    logging.info("wrote screenshot to %s", filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
browser.start(proxy=site.proxy)
try:
outlinks = worker.brozzle_page(browser, ydl, site, page)
outlinks = worker.brozzle_page(
browser, ydl, site, page, on_screenshot=on_screenshot)
logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error("reached limit %s", e)

View File

@ -194,9 +194,11 @@ class BrozzlerWorker:
return full_jpeg, thumb_jpeg
def brozzle_page(self, browser, ydl, site, page):
def on_screenshot(screenshot_png):
if site.proxy and site.enable_warcprox_features:
def brozzle_page(self, browser, ydl, site, page, on_screenshot=None):
def _on_screenshot(screenshot_png):
if on_screenshot:
on_screenshot(screenshot_png)
elif site.proxy and site.enable_warcprox_features:
self.logger.info("sending WARCPROX_WRITE_RECORD request "
"to warcprox with screenshot for %s", page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
@ -228,7 +230,7 @@ class BrozzlerWorker:
browser.start(proxy=site.proxy)
outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers,
on_screenshot=on_screenshot,
on_screenshot=_on_screenshot,
on_url_change=page.note_redirect)
return outlinks
else: