diff --git a/bin/brozzle-page b/bin/brozzle-page index d51aaf5..e5e89bb 100755 --- a/bin/brozzle-page +++ b/bin/brozzle-page @@ -26,6 +26,8 @@ import brozzler import re import warnings import requests +import string +import datetime arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), description="brozzle-page - brozzle a single page", @@ -63,10 +65,21 @@ page = brozzler.Page(url=args.url, site_id=site.id) worker = brozzler.BrozzlerWorker(frontier=None) ydl = worker._youtube_dl(site) +def on_screenshot(screenshot_png): + OK_CHARS = (string.ascii_letters + string.digits) + filename = "/tmp/{}-{:%Y%m%d%H%M%S}.png".format( + "".join(ch if ch in OK_CHARS else "_" for ch in args.url), + datetime.datetime.now()) + # logging.info("len(screenshot_png)=%s", len(screenshot_png)) + with open(filename, 'wb') as f: + f.write(screenshot_png) + logging.info("wrote screenshot to %s", filename) + browser = brozzler.Browser(chrome_exe=args.chrome_exe) browser.start(proxy=site.proxy) try: - outlinks = worker.brozzle_page(browser, ydl, site, page) + outlinks = worker.brozzle_page( + browser, ydl, site, page, on_screenshot=on_screenshot) logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks))) except brozzler.ReachedLimit as e: logging.error("reached limit %s", e) diff --git a/brozzler/worker.py b/brozzler/worker.py index c3ee4b5..1ec3bb9 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -194,9 +194,11 @@ class BrozzlerWorker: return full_jpeg, thumb_jpeg - def brozzle_page(self, browser, ydl, site, page): - def on_screenshot(screenshot_png): - if site.proxy and site.enable_warcprox_features: + def brozzle_page(self, browser, ydl, site, page, on_screenshot=None): + def _on_screenshot(screenshot_png): + if on_screenshot: + on_screenshot(screenshot_png) + elif site.proxy and site.enable_warcprox_features: self.logger.info("sending WARCPROX_WRITE_RECORD request " "to warcprox with screenshot for %s", page) screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( @@ -228,7 +230,7 @@ class BrozzlerWorker: browser.start(proxy=site.proxy) outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers, - on_screenshot=on_screenshot, + on_screenshot=_on_screenshot, on_url_change=page.note_redirect) return outlinks else: