save screenshots as metadata records using warcprox PUTMETA (same format as kenji's wide crawl)

This commit is contained in:
Noah Levitt 2015-07-15 16:32:02 -07:00
parent 5aea76ab6d
commit 923cd98652
2 changed files with 9 additions and 4 deletions

View file

@ -107,6 +107,11 @@ class BrozzlerWorker:
else:
raise
def _on_screenshot(self, site, crawl_url, screenshot_png):
    """Send a page screenshot to warcprox with a PUTMETA request.

    Does nothing unless both ``self._proxy_server`` and
    ``self._enable_warcprox_features`` are truthy. Otherwise it logs the
    request and passes the PNG payload to ``self._putmeta`` under the
    crawled URL with content type ``image/png``.

    ``site`` is accepted but not used by this method.
    """
    warcprox_enabled = self._proxy_server and self._enable_warcprox_features
    if not warcprox_enabled:
        return

    message = "sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url)
    logging.info(message)
    self._putmeta(
        url=crawl_url.url,
        content_type="image/png",
        payload=screenshot_png,
    )
def _brozzle_site(self, browser, site):
start = time.time()
crawl_url = None
@ -117,7 +122,8 @@ class BrozzlerWorker:
crawl_url = self._next_url(site)
logging.info("crawling {}".format(crawl_url))
self._try_youtube_dl(site, crawl_url)
crawl_url.outlinks = browser.browse_page(crawl_url.url)
crawl_url.outlinks = browser.browse_page(crawl_url.url,
on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
self._completed_url(site, crawl_url)
crawl_url = None
except kombu.simple.Empty: