save screenshots as metadata records using warcprox PUTMETA (same format as kenji's wide crawl)

This commit is contained in:
Noah Levitt 2015-07-15 16:32:02 -07:00
parent 5aea76ab6d
commit 923cd98652
2 changed files with 9 additions and 4 deletions

View File

@ -25,9 +25,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
# Send log output to stdout at the level given by args.log_level. Each record
# is formatted as: timestamp, process id, level name, thread name,
# logger.function(file:line), then the message.
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
# Record startup in the log.
logging.info("brozzler-hq starting")
# Signal handler that turns the signal into a brozzler.ShutdownRequested
# exception. It is registered for SIGTERM with signal.signal() further down in
# this file. signum and frame are the handler arguments; neither is used here.
def sigterm(signum, frame):
raise brozzler.ShutdownRequested("shutdown requested (caught SIGTERM)")
def sigint(signum, frame):
@ -36,6 +33,8 @@ def sigint(signum, frame):
# Register the sigterm and sigint handler functions for SIGTERM and SIGINT.
signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint)
# Record startup in the log.
logging.info("brozzler-hq starting")
# Construct a BrozzlerHQDb from the db_file argument, then a BrozzlerHQ from
# the amqp_url argument and that db object.
db = brozzler.hq.BrozzlerHQDb(db_file=args.db_file)
hq = brozzler.hq.BrozzlerHQ(amqp_url=args.amqp_url, db=db)

View File

@ -107,6 +107,11 @@ class BrozzlerWorker:
else:
raise
# Screenshot callback: _brozzle_site passes this (wrapped in a lambda) as the
# on_screenshot argument of browser.browse_page().
# When self._proxy_server and self._enable_warcprox_features are both truthy,
# it logs the crawl_url and hands the PNG to self._putmeta() with
# url=crawl_url.url and content_type "image/png".
# NOTE(review): indentation is not preserved in this view; presumably both the
# logging call and the _putmeta call belong to the if body -- confirm against
# the real file.
# The site argument is not used in the visible body.
def _on_screenshot(self, site, crawl_url, screenshot_png):
if self._proxy_server and self._enable_warcprox_features:
logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
def _brozzle_site(self, browser, site):
start = time.time()
crawl_url = None
@ -117,7 +122,8 @@ class BrozzlerWorker:
crawl_url = self._next_url(site)
logging.info("crawling {}".format(crawl_url))
self._try_youtube_dl(site, crawl_url)
crawl_url.outlinks = browser.browse_page(crawl_url.url)
crawl_url.outlinks = browser.browse_page(crawl_url.url,
on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
self._completed_url(site, crawl_url)
crawl_url = None
except kombu.simple.Empty: