mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Save screenshots as metadata records using warcprox PUTMETA (same format as Kenji's wide crawl)
This commit is contained in:
parent
5aea76ab6d
commit
923cd98652
@ -25,9 +25,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
|
||||
logging.info("brozzler-hq starting")
|
||||
|
||||
|
||||
def sigterm(signum, frame):
|
||||
raise brozzler.ShutdownRequested("shutdown requested (caught SIGTERM)")
|
||||
def sigint(signum, frame):
|
||||
@ -36,6 +33,8 @@ def sigint(signum, frame):
|
||||
signal.signal(signal.SIGTERM, sigterm)
|
||||
signal.signal(signal.SIGINT, sigint)
|
||||
|
||||
logging.info("brozzler-hq starting")
|
||||
|
||||
db = brozzler.hq.BrozzlerHQDb(db_file=args.db_file)
|
||||
hq = brozzler.hq.BrozzlerHQ(amqp_url=args.amqp_url, db=db)
|
||||
|
||||
|
@ -107,6 +107,11 @@ class BrozzlerWorker:
|
||||
else:
|
||||
raise
|
||||
|
||||
def _on_screenshot(self, site, crawl_url, screenshot_png):
|
||||
if self._proxy_server and self._enable_warcprox_features:
|
||||
logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
|
||||
self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
|
||||
|
||||
def _brozzle_site(self, browser, site):
|
||||
start = time.time()
|
||||
crawl_url = None
|
||||
@ -117,7 +122,8 @@ class BrozzlerWorker:
|
||||
crawl_url = self._next_url(site)
|
||||
logging.info("crawling {}".format(crawl_url))
|
||||
self._try_youtube_dl(site, crawl_url)
|
||||
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
||||
crawl_url.outlinks = browser.browse_page(crawl_url.url,
|
||||
on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
|
||||
self._completed_url(site, crawl_url)
|
||||
crawl_url = None
|
||||
except kombu.simple.Empty:
|
||||
|
Loading…
x
Reference in New Issue
Block a user