mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-21 21:24:48 -04:00
save screenshots as metadata records using warcprox PUTMETA (same format as kenji's wide crawl)
This commit is contained in:
parent
5aea76ab6d
commit
923cd98652
2 changed files with 9 additions and 4 deletions
|
@ -25,9 +25,6 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||||
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||||
|
|
||||||
logging.info("brozzler-hq starting")
|
|
||||||
|
|
||||||
|
|
||||||
def sigterm(signum, frame):
|
def sigterm(signum, frame):
|
||||||
raise brozzler.ShutdownRequested("shutdown requested (caught SIGTERM)")
|
raise brozzler.ShutdownRequested("shutdown requested (caught SIGTERM)")
|
||||||
def sigint(signum, frame):
|
def sigint(signum, frame):
|
||||||
|
@ -36,6 +33,8 @@ def sigint(signum, frame):
|
||||||
signal.signal(signal.SIGTERM, sigterm)
|
signal.signal(signal.SIGTERM, sigterm)
|
||||||
signal.signal(signal.SIGINT, sigint)
|
signal.signal(signal.SIGINT, sigint)
|
||||||
|
|
||||||
|
logging.info("brozzler-hq starting")
|
||||||
|
|
||||||
db = brozzler.hq.BrozzlerHQDb(db_file=args.db_file)
|
db = brozzler.hq.BrozzlerHQDb(db_file=args.db_file)
|
||||||
hq = brozzler.hq.BrozzlerHQ(amqp_url=args.amqp_url, db=db)
|
hq = brozzler.hq.BrozzlerHQ(amqp_url=args.amqp_url, db=db)
|
||||||
|
|
||||||
|
|
|
@ -107,6 +107,11 @@ class BrozzlerWorker:
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def _on_screenshot(self, site, crawl_url, screenshot_png):
|
||||||
|
if self._proxy_server and self._enable_warcprox_features:
|
||||||
|
logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
|
||||||
|
self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
|
||||||
|
|
||||||
def _brozzle_site(self, browser, site):
|
def _brozzle_site(self, browser, site):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
crawl_url = None
|
crawl_url = None
|
||||||
|
@ -117,7 +122,8 @@ class BrozzlerWorker:
|
||||||
crawl_url = self._next_url(site)
|
crawl_url = self._next_url(site)
|
||||||
logging.info("crawling {}".format(crawl_url))
|
logging.info("crawling {}".format(crawl_url))
|
||||||
self._try_youtube_dl(site, crawl_url)
|
self._try_youtube_dl(site, crawl_url)
|
||||||
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
crawl_url.outlinks = browser.browse_page(crawl_url.url,
|
||||||
|
on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
|
||||||
self._completed_url(site, crawl_url)
|
self._completed_url(site, crawl_url)
|
||||||
crawl_url = None
|
crawl_url = None
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue