Merge pull request #171 from vbanos/screenshot-full-screen

Add option to capture full page screenshot
This commit is contained in:
Noah Levitt 2019-10-09 16:27:05 -07:00 committed by GitHub
commit e5a3ada349
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 23 deletions

View File

@ -410,8 +410,9 @@ class Browser:
on_request=None, on_response=None, on_request=None, on_response=None,
on_service_worker_version_updated=None, on_screenshot=None, on_service_worker_version_updated=None, on_screenshot=None,
username=None, password=None, hashtags=None, username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False, screenshot_full_page=False, skip_extract_outlinks=False,
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300,
behavior_timeout=900):
''' '''
Browses page in browser. Browses page in browser.
@ -486,12 +487,12 @@ class Browser:
'login navigated away from %s; returning!', 'login navigated away from %s; returning!',
page_url) page_url)
self.navigate_to_page(page_url, timeout=page_timeout) self.navigate_to_page(page_url, timeout=page_timeout)
if on_screenshot:
self._try_screenshot(on_screenshot)
behavior_script = brozzler.behavior_script( behavior_script = brozzler.behavior_script(
page_url, behavior_parameters, page_url, behavior_parameters,
behaviors_dir=behaviors_dir) behaviors_dir=behaviors_dir)
self.run_behavior(behavior_script, timeout=behavior_timeout) self.run_behavior(behavior_script, timeout=behavior_timeout)
if on_screenshot:
self._try_screenshot(on_screenshot, screenshot_full_page)
if skip_extract_outlinks: if skip_extract_outlinks:
outlinks = [] outlinks = []
else: else:
@ -512,10 +513,15 @@ class Browser:
self.websock_thread.on_request = None self.websock_thread.on_request = None
self.websock_thread.on_response = None self.websock_thread.on_response = None
def _try_screenshot(self, on_screenshot): def _try_screenshot(self, on_screenshot, full_page=False):
"""The browser instance must be scrolled to the top of the page before
trying to get a screenshot.
"""
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
params={'expression': 'window.scroll(0,0)'})
for i in range(3): for i in range(3):
try: try:
jpeg_bytes = self.screenshot() jpeg_bytes = self.screenshot(full_page)
on_screenshot(jpeg_bytes) on_screenshot(jpeg_bytes)
return return
except BrowsingTimeout as e: except BrowsingTimeout as e:
@ -591,10 +597,36 @@ class Browser:
'problem extracting outlinks, result message: %s', message) 'problem extracting outlinks, result message: %s', message)
return frozenset() return frozenset()
def screenshot(self, timeout=45): def screenshot(self, full_page=False, timeout=45):
"""Optionally capture full page screenshot using puppeteer as an
inspiration:
https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
"""
self.logger.info('taking screenshot') self.logger.info('taking screenshot')
if full_page:
self.websock_thread.expect_result(self._command_id.peek()) self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot') msg_id = self.send_to_chrome(method='Page.getLayoutMetrics')
self._wait_for(
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
message = self.websock_thread.pop_result(msg_id)
width = message['result']['contentSize']['width']
height = message['result']['contentSize']['height']
clip = dict(x=0, y=0, width=width, height=height, scale=1)
deviceScaleFactor = 1
screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
self.send_to_chrome(
method='Emulation.setDeviceMetricsOverride',
params=dict(mobile=False, width=width, height=height,
deviceScaleFactor=deviceScaleFactor,
screenOrientation=screenOrientation)
)
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
else:
capture_params = {'format': 'jpeg', 'quality': 95}
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot',
params=capture_params)
self._wait_for( self._wait_for(
lambda: self.websock_thread.received_result(msg_id), lambda: self.websock_thread.received_result(msg_id),
timeout=timeout) timeout=timeout)

View File

@ -165,22 +165,16 @@ class BrozzlerWorker:
raise brozzler.ProxyError( raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e 'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
def full_and_thumb_jpegs(self, large_png): def thumb_jpeg(self, full_jpeg):
# these screenshots never have any alpha (right?) """Create JPEG thumbnail.
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB') """
img = PIL.Image.open(io.BytesIO(full_jpeg))
out = io.BytesIO()
img.save(out, "jpeg", quality=95)
full_jpeg = out.getbuffer()
thumb_width = 300 thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1] thumb_height = (thumb_width / img.size[0]) * img.size[1]
img.thumbnail((thumb_width, thumb_height)) img.thumbnail((thumb_width, thumb_height))
out = io.BytesIO() out = io.BytesIO()
img.save(out, "jpeg", quality=95) img.save(out, "jpeg", quality=95)
thumb_jpeg = out.getbuffer() return out.getbuffer()
return full_jpeg, thumb_jpeg
def brozzle_page(self, browser, site, page, on_screenshot=None, def brozzle_page(self, browser, site, page, on_screenshot=None,
on_request=None, enable_youtube_dl=True): on_request=None, enable_youtube_dl=True):
@ -226,15 +220,14 @@ class BrozzlerWorker:
return outlinks return outlinks
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_png): def _on_screenshot(screenshot_jpeg):
if on_screenshot: if on_screenshot:
on_screenshot(screenshot_png) on_screenshot(screenshot_jpeg)
if self._using_warcprox(site): if self._using_warcprox(site):
self.logger.info( self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with " "sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page) "screenshot for %s", self._proxy_for(site), page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
screenshot_png)
self._warcprox_write_record( self._warcprox_write_record(
warcprox_address=self._proxy_for(site), warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)), url="screenshot:%s" % str(urlcanon.semantic(page.url)),