mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #171 from vbanos/screenshot-full-screen
Add option to capture full page screenshot
This commit is contained in:
commit
e5a3ada349
@ -410,8 +410,9 @@ class Browser:
|
|||||||
on_request=None, on_response=None,
|
on_request=None, on_response=None,
|
||||||
on_service_worker_version_updated=None, on_screenshot=None,
|
on_service_worker_version_updated=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
screenshot_full_page=False, skip_extract_outlinks=False,
|
||||||
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300,
|
||||||
|
behavior_timeout=900):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
@ -486,12 +487,12 @@ class Browser:
|
|||||||
'login navigated away from %s; returning!',
|
'login navigated away from %s; returning!',
|
||||||
page_url)
|
page_url)
|
||||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||||
if on_screenshot:
|
|
||||||
self._try_screenshot(on_screenshot)
|
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters,
|
page_url, behavior_parameters,
|
||||||
behaviors_dir=behaviors_dir)
|
behaviors_dir=behaviors_dir)
|
||||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||||
|
if on_screenshot:
|
||||||
|
self._try_screenshot(on_screenshot, screenshot_full_page)
|
||||||
if skip_extract_outlinks:
|
if skip_extract_outlinks:
|
||||||
outlinks = []
|
outlinks = []
|
||||||
else:
|
else:
|
||||||
@ -512,10 +513,15 @@ class Browser:
|
|||||||
self.websock_thread.on_request = None
|
self.websock_thread.on_request = None
|
||||||
self.websock_thread.on_response = None
|
self.websock_thread.on_response = None
|
||||||
|
|
||||||
def _try_screenshot(self, on_screenshot):
|
def _try_screenshot(self, on_screenshot, full_page=False):
|
||||||
|
"""The browser instance must be scrolled to the top of the page before
|
||||||
|
trying to get a screenshot.
|
||||||
|
"""
|
||||||
|
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
|
||||||
|
params={'expression': 'window.scroll(0,0)'})
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
try:
|
try:
|
||||||
jpeg_bytes = self.screenshot()
|
jpeg_bytes = self.screenshot(full_page)
|
||||||
on_screenshot(jpeg_bytes)
|
on_screenshot(jpeg_bytes)
|
||||||
return
|
return
|
||||||
except BrowsingTimeout as e:
|
except BrowsingTimeout as e:
|
||||||
@ -591,10 +597,36 @@ class Browser:
|
|||||||
'problem extracting outlinks, result message: %s', message)
|
'problem extracting outlinks, result message: %s', message)
|
||||||
return frozenset()
|
return frozenset()
|
||||||
|
|
||||||
def screenshot(self, timeout=45):
|
def screenshot(self, full_page=False, timeout=45):
|
||||||
|
"""Optionally capture full page screenshot using puppeteer as an
|
||||||
|
inspiration:
|
||||||
|
https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
|
||||||
|
"""
|
||||||
self.logger.info('taking screenshot')
|
self.logger.info('taking screenshot')
|
||||||
|
if full_page:
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
msg_id = self.send_to_chrome(method='Page.captureScreenshot')
|
msg_id = self.send_to_chrome(method='Page.getLayoutMetrics')
|
||||||
|
self._wait_for(
|
||||||
|
lambda: self.websock_thread.received_result(msg_id),
|
||||||
|
timeout=timeout)
|
||||||
|
message = self.websock_thread.pop_result(msg_id)
|
||||||
|
width = message['result']['contentSize']['width']
|
||||||
|
height = message['result']['contentSize']['height']
|
||||||
|
clip = dict(x=0, y=0, width=width, height=height, scale=1)
|
||||||
|
deviceScaleFactor = 1
|
||||||
|
screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
|
||||||
|
self.send_to_chrome(
|
||||||
|
method='Emulation.setDeviceMetricsOverride',
|
||||||
|
params=dict(mobile=False, width=width, height=height,
|
||||||
|
deviceScaleFactor=deviceScaleFactor,
|
||||||
|
screenOrientation=screenOrientation)
|
||||||
|
)
|
||||||
|
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
|
||||||
|
else:
|
||||||
|
capture_params = {'format': 'jpeg', 'quality': 95}
|
||||||
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
|
msg_id = self.send_to_chrome(method='Page.captureScreenshot',
|
||||||
|
params=capture_params)
|
||||||
self._wait_for(
|
self._wait_for(
|
||||||
lambda: self.websock_thread.received_result(msg_id),
|
lambda: self.websock_thread.received_result(msg_id),
|
||||||
timeout=timeout)
|
timeout=timeout)
|
||||||
|
@ -165,22 +165,16 @@ class BrozzlerWorker:
|
|||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||||
|
|
||||||
def full_and_thumb_jpegs(self, large_png):
|
def thumb_jpeg(self, full_jpeg):
|
||||||
# these screenshots never have any alpha (right?)
|
"""Create JPEG thumbnail.
|
||||||
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
|
"""
|
||||||
|
img = PIL.Image.open(io.BytesIO(full_jpeg))
|
||||||
out = io.BytesIO()
|
|
||||||
img.save(out, "jpeg", quality=95)
|
|
||||||
full_jpeg = out.getbuffer()
|
|
||||||
|
|
||||||
thumb_width = 300
|
thumb_width = 300
|
||||||
thumb_height = (thumb_width / img.size[0]) * img.size[1]
|
thumb_height = (thumb_width / img.size[0]) * img.size[1]
|
||||||
img.thumbnail((thumb_width, thumb_height))
|
img.thumbnail((thumb_width, thumb_height))
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
img.save(out, "jpeg", quality=95)
|
img.save(out, "jpeg", quality=95)
|
||||||
thumb_jpeg = out.getbuffer()
|
return out.getbuffer()
|
||||||
|
|
||||||
return full_jpeg, thumb_jpeg
|
|
||||||
|
|
||||||
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
||||||
on_request=None, enable_youtube_dl=True):
|
on_request=None, enable_youtube_dl=True):
|
||||||
@ -226,15 +220,14 @@ class BrozzlerWorker:
|
|||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_jpeg):
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_jpeg)
|
||||||
if self._using_warcprox(site):
|
if self._using_warcprox(site):
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||||
"screenshot for %s", self._proxy_for(site), page)
|
"screenshot for %s", self._proxy_for(site), page)
|
||||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
|
||||||
screenshot_png)
|
|
||||||
self._warcprox_write_record(
|
self._warcprox_write_record(
|
||||||
warcprox_address=self._proxy_for(site),
|
warcprox_address=self._proxy_for(site),
|
||||||
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user