Fix JPEG thumbnail problems

Because we run JS behaviors before we capture the screenshot, the
browser may be scrolled down the page. When we don't capture the full
page, we may therefore get a screenshot of the bottom part of the page
instead of the top. To fix that, we run `window.scroll(0, 0)` before
capturing the screenshot.

We rename method `BrozzlerWorker.full_and_thumb_jpegs` to
`BrozzlerWorker.thumb_jpeg` because, after our changes to
`Browser.screenshot`, we already get a JPEG from the browser.

`thumb_jpeg` now returns only a thumbnail. There is no longer any need
to read a PNG and convert it to JPEG, so screenshots will be a bit
faster :)
This commit is contained in:
Vangelis Banos 2019-10-09 13:34:38 +00:00
parent 674da4aa99
commit ba901e3a99
2 changed files with 15 additions and 17 deletions

View File

@ -514,6 +514,11 @@ class Browser:
self.websock_thread.on_response = None
def _try_screenshot(self, on_screenshot, full_page=False):
"""The browser instance must be scrolled to the top of the page before
trying to get a screenshot.
"""
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
params={'expression': 'window.scroll(0,0)'})
for i in range(3):
try:
jpeg_bytes = self.screenshot(full_page)
@ -616,9 +621,9 @@ class Browser:
deviceScaleFactor=deviceScaleFactor,
screenOrientation=screenOrientation)
)
capture_params = {'format': 'jpeg', quality: 95, 'clip': clip}
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
else:
capture_params = {'format': 'jpeg', quality: 95}
capture_params = {'format': 'jpeg', 'quality': 95}
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot',
params=capture_params)

View File

@ -165,22 +165,16 @@ class BrozzlerWorker:
raise brozzler.ProxyError(
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
def full_and_thumb_jpegs(self, large_png):
# these screenshots never have any alpha (right?)
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
out = io.BytesIO()
img.save(out, "jpeg", quality=95)
full_jpeg = out.getbuffer()
def thumb_jpeg(self, full_jpeg):
"""Create JPEG thumbnail.
"""
img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300
thumb_height = (thumb_width / img.size[0]) * img.size[1]
img.thumbnail((thumb_width, thumb_height))
out = io.BytesIO()
img.save(out, "jpeg", quality=95)
thumb_jpeg = out.getbuffer()
return full_jpeg, thumb_jpeg
return out.getbuffer()
def brozzle_page(self, browser, site, page, on_screenshot=None,
on_request=None, enable_youtube_dl=True):
@ -226,15 +220,14 @@ class BrozzlerWorker:
return outlinks
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_png):
def _on_screenshot(screenshot_jpeg):
if on_screenshot:
on_screenshot(screenshot_png)
on_screenshot(screenshot_jpeg)
if self._using_warcprox(site):
self.logger.info(
"sending WARCPROX_WRITE_RECORD request to %s with "
"screenshot for %s", self._proxy_for(site), page)
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png)
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
self._warcprox_write_record(
warcprox_address=self._proxy_for(site),
url="screenshot:%s" % str(urlcanon.semantic(page.url)),