mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-07-26 00:05:42 -04:00
Fix JPEG thumbnail problems
Because we run JS behaviors before capturing the screenshot, the browser may be scrolled down the page. When we don't capture the full page, we may therefore get a screenshot of the bottom part of the page instead of the top. To fix that, we run `window.scroll(0, 0)` before capturing the screenshot. We also change the method `BrozzlerWorker.full_and_thumb_jpegs` to `BrozzlerWorker.thumb_jpeg`, because after our changes to `Browser.screenshot` we already get a JPEG from the browser. `thumb_jpeg` now returns only a thumbnail; there is no longer any need to read a PNG and convert it to JPEG. This means that screenshots will be a bit faster now :)
This commit is contained in:
parent
674da4aa99
commit
ba901e3a99
2 changed files with 15 additions and 17 deletions
|
@ -514,6 +514,11 @@ class Browser:
|
||||||
self.websock_thread.on_response = None
|
self.websock_thread.on_response = None
|
||||||
|
|
||||||
def _try_screenshot(self, on_screenshot, full_page=False):
|
def _try_screenshot(self, on_screenshot, full_page=False):
|
||||||
|
"""The browser instance must be scrolled to the top of the page before
|
||||||
|
trying to get a screenshot.
|
||||||
|
"""
|
||||||
|
self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
|
||||||
|
params={'expression': 'window.scroll(0,0)'})
|
||||||
for i in range(3):
|
for i in range(3):
|
||||||
try:
|
try:
|
||||||
jpeg_bytes = self.screenshot(full_page)
|
jpeg_bytes = self.screenshot(full_page)
|
||||||
|
@ -616,9 +621,9 @@ class Browser:
|
||||||
deviceScaleFactor=deviceScaleFactor,
|
deviceScaleFactor=deviceScaleFactor,
|
||||||
screenOrientation=screenOrientation)
|
screenOrientation=screenOrientation)
|
||||||
)
|
)
|
||||||
capture_params = {'format': 'jpeg', quality: 95, 'clip': clip}
|
capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
|
||||||
else:
|
else:
|
||||||
capture_params = {'format': 'jpeg', quality: 95}
|
capture_params = {'format': 'jpeg', 'quality': 95}
|
||||||
self.websock_thread.expect_result(self._command_id.peek())
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
msg_id = self.send_to_chrome(method='Page.captureScreenshot',
|
msg_id = self.send_to_chrome(method='Page.captureScreenshot',
|
||||||
params=capture_params)
|
params=capture_params)
|
||||||
|
|
|
@ -165,22 +165,16 @@ class BrozzlerWorker:
|
||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
|
||||||
|
|
||||||
def full_and_thumb_jpegs(self, large_png):
|
def thumb_jpeg(self, full_jpeg):
|
||||||
# these screenshots never have any alpha (right?)
|
"""Create JPEG thumbnail.
|
||||||
img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
|
"""
|
||||||
|
img = PIL.Image.open(io.BytesIO(full_jpeg))
|
||||||
out = io.BytesIO()
|
|
||||||
img.save(out, "jpeg", quality=95)
|
|
||||||
full_jpeg = out.getbuffer()
|
|
||||||
|
|
||||||
thumb_width = 300
|
thumb_width = 300
|
||||||
thumb_height = (thumb_width / img.size[0]) * img.size[1]
|
thumb_height = (thumb_width / img.size[0]) * img.size[1]
|
||||||
img.thumbnail((thumb_width, thumb_height))
|
img.thumbnail((thumb_width, thumb_height))
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
img.save(out, "jpeg", quality=95)
|
img.save(out, "jpeg", quality=95)
|
||||||
thumb_jpeg = out.getbuffer()
|
return out.getbuffer()
|
||||||
|
|
||||||
return full_jpeg, thumb_jpeg
|
|
||||||
|
|
||||||
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
def brozzle_page(self, browser, site, page, on_screenshot=None,
|
||||||
on_request=None, enable_youtube_dl=True):
|
on_request=None, enable_youtube_dl=True):
|
||||||
|
@ -226,15 +220,14 @@ class BrozzlerWorker:
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_jpeg):
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_jpeg)
|
||||||
if self._using_warcprox(site):
|
if self._using_warcprox(site):
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||||
"screenshot for %s", self._proxy_for(site), page)
|
"screenshot for %s", self._proxy_for(site), page)
|
||||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
|
||||||
screenshot_png)
|
|
||||||
self._warcprox_write_record(
|
self._warcprox_write_record(
|
||||||
warcprox_address=self._proxy_for(site),
|
warcprox_address=self._proxy_for(site),
|
||||||
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
url="screenshot:%s" % str(urlcanon.semantic(page.url)),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue