Merge remote-tracking branch 'upstream/master' into qa

commit 3c5f0e25ff
.travis.yml

@@ -1,7 +1,6 @@
 dist: xenial
 language: python
 python:
-- 3.4
 - 3.5
 - 3.6
 - 3.7
@@ -24,6 +23,8 @@ script:
 - DISPLAY=:1 py.test --tb=native -v tests
 after_failure:
 - chromium-browser --version
+- sudo kill -QUIT $(sudo svstat /etc/service/warcprox | egrep -o 'pid [0-9]+' | awk '{print $2}')
+- sudo kill -QUIT $(sudo svstat /etc/service/brozzler-worker | egrep -o 'pid [0-9]+' | awk '{print $2}')
 - sudo cat /var/log/warcprox.log
 - sudo cat /var/log/brozzler-worker.log
 - sudo cat /var/log/pywb.log
README.rst

@@ -19,7 +19,7 @@ Brozzler is designed to work in conjuction with warcprox for web archiving.
 Requirements
 ------------
 
-- Python 3.4 or later
+- Python 3.5 or later
 - RethinkDB deployment
 - Chromium or Google Chrome >= version 64
 
brozzler/browser.py

@@ -361,8 +361,15 @@ class Browser:
         # disable google analytics and amp analytics
         self.send_to_chrome(
                 method='Network.setBlockedURLs',
-                params={'urls': ['*google-analytics.com/analytics.js',
-                                 '*google-analytics.com/ga.js',
+                params={'urls': ['*google-analytics.com/analytics.js*',
+                                 '*google-analytics.com/ga.js*',
+                                 '*google-analytics.com/ga_exp.js*',
+                                 '*google-analytics.com/urchin.js*',
+                                 '*google-analytics.com/collect*',
+                                 '*google-analytics.com/r/collect*',
+                                 '*google-analytics.com/__utm.gif*',
+                                 '*google-analytics.com/gtm/js?*',
+                                 '*google-analytics.com/cx/api.js*',
                                  '*cdn.ampproject.org/*/amp-analytics*.js']})
 
     def stop(self):
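A note on the pattern change: Chrome's Network.setBlockedURLs treats '*' as a wildcard, and the old patterns ended at '.js', so requests carrying a query string slipped through. A quick illustration using Python's fnmatch, which approximates (but is not identical to) Chrome's wildcard matching:

import fnmatch

url = 'https://www.google-analytics.com/analytics.js?v=1'

# old pattern: must match through the end of the URL, so the query
# string causes a miss
print(fnmatch.fnmatch(url, '*google-analytics.com/analytics.js'))   # False

# new pattern: the trailing '*' absorbs any query string
print(fnmatch.fnmatch(url, '*google-analytics.com/analytics.js*'))  # True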
@@ -410,8 +417,9 @@ class Browser:
             on_request=None, on_response=None,
             on_service_worker_version_updated=None, on_screenshot=None,
             username=None, password=None, hashtags=None,
-            skip_extract_outlinks=False, skip_visit_hashtags=False,
-            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
+            screenshot_full_page=False, skip_extract_outlinks=False,
+            skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300,
+            behavior_timeout=900):
         '''
         Browses page in browser.
 
@@ -486,12 +494,12 @@ class Browser:
                         'login navigated away from %s; returning!',
                         page_url)
                 self.navigate_to_page(page_url, timeout=page_timeout)
-            if on_screenshot:
-                self._try_screenshot(on_screenshot)
             behavior_script = brozzler.behavior_script(
                     page_url, behavior_parameters,
                     behaviors_dir=behaviors_dir)
             self.run_behavior(behavior_script, timeout=behavior_timeout)
+            if on_screenshot:
+                self._try_screenshot(on_screenshot, screenshot_full_page)
             if skip_extract_outlinks:
                 outlinks = []
             else:
@@ -512,10 +520,15 @@ class Browser:
             self.websock_thread.on_request = None
             self.websock_thread.on_response = None
 
-    def _try_screenshot(self, on_screenshot):
+    def _try_screenshot(self, on_screenshot, full_page=False):
+        """The browser instance must be scrolled to the top of the page before
+        trying to get a screenshot.
+        """
+        self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
+                params={'expression': 'window.scroll(0,0)'})
         for i in range(3):
             try:
-                jpeg_bytes = self.screenshot()
+                jpeg_bytes = self.screenshot(full_page)
                 on_screenshot(jpeg_bytes)
                 return
             except BrowsingTimeout as e:
@@ -581,8 +594,8 @@ class Browser:
             if ('result' in message and 'result' in message['result']
                     and 'value' in message['result']['result']):
                 if message['result']['result']['value']:
-                    return frozenset(
-                            message['result']['result']['value'].split('\n'))
+                    return frozenset([str(urlcanon.whatwg(link)) for link in
+                        message['result']['result']['value'].split('\n')])
                 else:
                     # no links found
                     return frozenset()
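Since outlinks now pass through urlcanon's WHATWG canonicalizer before landing in the frozenset, spellings that a browser would treat as the same URL collapse to one entry. A small sketch with the urlcanon library brozzler already uses (the exact canonical form shown is my expectation, not verified output):

import urlcanon

links = [
    'HTTP://EXAMPLE.com:80/foo/../bar',
    'http://example.com/bar',
]
# lowercases scheme and host, drops the default port, resolves dot
# segments; both links should collapse to http://example.com/bar
print(frozenset(str(urlcanon.whatwg(link)) for link in links))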
@@ -591,10 +604,36 @@ class Browser:
                     'problem extracting outlinks, result message: %s', message)
             return frozenset()
 
-    def screenshot(self, timeout=45):
+    def screenshot(self, full_page=False, timeout=45):
+        """Optionally capture full page screenshot using puppeteer as an
+        inspiration:
+        https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
+        """
         self.logger.info('taking screenshot')
+        if full_page:
+            self.websock_thread.expect_result(self._command_id.peek())
+            msg_id = self.send_to_chrome(method='Page.getLayoutMetrics')
+            self._wait_for(
+                    lambda: self.websock_thread.received_result(msg_id),
+                    timeout=timeout)
+            message = self.websock_thread.pop_result(msg_id)
+            width = message['result']['contentSize']['width']
+            height = message['result']['contentSize']['height']
+            clip = dict(x=0, y=0, width=width, height=height, scale=1)
+            deviceScaleFactor = 1
+            screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
+            self.send_to_chrome(
+                    method='Emulation.setDeviceMetricsOverride',
+                    params=dict(mobile=False, width=width, height=height,
+                                deviceScaleFactor=deviceScaleFactor,
+                                screenOrientation=screenOrientation)
+            )
+            capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
+        else:
+            capture_params = {'format': 'jpeg', 'quality': 95}
         self.websock_thread.expect_result(self._command_id.peek())
-        msg_id = self.send_to_chrome(method='Page.captureScreenshot')
+        msg_id = self.send_to_chrome(method='Page.captureScreenshot',
+                params=capture_params)
         self._wait_for(
                 lambda: self.websock_thread.received_result(msg_id),
                 timeout=timeout)
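For reference, the full-page flow above is: measure the laid-out content size with Page.getLayoutMetrics, grow the emulated viewport to match via Emulation.setDeviceMetricsOverride, then pass a clip of the whole document to Page.captureScreenshot. A minimal standalone sketch of the same CDP sequence, assuming a Chrome already running with --remote-debugging-port=9222 and the third-party requests and websocket-client packages (brozzler's own websocket plumbing is not used here):

import base64, itertools, json
import requests
import websocket  # pip install websocket-client

# grab the first page target from the DevTools HTTP endpoint
target = next(t for t in requests.get('http://localhost:9222/json').json()
              if t['type'] == 'page')
ws = websocket.create_connection(target['webSocketDebuggerUrl'])
ids = itertools.count(1)

def call(method, **params):
    # send one CDP command, block until its result comes back
    msg_id = next(ids)
    ws.send(json.dumps({'id': msg_id, 'method': method, 'params': params}))
    while True:
        message = json.loads(ws.recv())
        if message.get('id') == msg_id:
            return message['result']

metrics = call('Page.getLayoutMetrics')
width = metrics['contentSize']['width']
height = metrics['contentSize']['height']
call('Emulation.setDeviceMetricsOverride', mobile=False, width=width,
     height=height, deviceScaleFactor=1,
     screenOrientation={'angle': 0, 'type': 'portraitPrimary'})
result = call('Page.captureScreenshot', format='jpeg', quality=95,
              clip={'x': 0, 'y': 0, 'width': width, 'height': height,
                    'scale': 1})
with open('/tmp/fullpage.jpg', 'wb') as f:
    f.write(base64.b64decode(result['data']))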
brozzler/cli.py

@@ -153,6 +153,9 @@ def brozzle_page(argv=None):
             help='use this password to try to log in if a login form is found')
     arg_parser.add_argument(
             '--proxy', dest='proxy', default=None, help='http proxy')
+    arg_parser.add_argument(
+            '--screenshot-full-page', dest='screenshot_full_page',
+            action='store_true')
     arg_parser.add_argument(
             '--skip-extract-outlinks', dest='skip_extract_outlinks',
             action='store_true')
@@ -174,19 +177,20 @@ def brozzle_page(argv=None):
         'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
         'username': args.username, 'password': args.password})
     page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
-            skip_extract_outlinks=args.skip_extract_outlinks,
-            skip_visit_hashtags=args.skip_visit_hashtags,
-            skip_youtube_dl=args.skip_youtube_dl)
+    worker = brozzler.BrozzlerWorker(
+            frontier=None, proxy=args.proxy,
+            skip_extract_outlinks=args.skip_extract_outlinks,
+            skip_visit_hashtags=args.skip_visit_hashtags,
+            skip_youtube_dl=args.skip_youtube_dl,
+            screenshot_full_page=args.screenshot_full_page)
 
-    def on_screenshot(screenshot_png):
-        OK_CHARS = (string.ascii_letters + string.digits)
-        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
+    def on_screenshot(screenshot_jpeg):
+        OK_CHARS = string.ascii_letters + string.digits
+        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.jpg'.format(
                 ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                 datetime.datetime.now())
-        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
         with open(filename, 'wb') as f:
-            f.write(screenshot_png)
+            f.write(screenshot_jpeg)
         logging.info('wrote screenshot to %s', filename)
 
     browser = brozzler.Browser(chrome_exe=args.chrome_exe)
brozzler/frontier.py

@@ -314,7 +314,7 @@ class RethinkDbFrontier:
         '''
         existing_page.priority += fresh_page.priority
         existing_page.hashtags = list(set(
-                existing_page.hashtags + fresh_page.hashtags))
+                (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
         existing_page.hops_off = min(
                 existing_page.hops_off, fresh_page.hops_off)
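The `or []` guards matter because a page's hashtags field can be None rather than an empty list; the old code raised TypeError in that case. A tiny illustration:

# before: None + list raises TypeError
# after: both sides default to [] before concatenating
existing_hashtags = None
fresh_hashtags = ['#a', '#b']
merged = list(set((existing_hashtags or []) + (fresh_hashtags or [])))
print(sorted(merged))  # ['#a', '#b']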
@@ -375,14 +375,18 @@ class RethinkDbFrontier:
                 decisions['accepted'].add(fresh_page.url)
                 if fresh_page.id in pages:
                     page = pages[fresh_page.id]
-                    page.hashtags = list(set((page.hashtags or [])
-                        + fresh_page.hashtags))
-                    page.priority += fresh_page.priority
+                    self._merge_page(page, fresh_page)
                     counts['updated'] += 1
                 else:
                     pages[fresh_page.id] = fresh_page
                     counts['added'] += 1
 
+        # make sure we're not stepping on our own toes in case we have a link
+        # back to parent_page, which I think happens because of hashtags
+        if parent_page.id in pages:
+            self._merge_page(parent_page, pages[parent_page.id])
+            del pages[parent_page.id]
+
         # insert/replace in batches of 50 to try to avoid this error:
         # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
         # there can be many pages and each one can be very large (many videos,
@@ -392,8 +396,11 @@ class RethinkDbFrontier:
             try:
                 self.logger.debug(
                         'inserting/replacing batch of %s pages', len(batch))
-                result = self.rr.table('pages').insert(
-                        batch, conflict='replace').run()
+                reql = self.rr.table('pages').insert(batch, conflict='replace')
+                self.logger.trace(
+                        'running query self.rr.table("pages").insert(%r, '
+                        'conflict="replace")', batch)
+                result = reql.run()
             except Exception as e:
                 self.logger.error(
                         'problem inserting/replacing batch of %s pages',
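The batching mentioned in the comments (50 pages per insert, to stay under RethinkDB's ~128 MB query-size limit) boils down to slicing the page list. A minimal sketch, with a hypothetical insert_batch() standing in for the reql.run() call above:

def batches(items, batch_size=50):
    # yield successive slices of at most batch_size items
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]

def insert_batch(batch):  # hypothetical stand-in for reql.run()
    print('inserting/replacing batch of %s pages' % len(batch))

pages = [{'id': n, 'url': 'http://example.com/%s' % n} for n in range(120)]
for batch in batches(pages):
    insert_batch(batch)  # batches of 50, 50, 20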
@@ -450,12 +457,15 @@ class RethinkDbFrontier:
         Returns:
             iterator of brozzler.Page
         '''
-        results = self.rr.table("pages").between(
+        query = self.rr.table("pages").between(
                 [site_id, 1 if brozzled is True else 0,
                  r.minval, r.minval],
                 [site_id, 0 if brozzled is False else r.maxval,
                  r.maxval, r.maxval],
-                index="priority_by_site").run()
+                index="priority_by_site")
+        self.logger.trace("running query: %r", query)
+        results = query.run()
         for result in results:
+            self.logger.trace("yielding result: %r", result)
             yield brozzler.Page(self.rr, result)
 
brozzler/worker.py

@@ -50,7 +50,8 @@ class BrozzlerWorker:
             self, frontier, service_registry=None, max_browsers=1,
             chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
             skip_extract_outlinks=False, skip_visit_hashtags=False,
-            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
+            skip_youtube_dl=False, screenshot_full_page=False,
+            page_timeout=300, behavior_timeout=900):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers
@@ -62,6 +63,7 @@ class BrozzlerWorker:
         self._skip_extract_outlinks = skip_extract_outlinks
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
+        self._screenshot_full_page = screenshot_full_page
         self._page_timeout = page_timeout
         self._behavior_timeout = behavior_timeout
@@ -165,22 +167,16 @@ class BrozzlerWorker:
             raise brozzler.ProxyError(
                     'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
 
-    def full_and_thumb_jpegs(self, large_png):
-        # these screenshots never have any alpha (right?)
-        img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
-
-        out = io.BytesIO()
-        img.save(out, "jpeg", quality=95)
-        full_jpeg = out.getbuffer()
-
+    def thumb_jpeg(self, full_jpeg):
+        """Create JPEG thumbnail.
+        """
+        img = PIL.Image.open(io.BytesIO(full_jpeg))
         thumb_width = 300
         thumb_height = (thumb_width / img.size[0]) * img.size[1]
         img.thumbnail((thumb_width, thumb_height))
         out = io.BytesIO()
         img.save(out, "jpeg", quality=95)
-        thumb_jpeg = out.getbuffer()
-
-        return full_jpeg, thumb_jpeg
+        return out.getbuffer()
 
     def brozzle_page(self, browser, site, page, on_screenshot=None,
             on_request=None, enable_youtube_dl=True):
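thumb_jpeg() now assumes it already receives JPEG bytes (Browser.screenshot produces JPEG directly, so the old PNG-to-JPEG conversion is gone) and only makes the 300px-wide thumbnail, height scaled to keep the aspect ratio. A self-contained version of the same transformation, using a synthetic Pillow image in place of a real screenshot:

import io
import PIL.Image

def thumb_jpeg(full_jpeg):
    # same logic as BrozzlerWorker.thumb_jpeg above
    img = PIL.Image.open(io.BytesIO(full_jpeg))
    thumb_width = 300
    thumb_height = (thumb_width / img.size[0]) * img.size[1]
    img.thumbnail((thumb_width, thumb_height))
    out = io.BytesIO()
    img.save(out, 'jpeg', quality=95)
    return out.getbuffer()

# synthetic 1280x2000 "screenshot" standing in for real browser output
buf = io.BytesIO()
PIL.Image.new('RGB', (1280, 2000), 'white').save(buf, 'jpeg')
print(len(thumb_jpeg(buf.getvalue())))  # thumbnail bytes, 300px wide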
@@ -226,15 +222,14 @@ class BrozzlerWorker:
             return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
-        def _on_screenshot(screenshot_png):
+        def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
-                on_screenshot(screenshot_png)
+                on_screenshot(screenshot_jpeg)
             if self._using_warcprox(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
                         "screenshot for %s", self._proxy_for(site), page)
-                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
-                        screenshot_png)
+                thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
                 self._warcprox_write_record(
                         warcprox_address=self._proxy_for(site),
                         url="screenshot:%s" % str(urlcanon.semantic(page.url)),
@@ -302,6 +297,7 @@ class BrozzlerWorker:
                     skip_extract_outlinks=self._skip_extract_outlinks,
                     skip_visit_hashtags=self._skip_visit_hashtags,
                     skip_youtube_dl=self._skip_youtube_dl,
+                    screenshot_full_page=self._screenshot_full_page,
                     page_timeout=self._page_timeout,
                     behavior_timeout=self._behavior_timeout)
             if final_page_url != page.url:
pytest.ini (new file, 6 lines)

@@ -0,0 +1,6 @@
+# https://docs.pytest.org/en/latest/logging.html
+# https://github.com/pytest-dev/pytest/issues/5296
+[pytest]
+log_format = %(asctime)s.%(msecs)03d %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
+
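The log_format keys are standard Python logging attributes, so the same string can be previewed with logging.basicConfig outside of pytest:

import logging

logging.basicConfig(
        level=logging.INFO,
        format=('%(asctime)s.%(msecs)03d %(process)d %(levelname)s '
                '%(threadName)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'),
        datefmt='%Y-%m-%d %H:%M:%S')
logging.info('same line format the tests will emit')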
setup.py (3 lines changed)

@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.8',
+        version='1.5.11',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -95,7 +95,6 @@ setuptools.setup(
         'Development Status :: 5 - Production/Stable',
         'Environment :: Console',
         'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
@@ -32,6 +32,7 @@ import requests
 import subprocess
 import http.server
 import logging
+import sys
 import warcprox
 
 # https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
@@ -733,7 +733,7 @@ def test_hashtag_seed():
     assert pages[0].hashtags == ['#hash',]
 
 def test_hashtag_links():
-    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    rr = doublethink.Rethinker('localhost', db='test_hashtag_links')
     frontier = brozzler.RethinkDbFrontier(rr)
 
     site = brozzler.Site(rr, {'seed': 'http://example.org/'})