mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: oops, back to dev version number wait 20 seconds to claim sites if none were avail- tweak logging why did those tests fail??? (#117) Add screenshots Add screenshots back to dev version 1.4 for pypi explain --warcprox-auto briefly vagrant readme fixes (thanks funkyfuture) update cryptography dep version
This commit is contained in:
commit
c4fdbe578d
@ -16,7 +16,7 @@ install:
|
||||
- chromium-browser --version
|
||||
- sudo service brozzler-worker restart
|
||||
script:
|
||||
- DISPLAY=:1 py.test -v tests
|
||||
- DISPLAY=:1 py.test --tb=native -v tests
|
||||
after_failure:
|
||||
- chromium-browser --version
|
||||
- sudo cat /var/log/upstart/warcprox.log
|
||||
|
BIN
Brozzler-Dashboard.png
Normal file
BIN
Brozzler-Dashboard.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
BIN
Brozzler-Wayback.png
Normal file
BIN
Brozzler-Wayback.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 MiB |
16
README.rst
16
README.rst
@ -1,4 +1,4 @@
|
||||
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
|
||||
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=1.4
|
||||
:target: https://travis-ci.org/internetarchive/brozzler
|
||||
|
||||
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
|
||||
@ -73,7 +73,7 @@ To install brozzler only::
|
||||
|
||||
pip install brozzler # in a virtualenv if desired
|
||||
|
||||
Launch one or more workers::
|
||||
Launch one or more workers: [*]_ ::
|
||||
|
||||
brozzler-worker --warcprox-auto
|
||||
|
||||
@ -85,6 +85,14 @@ Submit sites not tied to a job::
|
||||
|
||||
brozzler-new-site --time-limit=600 http://example.com/
|
||||
|
||||
.. [*] A note about ``--warcprox-auto``: this option tells brozzler to
|
||||
look for a healthy warcprox instance in the `rethinkdb service registry
|
||||
<https://github.com/internetarchive/doublethink#service-registry>`_. For
|
||||
this to work you need to have at least one instance of warcprox running,
|
||||
with the ``--rethinkdb-services-url`` option pointing to the same rethinkdb
|
||||
services table that brozzler is using. Using ``--warcprox-auto`` is
|
||||
recommended for clustered deployments.
|
||||
|
||||
Job Configuration
|
||||
-----------------
|
||||
|
||||
@ -129,6 +137,8 @@ To start the app, run
|
||||
|
||||
At this point Brozzler Dashboard will be accessible at http://localhost:8000/.
|
||||
|
||||
.. image:: Brozzler-Dashboard.png
|
||||
|
||||
See ``brozzler-dashboard --help`` for configuration options.
|
||||
|
||||
Brozzler Wayback
|
||||
@ -170,6 +180,8 @@ Run pywb like so:
|
||||
|
||||
Then browse http://localhost:8880/brozzler/.
|
||||
|
||||
.. image:: Brozzler-Wayback.png
|
||||
|
||||
Headless Chrome (experimental)
|
||||
------------------------------
|
||||
|
||||
|
@ -17,6 +17,7 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pkg_resources import get_distribution as _get_distribution
|
||||
__version__ = _get_distribution('brozzler').version
|
||||
|
||||
@ -57,18 +58,22 @@ class ReachedLimit(Exception):
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
# monkey-patch log level TRACE
|
||||
TRACE = 5
|
||||
import logging
|
||||
def _logging_trace(msg, *args, **kwargs):
|
||||
logging.root.trace(msg, *args, **kwargs)
|
||||
# monkey-patch log levels TRACE and NOTICE
|
||||
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
|
||||
def _logger_trace(self, msg, *args, **kwargs):
|
||||
if self.isEnabledFor(TRACE):
|
||||
self._log(TRACE, msg, args, **kwargs)
|
||||
logging.trace = _logging_trace
|
||||
if self.isEnabledFor(logging.TRACE):
|
||||
self._log(logging.TRACE, msg, args, **kwargs)
|
||||
logging.Logger.trace = _logger_trace
|
||||
logging._levelToName[TRACE] = 'TRACE'
|
||||
logging._nameToLevel['TRACE'] = TRACE
|
||||
logging.trace = logging.root.trace
|
||||
logging.addLevelName(logging.TRACE, 'TRACE')
|
||||
|
||||
logging.NOTICE = (logging.INFO + logging.WARN) // 2
|
||||
def _logger_notice(self, msg, *args, **kwargs):
|
||||
if self.isEnabledFor(logging.NOTICE):
|
||||
self._log(logging.NOTICE, msg, args, **kwargs)
|
||||
logging.Logger.notice = _logger_notice
|
||||
logging.notice = logging.root.notice
|
||||
logging.addLevelName(logging.NOTICE, 'NOTICE')
|
||||
|
||||
# see https://github.com/internetarchive/brozzler/issues/91
|
||||
def _logging_handler_handle(self, record):
|
||||
|
@ -317,7 +317,7 @@ class Browser:
|
||||
kwargs['id'] = msg_id
|
||||
msg = json.dumps(kwargs, separators=',:')
|
||||
logging.log(
|
||||
brozzler.TRACE if suppress_logging else logging.DEBUG,
|
||||
logging.TRACE if suppress_logging else logging.DEBUG,
|
||||
'sending message to %s: %s', self.websock, msg)
|
||||
self.websock.send(msg)
|
||||
return msg_id
|
||||
|
@ -43,15 +43,14 @@ def add_common_options(arg_parser, argv=None):
|
||||
argv = argv or sys.argv
|
||||
arg_parser.add_argument(
|
||||
'-q', '--quiet', dest='log_level', action='store_const',
|
||||
default=logging.INFO, const=logging.WARN, help=(
|
||||
'quiet logging, only warnings and errors'))
|
||||
default=logging.INFO, const=logging.NOTICE, help='quiet logging')
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='log_level', action='store_const',
|
||||
default=logging.INFO, const=logging.DEBUG, help=(
|
||||
'verbose logging'))
|
||||
arg_parser.add_argument(
|
||||
'--trace', dest='log_level', action='store_const',
|
||||
default=logging.INFO, const=brozzler.TRACE, help=(
|
||||
default=logging.INFO, const=logging.TRACE, help=(
|
||||
'very verbose logging'))
|
||||
# arg_parser.add_argument(
|
||||
# '-s', '--silent', dest='log_level', action='store_const',
|
||||
|
@ -94,6 +94,7 @@ class RethinkDbFrontier:
|
||||
k, expected, result))
|
||||
|
||||
def claim_sites(self, n=1):
|
||||
self.logger.trace('claiming up to %s sites to brozzle', n)
|
||||
result = (
|
||||
self.rr.table('sites').get_all(r.args(
|
||||
r.db(self.rr.dbname).table('sites', read_mode='majority')
|
||||
@ -145,6 +146,7 @@ class RethinkDbFrontier:
|
||||
result["changes"][i]["old_val"]["last_claimed"])
|
||||
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
|
||||
sites.append(site)
|
||||
self.logger.debug('claimed %s sites', len(sites))
|
||||
if sites:
|
||||
return sites
|
||||
else:
|
||||
|
@ -446,6 +446,7 @@ class BrozzlerWorker:
|
||||
Raises:
|
||||
NoBrowsersAvailable if none available
|
||||
'''
|
||||
# acquire_multi() raises NoBrowsersAvailable if none available
|
||||
browsers = self._browser_pool.acquire_multi(
|
||||
(self._browser_pool.num_available() + 1) // 2)
|
||||
try:
|
||||
@ -468,22 +469,26 @@ class BrozzlerWorker:
|
||||
self._browser_pool.release(browsers[i])
|
||||
|
||||
def run(self):
|
||||
self.logger.info("brozzler worker starting")
|
||||
self.logger.notice("brozzler worker starting")
|
||||
last_nothing_to_claim = 0
|
||||
try:
|
||||
while not self._shutdown.is_set():
|
||||
self._service_heartbeat_if_due()
|
||||
try:
|
||||
self._start_browsing_some_sites()
|
||||
except brozzler.browser.NoBrowsersAvailable:
|
||||
logging.trace(
|
||||
"all %s browsers are in use", self._max_browsers)
|
||||
except brozzler.NothingToClaim:
|
||||
logging.trace(
|
||||
"all active sites are already claimed by a "
|
||||
"brozzler worker")
|
||||
if time.time() - last_nothing_to_claim > 20:
|
||||
try:
|
||||
self._start_browsing_some_sites()
|
||||
except brozzler.browser.NoBrowsersAvailable:
|
||||
logging.trace(
|
||||
"all %s browsers are in use",
|
||||
self._max_browsers)
|
||||
except brozzler.NothingToClaim:
|
||||
last_nothing_to_claim = time.time()
|
||||
logging.trace(
|
||||
"nothing to claim, all available active sites "
|
||||
"are already claimed by a brozzler worker")
|
||||
time.sleep(0.5)
|
||||
|
||||
self.logger.info("shutdown requested")
|
||||
self.logger.notice("shutdown requested")
|
||||
except r.ReqlError as e:
|
||||
self.logger.error(
|
||||
"caught rethinkdb exception, will try to proceed",
|
||||
|
31
setup.py
31
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.4.dev297',
|
||||
version='1.5.dev302',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -63,32 +63,35 @@ setuptools.setup(
|
||||
],
|
||||
},
|
||||
install_requires=[
|
||||
'PyYAML',
|
||||
'youtube-dl',
|
||||
'PyYAML>=3.12',
|
||||
'youtube-dl>=2018.7.21',
|
||||
'reppy==0.3.4',
|
||||
'requests',
|
||||
'websocket-client!=0.39.0,!=0.49.0',
|
||||
'python-magic',
|
||||
'requests>=2.18.4',
|
||||
'websocket-client>=0.39.0,<=0.48.0',
|
||||
'pillow>=5.2.0',
|
||||
'urlcanon>=0.1.dev23',
|
||||
'doublethink>=0.2.0.dev88',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'cerberus==1.0.1',
|
||||
'jinja2',
|
||||
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
|
||||
'python-magic',
|
||||
'rethinkdb>=2.3',
|
||||
'cerberus>=1.0.1',
|
||||
'jinja2>=2.10',
|
||||
'cryptography>=2.3',
|
||||
],
|
||||
extras_require={
|
||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||
'dashboard': [
|
||||
'flask>=0.11',
|
||||
'gunicorn>=19.8.1'
|
||||
],
|
||||
'easy': [
|
||||
'warcprox>=2.4b2.dev173',
|
||||
'pywb<2',
|
||||
'pywb>=0.33.2,<2',
|
||||
'flask>=0.11',
|
||||
'gunicorn'
|
||||
'gunicorn>=19.8.1'
|
||||
],
|
||||
},
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
'Development Status :: 4 - Beta',
|
||||
'Development Status :: 5 - Production/Stable',
|
||||
'Environment :: Console',
|
||||
'License :: OSI Approved :: Apache Software License',
|
||||
'Programming Language :: Python :: 3.4',
|
||||
|
@ -53,9 +53,9 @@ def test_run_command(capsys, cmd):
|
||||
proc = subprocess.Popen(
|
||||
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
out, err = proc.communicate()
|
||||
assert err == b''
|
||||
assert out == ('brozzler %s - %s\n' % (
|
||||
brozzler.__version__, cmd)).encode('ascii')
|
||||
assert err == b''
|
||||
|
||||
def test_rethinkdb_up():
|
||||
'''Check that rethinkdb is up and running.'''
|
||||
|
@ -1,15 +1,14 @@
|
||||
Single-VM Vagrant Brozzler Deployment
|
||||
-------------------------------------
|
||||
|
||||
This is a work in progress. Vagrant + ansible configuration for a single-vm
|
||||
deployment of brozzler and warcprox with dependencies (notably rethinkdb).
|
||||
This is a vagrant + ansible configuration for a single-vm deployment of
|
||||
brozzler and warcprox with dependencies (notably rethinkdb).
|
||||
|
||||
The idea is for this to be a quick way for people to get up and running with a
|
||||
deployment resembling a real distributed deployment, and to offer a starting
|
||||
configuration for people to adapt to their clusters.
|
||||
|
||||
And equally important, as a harness for integration tests. (As of now brozzler
|
||||
itself has no automated tests!)
|
||||
And equally important, as a harness for integration tests.
|
||||
|
||||
You'll need vagrant installed.
|
||||
https://www.vagrantup.com/docs/installation/
|
||||
@ -25,27 +24,27 @@ the brozzler virtualenv.
|
||||
::
|
||||
|
||||
my-laptop$ vagrant ssh
|
||||
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$
|
||||
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate
|
||||
(brozzler-ve34)vagrant@brzl:~$
|
||||
|
||||
Then you can run brozzler-new-site:
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
|
||||
--proxy=localhost:8000 http://example.com/
|
||||
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
||||
|
||||
|
||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
|
||||
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF
|
||||
id: job1
|
||||
proxy: localhost:8000 # point at warcprox for archiving
|
||||
seeds:
|
||||
- url: https://example.org/
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
|
||||
- url: https://example.org/
|
||||
EOF
|
||||
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml
|
||||
|
||||
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
|
||||
./logs (via vagrant folders syncing).
|
||||
|
Loading…
x
Reference in New Issue
Block a user