#!/usr/bin/env python
'''
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.

Fills in the --proxy option automatically. Some other options are passed
through.

This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library.

Copyright (C) 2014-2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

import sys
import os
import argparse
import subprocess
try:
    from shlex import quote
except ImportError:
    # older pythons (2.7, 3.2) only offer the quoting function as pipes.quote
    from pipes import quote

# Fixed part of the command line that is run inside the vm; the per-site
# options and the seed url are appended to it by main().
_REMOTE_COMMAND = (
        'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
        '/home/vagrant/brozzler-ve34/bin/python '
        '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
        '--proxy=localhost:8000 --enable-warcprox-features')

def _build_options(args):
    '''
    Translates the parsed command line into brozzler-new-site options.

    Every option that carries a user supplied value is shell-quoted, because
    the resulting strings end up in a command line that is interpreted by a
    shell inside the vm.

    Args:
        args: the argparse namespace produced in main()

    Returns:
        list of option strings, safe to join with spaces
    '''
    options = []
    if args.time_limit:
        options.append('--time-limit=%s' % quote(args.time_limit))
    if args.ignore_robots:
        options.append('--ignore-robots')
    if args.warcprox_meta:
        options.append('--warcprox-meta=%s' % quote(args.warcprox_meta))
    if args.quiet:
        options.append('--quiet')
    if args.verbose:
        options.append('--verbose')
    return options

def main(argv=None):
    '''
    Parses the command line and runs brozzler-new-site inside the vagrant vm.

    Args:
        argv: full argument vector including the program name at index 0;
            defaults to sys.argv

    Returns:
        the exit status of the "vagrant ssh" invocation

    Raises:
        SystemExit: raised by argparse on invalid arguments or --help
    '''
    if argv is None:
        argv = sys.argv

    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='quiet', action='store_true')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true')

    args = arg_parser.parse_args(args=argv[1:])

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
    # keeps this working when __file__ has no directory component
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # The command travels as a single string and is parsed again by the shell
    # in the vm, so the seed url must be quoted too: urls routinely contain
    # characters such as "&", "?" and ";" that are special to the shell.
    cmd = ' '.join(
            [_REMOTE_COMMAND] + _build_options(args) + [quote(args.seed)])
    return subprocess.call(['vagrant', 'ssh', '--', cmd])

if __name__ == '__main__':
    sys.exit(main(sys.argv))