replace vagrant-brozzler-new-site with python script that fills in default options and passes through others

This commit is contained in:
Noah Levitt 2016-09-22 01:47:23 +01:00
parent cc9517cb45
commit 2462efc4ed
3 changed files with 87 additions and 15 deletions

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b6.dev84',
version='1.1b6.dev85',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python
'''
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.
Fills in the --proxy and --enable-warcprox-features options automatically.
Some other options are passed through.
This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library.
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import sys
import os
import argparse
import subprocess
try:
    # shlex.quote is available on python 3.3+
    from shlex import quote
except ImportError:
    # older interpreters (python 2.7, 3.2) provide the same function in pipes
    from pipes import quote
def main(argv=None):
    '''
    Queues a seed url by running brozzler-new-site inside the vagrant vm.

    Parses the command line, rebuilds the supported options as one shell
    command string, changes to the directory containing this script (where
    the Vagrantfile is expected to be) and runs the command with
    "vagrant ssh".

    Args:
        argv: full argument vector, program name first; defaults to
            sys.argv when None

    Returns:
        the return code of the "vagrant ssh" subprocess
    '''
    if argv is None:
        argv = sys.argv
    # an empty argv has no program name; prog=None lets argparse fall back to
    # its own default instead of failing on argv[0]
    prog = os.path.basename(argv[0]) if argv else None

    arg_parser = argparse.ArgumentParser(prog=prog)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='quiet', action='store_true')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true')
    args = arg_parser.parse_args(args=argv[1:])

    # Everything after "vagrant ssh --" is interpreted by a shell inside the
    # vm, so every user-supplied value is shell-quoted before being joined
    # into the command string.
    options = []
    if args.time_limit:
        options.append('--time-limit=%s' % quote(args.time_limit))
    if args.ignore_robots:
        options.append('--ignore-robots')
    if args.warcprox_meta:
        options.append('--warcprox-meta=%s' % quote(args.warcprox_meta))
    if args.quiet:
        options.append('--quiet')
    if args.verbose:
        options.append('--verbose')

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
    # because dirname() of a bare relative filename is '' and chdir('') fails
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    cmd = ' '.join(
            [
                'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages',
                '/home/vagrant/brozzler-ve34/bin/python',
                '/home/vagrant/brozzler-ve34/bin/brozzler-new-site',
                '--proxy=localhost:8000',
                '--enable-warcprox-features',
            ] + options + [quote(args.seed)])
    return subprocess.call(['vagrant', 'ssh', '--', cmd])


if __name__ == '__main__':
    # exit with the status of "vagrant ssh" so callers can detect failure
    sys.exit(main(sys.argv))

View File

@ -1,14 +0,0 @@
#!/bin/bash
#
# vagrant-brozzler-new-site.sh - run brozzler-new-site inside the vagrant vm to
# queue a job for your vagrant brozzler deployment
#
# All command line arguments are passed through to brozzler-new-site.
#

# cd to path with Vagrantfile so "vagrant ssh" knows what to do
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# quoted so a path containing spaces works; bail out rather than run vagrant
# from the wrong directory if the cd fails
cd "$script_dir" || exit 1

vagrant ssh -- \
    PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
    /home/vagrant/brozzler-ve34/bin/python \
    /home/vagrant/brozzler-ve34/bin/brozzler-new-site "$@"