mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
replace vagrant-brozzler-new-site with python script that fills in default options and passes through others
This commit is contained in:
parent
cc9517cb45
commit
2462efc4ed
3 changed files with 87 additions and 15 deletions
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b6.dev84',
|
version='1.1b6.dev85',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
86
vagrant/vagrant-brozzler-new-site.py
Executable file
86
vagrant/vagrant-brozzler-new-site.py
Executable file
|
@ -0,0 +1,86 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
'''
|
||||||
|
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
|
||||||
|
queue a site for your vagrant brozzler deployment.
|
||||||
|
|
||||||
|
Fills in the --proxy option automatically. Some other options are passed
|
||||||
|
through.
|
||||||
|
|
||||||
|
This is a standalone script with no dependencies other than python, and should
|
||||||
|
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||||
|
so we can use the argparse library.
|
||||||
|
|
||||||
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
# shlex.quote is available on python 3.3+; older interpreters (including
# python 2.7) provide the same function as pipes.quote. Catch only
# ImportError so that unrelated failures are not silently masked.
try:
    from shlex import quote
except ImportError:
    from pipes import quote
|
||||||
|
|
||||||
|
def main(argv=None):
    '''
    Parses the command line, builds the matching brozzler-new-site command
    and runs it inside the vagrant vm through "vagrant ssh".

    Arguments:
        argv: full argument vector including the program name at index 0;
            defaults to sys.argv when omitted. An empty list is treated as
            "no arguments", so argparse reports the missing SEED.

    Returns:
        the exit status returned by subprocess.call for "vagrant ssh"

    Raises:
        SystemExit: raised by argparse on --help or on invalid arguments
    '''
    if argv is None:
        argv = sys.argv
    # prog=None lets argparse fall back to its own default program name
    prog = os.path.basename(argv[0]) if argv else None

    arg_parser = argparse.ArgumentParser(prog=prog)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='quiet', action='store_true')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true')

    args = arg_parser.parse_args(args=argv[1:])

    # The command is handed to "vagrant ssh" as one string and is parsed
    # again by the shell inside the vm, so every user-supplied value has to
    # be shell-quoted (urls routinely contain "&", "?" and ";").
    options = []
    if args.time_limit:
        options.append('--time-limit=%s' % quote(args.time_limit))
    if args.ignore_robots:
        options.append('--ignore-robots')
    if args.warcprox_meta:
        options.append(
                '--warcprox-meta=%s' % quote(args.warcprox_meta))
    if args.quiet:
        options.append('--quiet')
    if args.verbose:
        options.append('--verbose')

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
    # avoids os.chdir('') when the script is invoked by bare filename
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    cmd = (
            'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
            '/home/vagrant/brozzler-ve34/bin/python '
            '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
            '--proxy=localhost:8000 --enable-warcprox-features %s %s') % (
                    ' '.join(options), quote(args.seed))
    return subprocess.call(['vagrant', 'ssh', '--', cmd])

if __name__ == '__main__':
    # propagate the exit status of "vagrant ssh" to the calling shell
    sys.exit(main(sys.argv))
|
||||||
|
|
|
@ -1,14 +0,0 @@
|
||||||
#!/bin/bash
#
# vagrant-brozzler-new-site.sh - run brozzler-new-site inside the vagrant vm to
# queue a job for your vagrant brozzler deployment
#
# All command line arguments are appended to the brozzler-new-site command.
#

# cd to path with Vagrantfile so "vagrant ssh" knows what to do
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# quoted so a checkout path containing spaces works; exit instead of running
# "vagrant ssh" from the wrong directory if the cd fails
cd "$script_dir" || exit 1

vagrant ssh -- \
    PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
    /home/vagrant/brozzler-ve34/bin/python \
    /home/vagrant/brozzler-ve34/bin/brozzler-new-site "$@"
|
|
Loading…
Add table
Add a link
Reference in a new issue