mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge 6ed234bb680b42269f56f94bf26d57eea0b4b752 into 8afe9b50143b6f619e568471372362d9c9a3a5d9
This commit is contained in:
commit
000032bcb2
15
job-conf.rst
15
job-conf.rst
@ -14,7 +14,6 @@ Example
|
||||
|
||||
id: myjob
|
||||
time_limit: 60 # seconds
|
||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||
ignore_robots: false
|
||||
max_claimed_sites: 2
|
||||
warcprox_meta:
|
||||
@ -186,16 +185,6 @@ enforced at the seed level. If a time limit is specified at the top level, it
|
||||
is inherited by each seed as described above, and enforced individually on each
|
||||
seed.
|
||||
|
||||
``proxy``
|
||||
~~~~~~~~~
|
||||
+--------+----------+---------+
|
||||
| type | required | default |
|
||||
+========+==========+=========+
|
||||
| string | no | *none* |
|
||||
+--------+----------+---------+
|
||||
HTTP proxy, with the format ``host:port``. Typically configured to point to
|
||||
warcprox for archival crawling.
|
||||
|
||||
``ignore_robots``
|
||||
~~~~~~~~~~~~~~~~~
|
||||
+---------+----------+-----------+
|
||||
@ -226,8 +215,8 @@ to contact the operator if the crawl is causing problems.
|
||||
+============+==========+===========+
|
||||
| dictionary | no | ``false`` |
|
||||
+------------+----------+-----------+
|
||||
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
||||
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
||||
Specifies the ``Warcprox-Meta`` header to send with every request.
|
||||
The value of the ``Warcprox-Meta`` header is a JSON blob. It is
|
||||
used to pass settings and information to warcprox. Warcprox does not forward
|
||||
the header on to the remote site. For further explanation of this field and
|
||||
its uses see
|
||||
|
@ -31,16 +31,15 @@ Then you can run brozzler-new-site:
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
||||
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site http://example.com/
|
||||
|
||||
|
||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||
Or brozzler-new-job:
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
|
||||
id: job1
|
||||
proxy: localhost:8000 # point at warcprox for archiving
|
||||
seeds:
|
||||
- url: https://example.org/
|
||||
EOF
|
||||
|
@ -33,16 +33,17 @@ def main(argv=[]):
|
||||
'job_conf_file', metavar='JOB_CONF_FILE',
|
||||
help='brozzler job configuration file in yaml')
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
args.job_conf_file = os.path.realpath(args.job_conf_file)
|
||||
|
||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
os.chdir(os.path.realpath(os.path.dirname(__file__)))
|
||||
|
||||
with open(args.job_conf_file, 'rb') as f:
|
||||
subprocess.call([
|
||||
'vagrant', 'ssh', '--',
|
||||
'f=`mktemp` && cat > $f && '
|
||||
'/home/vagrant/brozzler-ve3/bin/python '
|
||||
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
|
||||
'/opt/brozzler-ve3/bin/python '
|
||||
'/opt/brozzler-ve3/bin/brozzler-new-job $f'],
|
||||
stdin=f)
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -71,7 +71,7 @@ def main(argv=[]):
|
||||
options.append('--verbose')
|
||||
|
||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
os.chdir(os.path.realpath(os.path.dirname(__file__)))
|
||||
|
||||
cmd = (
|
||||
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
|
||||
|
Loading…
x
Reference in New Issue
Block a user