mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-24 06:34:54 -04:00
Merge 6ed234bb68
into 8afe9b5014
This commit is contained in:
commit
000032bcb2
4 changed files with 9 additions and 20 deletions
15
job-conf.rst
15
job-conf.rst
|
@ -14,7 +14,6 @@ Example
|
||||||
|
|
||||||
id: myjob
|
id: myjob
|
||||||
time_limit: 60 # seconds
|
time_limit: 60 # seconds
|
||||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
|
||||||
ignore_robots: false
|
ignore_robots: false
|
||||||
max_claimed_sites: 2
|
max_claimed_sites: 2
|
||||||
warcprox_meta:
|
warcprox_meta:
|
||||||
|
@ -186,16 +185,6 @@ enforced at the seed level. If a time limit is specified at the top level, it
|
||||||
is inherited by each seed as described above, and enforced individually on each
|
is inherited by each seed as described above, and enforced individually on each
|
||||||
seed.
|
seed.
|
||||||
|
|
||||||
``proxy``
|
|
||||||
~~~~~~~~~
|
|
||||||
+--------+----------+---------+
|
|
||||||
| type | required | default |
|
|
||||||
+========+==========+=========+
|
|
||||||
| string | no | *none* |
|
|
||||||
+--------+----------+---------+
|
|
||||||
HTTP proxy, with the format ``host:port``. Typically configured to point to
|
|
||||||
warcprox for archival crawling.
|
|
||||||
|
|
||||||
``ignore_robots``
|
``ignore_robots``
|
||||||
~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~
|
||||||
+---------+----------+-----------+
|
+---------+----------+-----------+
|
||||||
|
@ -226,8 +215,8 @@ to contact the operator if the crawl is causing problems.
|
||||||
+============+==========+===========+
|
+============+==========+===========+
|
||||||
| dictionary | no | ``false`` |
|
| dictionary | no | ``false`` |
|
||||||
+------------+----------+-----------+
|
+------------+----------+-----------+
|
||||||
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
|
Specifies the ``Warcprox-Meta`` header to send with every request.
|
||||||
is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is
|
The value of the ``Warcprox-Meta`` header is a json blob. It is
|
||||||
used to pass settings and information to warcprox. Warcprox does not forward
|
used to pass settings and information to warcprox. Warcprox does not forward
|
||||||
the header on to the remote site. For further explanation of this field and
|
the header on to the remote site. For further explanation of this field and
|
||||||
its uses see
|
its uses see
|
||||||
|
|
|
@ -31,16 +31,15 @@ Then you can run brozzler-new-site:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site http://example.com/
|
||||||
|
|
||||||
|
|
||||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
Or brozzler-new-job:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
|
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
|
||||||
id: job1
|
id: job1
|
||||||
proxy: localhost:8000 # point at warcprox for archiving
|
|
||||||
seeds:
|
seeds:
|
||||||
- url: https://example.org/
|
- url: https://example.org/
|
||||||
EOF
|
EOF
|
||||||
|
|
|
@ -33,16 +33,17 @@ def main(argv=[]):
|
||||||
'job_conf_file', metavar='JOB_CONF_FILE',
|
'job_conf_file', metavar='JOB_CONF_FILE',
|
||||||
help='brozzler job configuration file in yaml')
|
help='brozzler job configuration file in yaml')
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
args.job_conf_file = os.path.realpath(args.job_conf_file)
|
||||||
|
|
||||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||||
os.chdir(os.path.dirname(__file__))
|
os.chdir(os.path.realpath(os.path.dirname(__file__)))
|
||||||
|
|
||||||
with open(args.job_conf_file, 'rb') as f:
|
with open(args.job_conf_file, 'rb') as f:
|
||||||
subprocess.call([
|
subprocess.call([
|
||||||
'vagrant', 'ssh', '--',
|
'vagrant', 'ssh', '--',
|
||||||
'f=`mktemp` && cat > $f && '
|
'f=`mktemp` && cat > $f && '
|
||||||
'/home/vagrant/brozzler-ve3/bin/python '
|
'/opt/brozzler-ve3/bin/python '
|
||||||
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
|
'/opt/brozzler-ve3/bin/brozzler-new-job $f'],
|
||||||
stdin=f)
|
stdin=f)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -71,7 +71,7 @@ def main(argv=[]):
|
||||||
options.append('--verbose')
|
options.append('--verbose')
|
||||||
|
|
||||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||||
os.chdir(os.path.dirname(__file__))
|
os.chdir(os.path.realpath(os.path.dirname(__file__)))
|
||||||
|
|
||||||
cmd = (
|
cmd = (
|
||||||
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
|
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue