Merge 6ed234bb680b42269f56f94bf26d57eea0b4b752 into 8afe9b50143b6f619e568471372362d9c9a3a5d9

This commit is contained in:
Lauren Ko 2023-12-22 08:13:40 -07:00 committed by GitHub
commit 000032bcb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 9 additions and 20 deletions

View File

@ -14,7 +14,6 @@ Example
id: myjob
time_limit: 60 # seconds
proxy: 127.0.0.1:8000 # point at warcprox for archiving
ignore_robots: false
max_claimed_sites: 2
warcprox_meta:
@ -186,16 +185,6 @@ enforced at the seed level. If a time limit is specified at the top level, it
is inherited by each seed as described above, and enforced individually on each
seed.
``proxy``
~~~~~~~~~
+--------+----------+---------+
| type | required | default |
+========+==========+=========+
| string | no | *none* |
+--------+----------+---------+
HTTP proxy, with the format ``host:port``. Typically configured to point to
warcprox for archival crawling.
``ignore_robots``
~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
@ -226,8 +215,8 @@ to contact the operator if the crawl is causing problems.
+============+==========+===========+
| dictionary | no | ``false`` |
+------------+----------+-----------+
Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy``
is configured. The value of the ``Warcprox-Meta`` header is a JSON blob. It is
Specifies the ``Warcprox-Meta`` header to send with every request.
The value of the ``Warcprox-Meta`` header is a JSON blob. It is
used to pass settings and information to warcprox. Warcprox does not forward
the header on to the remote site. For further explanation of this field and
its uses see

View File

@ -31,16 +31,15 @@ Then you can run brozzler-new-site:
::
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
Or brozzler-new-job:
::
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
id: job1
proxy: localhost:8000 # point at warcprox for archiving
seeds:
- url: https://example.org/
EOF

View File

@ -33,16 +33,17 @@ def main(argv=[]):
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
args = arg_parser.parse_args(args=argv[1:])
args.job_conf_file = os.path.realpath(args.job_conf_file)
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
os.chdir(os.path.realpath(os.path.dirname(__file__)))
with open(args.job_conf_file, 'rb') as f:
subprocess.call([
'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && '
'/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
'/opt/brozzler-ve3/bin/python '
'/opt/brozzler-ve3/bin/brozzler-new-job $f'],
stdin=f)
if __name__ == '__main__':

View File

@ -71,7 +71,7 @@ def main(argv=[]):
options.append('--verbose')
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
os.chdir(os.path.dirname(__file__))
os.chdir(os.path.realpath(os.path.dirname(__file__)))
cmd = (
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '