mirror of
				https://github.com/internetarchive/brozzler.git
				synced 2025-10-30 22:08:54 -04:00 
			
		
		
		
	Merge 6ed234bb68 into 8afe9b5014
				
					
				
			This commit is contained in:
		
						commit
						000032bcb2
					
				
					 4 changed files with 9 additions and 20 deletions
				
			
		
							
								
								
									
										15
									
								
								job-conf.rst
									
										
									
									
									
								
							
							
						
						
									
										15
									
								
								job-conf.rst
									
										
									
									
									
								
							|  | @ -14,7 +14,6 @@ Example | |||
| 
 | ||||
|     id: myjob | ||||
|     time_limit: 60 # seconds | ||||
|     proxy: 127.0.0.1:8000 # point at warcprox for archiving | ||||
|     ignore_robots: false | ||||
|     max_claimed_sites: 2 | ||||
|     warcprox_meta: | ||||
|  | @ -186,16 +185,6 @@ enforced at the seed level. If a time limit is specified at the top level, it | |||
| is inherited by each seed as described above, and enforced individually on each | ||||
| seed. | ||||
| 
 | ||||
| ``proxy`` | ||||
| ~~~~~~~~~ | ||||
| +--------+----------+---------+ | ||||
| | type   | required | default | | ||||
| +========+==========+=========+ | ||||
| | string | no       | *none*  | | ||||
| +--------+----------+---------+ | ||||
| HTTP proxy, with the format ``host:port``. Typically configured to point to | ||||
| warcprox for archival crawling. | ||||
| 
 | ||||
| ``ignore_robots`` | ||||
| ~~~~~~~~~~~~~~~~~ | ||||
| +---------+----------+-----------+ | ||||
|  | @ -226,8 +215,8 @@ to contact the operator if the crawl is causing problems. | |||
| +============+==========+===========+ | ||||
| | dictionary | no       | ``false`` | | ||||
| +------------+----------+-----------+ | ||||
| Specifies the ``Warcprox-Meta`` header to send with every request, if ``proxy`` | ||||
| is configured. The value of the ``Warcprox-Meta`` header is a json blob. It is | ||||
| Specifies the ``Warcprox-Meta`` header to send with every request. | ||||
| The value of the ``Warcprox-Meta`` header is a json blob. It is | ||||
| used to pass settings and information to warcprox. Warcprox does not forward | ||||
| the header on to the remote site. For further explanation of this field and | ||||
| its uses see | ||||
|  |  | |||
|  | @ -31,16 +31,15 @@ Then you can run brozzler-new-site: | |||
| 
 | ||||
| :: | ||||
| 
 | ||||
|     (brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ | ||||
|     (brozzler-ve3)vagrant@brzl:~$ brozzler-new-site http://example.com/ | ||||
| 
 | ||||
| 
 | ||||
| Or brozzler-new-job (make sure to set the proxy to localhost:8000): | ||||
| Or brozzler-new-job: | ||||
| 
 | ||||
| :: | ||||
| 
 | ||||
|     (brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF | ||||
|     id: job1 | ||||
|     proxy: localhost:8000 # point at warcprox for archiving | ||||
|     seeds: | ||||
|     - url: https://example.org/ | ||||
|     EOF | ||||
|  |  | |||
|  | @ -33,16 +33,17 @@ def main(argv=[]): | |||
|             'job_conf_file', metavar='JOB_CONF_FILE', | ||||
|             help='brozzler job configuration file in yaml') | ||||
|     args = arg_parser.parse_args(args=argv[1:]) | ||||
|     args.job_conf_file = os.path.realpath(args.job_conf_file) | ||||
| 
 | ||||
|     # cd to path with Vagrantfile so "vagrant ssh" knows what to do | ||||
|     os.chdir(os.path.dirname(__file__)) | ||||
|     os.chdir(os.path.realpath(os.path.dirname(__file__))) | ||||
| 
 | ||||
|     with open(args.job_conf_file, 'rb') as f: | ||||
|         subprocess.call([ | ||||
|             'vagrant', 'ssh', '--', | ||||
|             'f=`mktemp` && cat > $f && ' | ||||
|             '/home/vagrant/brozzler-ve3/bin/python ' | ||||
|             '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'], | ||||
|             '/opt/brozzler-ve3/bin/python ' | ||||
|             '/opt/brozzler-ve3/bin/brozzler-new-job $f'], | ||||
|             stdin=f) | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|  |  | |||
|  | @ -71,7 +71,7 @@ def main(argv=[]): | |||
|         options.append('--verbose') | ||||
| 
 | ||||
|     # cd to path with Vagrantfile so "vagrant ssh" knows what to do | ||||
|     os.chdir(os.path.dirname(__file__)) | ||||
|     os.chdir(os.path.realpath(os.path.dirname(__file__))) | ||||
| 
 | ||||
|     cmd = ( | ||||
|         '/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site ' | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Lauren Ko
						Lauren Ko