Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-06-20 12:54:23 -04:00
Refactor the way the proxy is configured. The job/site settings "proxy" and "enable_warcprox_features" are gone. brozzler-worker now has mutually exclusive options --proxy and --warcprox-auto. --warcprox-auto means: find an instance of warcprox in the service registry and enable warcprox features. If --proxy is provided, brozzler determines whether the proxy is warcprox by consulting http://{proxy_address}/status (see 8caae0d7d3), and enables warcprox features if so.
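
In outline, the warcprox detection works like this (a minimal sketch of the probe described above, distilled from the _using_warcprox hunk below rather than verbatim code from this commit)::

    import json
    import requests

    def proxy_is_warcprox(proxy_address):
        # warcprox answers GET /status with a JSON document whose 'role'
        # field identifies it; a connection error or a non-JSON response
        # means the proxy is not warcprox
        try:
            response = requests.get('http://%s/status' % proxy_address)
            status = json.loads(response.text)
            return status['role'] == 'warcprox'
        except Exception:
            return False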
This commit is contained in:
parent 9a2f181eb6
commit 934190084c

14 changed files with 208 additions and 153 deletions
@@ -78,7 +78,7 @@ Launch one or more workers:
 ::

-    brozzler-worker
+    brozzler-worker --warcprox-auto

 Submit jobs:
@@ -90,8 +90,7 @@ Submit sites not tied to a job:

 ::

-    brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \
-        --time-limit=600 http://example.com/
+    brozzler-new-site --time-limit=600 http://example.com/

 Job Configuration
 -----------------
@@ -106,7 +105,6 @@ everything else is optional. For details, see `<job-conf.rst>`_.
     time_limit: 60 # seconds
     proxy: 127.0.0.1:8000 # point at warcprox for archiving
     ignore_robots: false
-    enable_warcprox_features: false
     warcprox_meta: null
     metadata: {}
     seeds:
@@ -20,4 +20,5 @@ kill timeout 60

 exec nice brozzler-worker \
     --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
-    --max-browsers=4
+    --max-browsers=4 \
+    --warcprox-auto
@@ -81,15 +81,6 @@ def rethinker(args):
             'BROZZLER_RETHINKDB_DB') or 'brozzler'
     return doublethink.Rethinker(servers.split(','), db)

-def _add_proxy_options(arg_parser):
-    arg_parser.add_argument(
-            '--proxy', dest='proxy', default=None, help='http proxy')
-    arg_parser.add_argument(
-            '--enable-warcprox-features', dest='enable_warcprox_features',
-            action='store_true', default=None, help=(
-                'enable special features that assume the configured proxy is '
-                'warcprox'))
-
 def configure_logging(args):
     logging.basicConfig(
             stream=sys.stderr, level=args.log_level, format=(
@@ -159,7 +150,8 @@ def brozzle_page():
     arg_parser.add_argument(
             '--password', dest='password', default=None,
             help='use this password to try to log in if a login form is found')
-    _add_proxy_options(arg_parser)
+    arg_parser.add_argument(
+            '--proxy', dest='proxy', default=None, help='http proxy')
     add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
@@ -170,7 +162,6 @@ def brozzle_page():
         behavior_parameters = json.loads(args.behavior_parameters)
     site = brozzler.Site(None, {
             'id': -1, 'seed': args.url, 'proxy': args.proxy,
-            'enable_warcprox_features': args.enable_warcprox_features,
             'behavior_parameters': behavior_parameters,
             'username': args.username, 'password': args.password})
     page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
@@ -237,7 +228,6 @@ def brozzler_new_site():
             formatter_class=BetterArgumentDefaultsHelpFormatter)
     arg_parser.add_argument('seed', metavar='SEED', help='seed url')
     add_rethinkdb_options(arg_parser)
-    _add_proxy_options(arg_parser)
     arg_parser.add_argument(
             '--time-limit', dest='time_limit', default=None,
             help='time limit in seconds for this site')
@@ -273,7 +263,6 @@ def brozzler_new_site():
             'proxy': args.proxy,
             'time_limit': int(args.time_limit) if args.time_limit else None,
             'ignore_robots': args.ignore_robots,
-            'enable_warcprox_features': args.enable_warcprox_features,
             'warcprox_meta': json.loads(
                 args.warcprox_meta) if args.warcprox_meta else None,
             'behavior_parameters': json.loads(
@@ -300,6 +289,13 @@ def brozzler_worker():
     arg_parser.add_argument(
             '-n', '--max-browsers', dest='max_browsers', default='1',
             help='max number of chrome instances simultaneously browsing pages')
+    arg_parser.add_argument(
+            '--proxy', dest='proxy', default=None, help='http proxy')
+    arg_parser.add_argument(
+            '--warcprox-auto', dest='warcprox_auto', action='store_true',
+            help=(
+                'when needed, choose an available instance of warcprox from '
+                'the rethinkdb service registry'))
     add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
@@ -138,11 +138,9 @@ class BrozzlerEasyController:
         frontier = brozzler.RethinkDbFrontier(rr)
         service_registry = doublethink.ServiceRegistry(rr)
         worker = brozzler.worker.BrozzlerWorker(
-                frontier, service_registry,
-                max_browsers=args.max_browsers,
-                chrome_exe=args.chrome_exe,
+                frontier, service_registry, chrome_exe=args.chrome_exe,
                 proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
-                enable_warcprox_features=True)
+                max_browsers=args.max_browsers)
         return worker

     def _init_pywb(self, args):
@@ -9,9 +9,6 @@ id:
     type: number
     min: 0

-  enable_warcprox_features:
-    type: boolean
-
   ignore_robots:
     type: boolean

@@ -19,10 +16,6 @@ id:
     type: dict
     nullable: true

-  proxy:
-    type: string
-    nullable: true
-
   scope:
     type: dict
     schema:
@@ -42,7 +35,7 @@ id:
         type: string

       regex:
-        type: string # code up a regex type?
+        type: string # code up a cerberus regex type?

       ssurt:
         type: string
@@ -75,10 +68,6 @@ id:
   max_hops_off_surt:
     type: integer

-  # ignored, left for backward compatibility
-  remember_outlinks:
-    type: boolean
-
   metadata:
     type: dict

@@ -47,20 +47,22 @@ def _reppy_rules_getitem(self, agent):
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem

 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
-def _robots_cache(site):
+def _robots_cache(site, proxy=None):
     class SessionRaiseOn420(requests.Session):
         def get(self, url, *args, **kwargs):
             res = super().get(url, *args, **kwargs)
             if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(warcprox_meta=json.loads(res.headers['warcprox-meta']), http_payload=res.text)
+                raise brozzler.ReachedLimit(
+                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                        http_payload=res.text)
             else:
                 return res

     if not site.id in _robots_caches:
         req_sesh = SessionRaiseOn420()
         req_sesh.verify = False # ignore cert errors
-        if site.proxy:
-            proxie = "http://{}".format(site.proxy)
+        if proxy:
+            proxie = "http://%s" % proxy
             req_sesh.proxies = {"http":proxie,"https":proxie}
         if site.extra_headers():
             req_sesh.headers.update(site.extra_headers())
@@ -70,14 +72,14 @@ def _robots_cache(site):

     return _robots_caches[site.id]

-def is_permitted_by_robots(site, url):
+def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True

     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(
+            result = _robots_cache(site, proxy).allowed(
                     url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
@@ -80,7 +80,7 @@ class Site(doublethink.Document):

     def extra_headers(self):
         hdrs = {}
-        if self.enable_warcprox_features and self.warcprox_meta:
+        if self.warcprox_meta:
             hdrs["Warcprox-Meta"] = json.dumps(
                     self.warcprox_meta, separators=(',', ':'))
         return hdrs
@@ -101,15 +101,15 @@ class BrozzlerWorker:

     def __init__(
             self, frontier, service_registry=None, max_browsers=1,
-            chrome_exe="chromium-browser", proxy=None,
-            enable_warcprox_features=False):
+            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers

-        # these two settings can be overridden by the job/site configuration
-        self._default_proxy = proxy
-        self._default_enable_warcprox_features = enable_warcprox_features
+        self._warcprox_auto = warcprox_auto
+        self._proxy = proxy
+        assert not (warcprox_auto and proxy)
+        self._proxy_is_warcprox = None

         self._browser_pool = brozzler.browser.BrowserPool(
                 max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
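The assert above makes warcprox_auto and proxy mutually exclusive at construction time, which the new test_proxy_conflict test below exercises. A quick illustration::

    import brozzler.worker

    # constructing a worker with both options set trips the assertion
    # in BrozzlerWorker.__init__
    try:
        brozzler.worker.BrozzlerWorker(
                None, None, warcprox_auto=True, proxy='localhost:12345')
    except AssertionError:
        print('--warcprox-auto and --proxy are mutually exclusive')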
@@ -119,12 +119,12 @@ class BrozzlerWorker:
         self._thread = None
         self._start_stop_lock = threading.Lock()

-    def _proxy(self, site):
-        if site.proxy:
+    def _proxy_for(self, site):
+        if self._proxy:
+            return self._proxy
+        elif site.proxy:
             return site.proxy
-        elif self._default_proxy:
-            return self._default_proxy
-        elif self._service_registry and self._enable_warcprox_features(site):
+        elif self._warcprox_auto:
             svc = self._service_registry.available_service('warcprox')
             if svc is None:
                 raise Exception(
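The effective proxy precedence, then, is: the worker-level --proxy first, then a proxy already recorded on the site, then, with --warcprox-auto, a live warcprox instance from the service registry. A standalone sketch of that decision (a hypothetical helper; the svc['host']/svc['port'] fields are assumptions about the doublethink service registry records, since that part of the method is elided from this hunk)::

    def resolve_proxy(cli_proxy, site, warcprox_auto, service_registry):
        # worker-wide --proxy wins over everything
        if cli_proxy:
            return cli_proxy
        # next, a proxy already persisted on the site
        elif site.proxy:
            return site.proxy
        # --warcprox-auto: pick an available warcprox from the registry
        elif warcprox_auto:
            svc = service_registry.available_service('warcprox')
            if svc is None:
                raise Exception('no available warcprox in service registry')
            return '%s:%s' % (svc['host'], svc['port'])  # assumed fields
        return None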
@@ -138,11 +138,21 @@ class BrozzlerWorker:
                 return site.proxy
         return None

-    def _enable_warcprox_features(self, site):
-        if site.enable_warcprox_features is not None:
-            return site.enable_warcprox_features
+    def _using_warcprox(self, site):
+        if self._proxy:
+            if self._proxy_is_warcprox is None:
+                try:
+                    response = requests.get('http://%s/status' % self._proxy)
+                    status = json.loads(response.text)
+                    self._proxy_is_warcprox = (status['role'] == 'warcprox')
+                except Exception as e:
+                    self._proxy_is_warcprox = False
+                logging.info(
+                        '%s %s warcprox', self._proxy,
+                        'IS' if self._proxy_is_warcprox else 'IS NOT')
+            return self._proxy_is_warcprox
         else:
-            return self._default_enable_warcprox_features
+            return bool(site.proxy or self._warcprox_auto)

     def _youtube_dl(self, destdir, site):
         ydl_opts = {
@@ -156,12 +166,12 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy(site):
-            ydl_opts["proxy"] = "http://{}".format(self._proxy(site))
+        if self._proxy_for(site):
+            ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
             ## XXX (sometimes?) causes chrome debug websocket to go through
             ## proxy. Maybe not needed thanks to hls_prefer_native.
             ## # see https://github.com/rg3/youtube-dl/issues/6087
-            ## os.environ["http_proxy"] = "http://{}".format(self._proxy(site))
+            ## os.environ["http_proxy"] = "http://{}".format(self._proxy_for(site))
         ydl = youtube_dl.YoutubeDL(ydl_opts)
         if site.extra_headers():
             ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
@@ -224,13 +234,13 @@ class BrozzlerWorker:
             info = ydl.extract_info(page.url)
             self._remember_videos(page, ydl.brozzler_spy)
             # logging.info('XXX %s', json.dumps(info))
-            if self._proxy(site) and self._enable_warcprox_features(site):
+            if self._using_warcprox(site):
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to warcprox "
                         "with youtube-dl json for %s", page)
                 self._warcprox_write_record(
-                        warcprox_address=self._proxy(site),
+                        warcprox_address=self._proxy_for(site),
                         url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="metadata",
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
@@ -303,20 +313,20 @@ class BrozzlerWorker:
         def _on_screenshot(screenshot_png):
             if on_screenshot:
                 on_screenshot(screenshot_png)
-            if self._proxy(site) and self._enable_warcprox_features(site):
+            if self._using_warcprox(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
-                        "screenshot for %s", self._proxy(site), page)
+                        "screenshot for %s", self._proxy_for(site), page)
                 screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
                         screenshot_png)
                 self._warcprox_write_record(
-                        warcprox_address=self._proxy(site),
+                        warcprox_address=self._proxy_for(site),
                         url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=screenshot_jpeg,
                         extra_headers=site.extra_headers())
                 self._warcprox_write_record(
-                        warcprox_address=self._proxy(site),
+                        warcprox_address=self._proxy_for(site),
                         url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=thumbnail_jpeg,
@@ -347,7 +357,8 @@ class BrozzlerWorker:

         if not browser.is_running():
             browser.start(
-                    proxy=self._proxy(site), cookie_db=site.get('cookie_db'))
+                    proxy=self._proxy_for(site),
+                    cookie_db=site.get('cookie_db'))
         final_page_url, outlinks = browser.browse_page(
                 page.url, extra_headers=site.extra_headers(),
                 behavior_parameters=site.get('behavior_parameters'),
@@ -360,10 +371,10 @@ class BrozzlerWorker:

     def _fetch_url(self, site, page):
         proxies = None
-        if self._proxy(site):
+        if self._proxy_for(site):
             proxies = {
-                'http': 'http://%s' % self._proxy(site),
-                'https': 'http://%s' % self._proxy(site),
+                'http': 'http://%s' % self._proxy_for(site),
+                'https': 'http://%s' % self._proxy_for(site),
             }

         self.logger.info('fetching %s', page)
@@ -388,17 +399,19 @@ class BrozzlerWorker:
             return True
         return False

-    def _brozzle_site(self, browser, site):
-        page = None
+    def brozzle_site(self, browser, site):
         try:
+            page = None
             start = time.time()
             while time.time() - start < 7 * 60:
+                site.refresh()
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome.port))

                 if (page.needs_robots_check and
-                        not brozzler.is_permitted_by_robots(site, page.url)):
+                        not brozzler.is_permitted_by_robots(
+                            site, page.url, self._proxy_for(site))):
                     logging.warn("page %s is blocked by robots.txt", page.url)
                     page.blocked_by_robots = True
                     self._frontier.completed_page(site, page)
@@ -424,8 +437,13 @@ class BrozzlerWorker:
         except:
             self.logger.critical("unexpected exception", exc_info=True)
         finally:
-            browser.stop()
             self._frontier.disclaim_site(site, page)

+    def _brozzle_site_thread_target(self, browser, site):
+        try:
+            self.brozzle_site(browser, site)
+        finally:
+            browser.stop()
             self._browser_pool.release(browser)
             with self._browsing_threads_lock:
                 self._browsing_threads.remove(threading.current_thread())
@@ -477,9 +495,10 @@ class BrozzlerWorker:
                     socket.gethostname(), browser.chrome.port))
                 self.logger.info(
                         "brozzling site (proxy=%s) %s",
-                        repr(self._proxy(site)), site)
+                        repr(self._proxy_for(site)), site)
                 th = threading.Thread(
-                        target=self._brozzle_site, args=(browser, site),
+                        target=self._brozzle_site_thread_target,
+                        args=(browser, site),
                         name="BrozzlingThread:%s" % browser.chrome.port,
                         daemon=True)
                 with self._browsing_threads_lock:
job-conf.rst
@@ -14,7 +14,6 @@ an example
     time_limit: 60 # seconds
     proxy: 127.0.0.1:8000 # point at warcprox for archiving
     ignore_robots: false
-    enable_warcprox_features: false
     warcprox_meta:
       warc-prefix: job1
       stats:
@@ -135,29 +134,6 @@ proxy
 HTTP proxy, with the format ``host:port``. Typically configured to point to
 warcprox for archival crawling.

-enable_warcprox_features
-------------------------
-+-----------------------+---------+----------+---------+
-| scope                 | type    | required | default |
-+=======================+=========+==========+=========+
-| seed-level, top-level | boolean | no       | false   |
-+-----------------------+---------+----------+---------+
-If true for a given seed, and the seed is configured to use a proxy, enables
-special features that assume the proxy is an instance of warcprox. As of this
-writing, the special features that are enabled are:
-
-- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD
-  request
-- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD
-  request
-
-See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX
-not yet written).
-
-*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the
-Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not
-set.*
-
 ignore_robots
 -------------
 +-----------------------+---------+----------+---------+
setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b10.dev218',
+        version='1.1b10.dev219',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -77,7 +77,7 @@ setuptools.setup(
         extras_require={
             'dashboard': ['flask>=0.11', 'gunicorn'],
             'easy': [
-                'warcprox>=2.1b1.dev57',
+                'warcprox>=2.1b1.dev60',
                 'pywb',
                 'flask>=0.11',
                 'gunicorn'
@@ -30,6 +30,8 @@ import brozzler
 import datetime
 import requests
 import subprocess
+import http.server
+import logging

 def start_service(service):
     subprocess.check_call(['sudo', 'service', service, 'start'])
@@ -113,7 +115,7 @@ def test_brozzle_site(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site1/' % httpd.server_port,
-        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
+        'proxy': 'localhost:8000',
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

     # the two pages we expect to be crawled
@@ -180,11 +182,89 @@ def test_brozzle_site(httpd):
     assert response.status_code == 200
     assert response.headers['content-type'] == 'image/jpeg'

-def test_warcprox_selection(httpd):
-    ''' When enable_warcprox_features is true, brozzler is expected to choose
-    and instance of warcprox '''
+def test_proxy_warcprox(httpd):
+    '''Test --proxy with proxy that happens to be warcprox'''
+    try:
+        stop_service('brozzler-worker')
+        _test_proxy_setting(
+                httpd, proxy='localhost:8000', warcprox_auto=False,
+                is_warcprox=True)
+    finally:
+        start_service('brozzler-worker')

-    test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
+def test_proxy_non_warcprox(httpd):
+    '''Test --proxy with proxy that happens not to be warcprox'''
+    class DumbProxyRequestHandler(http.server.SimpleHTTPRequestHandler):
+        def do_HEAD(self):
+            if not hasattr(self.server, 'requests'):
+                self.server.requests = []
+            logging.info('%s %s', self.command, self.path)
+            self.server.requests.append('%s %s' % (self.command, self.path))
+            response = urllib.request.urlopen(self.path)
+            self.wfile.write(('HTTP/1.0 %s %s\r\n' % (
+                response.code, response.reason)).encode('ascii'))
+            for header in response.getheaders():
+                self.wfile.write(('%s: %s\r\n' % (
+                    header[0], header[1])).encode('ascii'))
+            self.wfile.write(b'\r\n')
+            return response
+        def do_GET(self):
+            response = self.do_HEAD()
+            self.copyfile(response, self.wfile)
+        def do_WARCPROX_WRITE_RECORD(self):
+            if not hasattr(self.server, 'requests'):
+                self.server.requests = []
+            logging.info('%s %s', self.command, self.path)
+            self.send_error(400)

+    proxy = http.server.HTTPServer(('localhost', 0), DumbProxyRequestHandler)
+    th = threading.Thread(name='dumb-proxy', target=proxy.serve_forever)
+    th.start()

+    try:
+        stop_service('brozzler-worker')
+        _test_proxy_setting(
+                httpd, proxy='localhost:%s' % proxy.server_port,
+                warcprox_auto=False, is_warcprox=False)
+    finally:
+        start_service('brozzler-worker')
+    assert len(proxy.requests) <= 15
+    assert proxy.requests.count('GET /status') == 1
+    assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests
+    assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests
+    assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []

+    proxy.shutdown()
+    th.join()

+def test_no_proxy(httpd):
+    try:
+        stop_service('brozzler-worker')
+        _test_proxy_setting(
+                httpd, proxy=None, warcprox_auto=False, is_warcprox=False)
+    finally:
+        start_service('brozzler-worker')
+    # XXX how to check that no proxy was used?

+def test_warcprox_auto(httpd):
+    '''Test --warcprox-auto'''
+    try:
+        stop_service('brozzler-worker')
+        _test_proxy_setting(
+                httpd, proxy=None, warcprox_auto=True, is_warcprox=True)
+    finally:
+        start_service('brozzler-worker')

+def test_proxy_conflict():
+    with pytest.raises(AssertionError) as excinfo:
+        worker = brozzler.worker.BrozzlerWorker(
+                None, None, warcprox_auto=True, proxy='localhost:12345')

+def _test_proxy_setting(
+        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
+    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
+            proxy, warcprox_auto, is_warcprox,
+            datetime.datetime.utcnow().isoformat())

     # the two pages we expect to be crawled
     page1 = 'http://localhost:%s/site1/' % httpd.server_port
@@ -192,35 +272,36 @@ def test_warcprox_selection(httpd):
     robots = 'http://localhost:%s/robots.txt' % httpd.server_port

     rr = doublethink.Rethinker('localhost', db='brozzler')
+    service_registry = doublethink.ServiceRegistry(rr)
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site1/' % httpd.server_port,
-        'enable_warcprox_features': True,
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

-    # so we can examine rethinkdb before it does anything
-    try:
-        stop_service('brozzler-worker')
     assert site.id is None
     frontier = brozzler.RethinkDbFrontier(rr)
     brozzler.new_site(frontier, site)
     assert site.id is not None
     assert len(list(frontier.site_pages(site.id))) == 1
-    finally:
-        start_service('brozzler-worker')

-    # check proxy is set in rethink
-    start = time.time()
-    while not site.proxy and time.time() - start < 20:
-        time.sleep(0.5)
-    site.refresh()
-    assert site.proxy[-5:] == ':8000'
+    worker = brozzler.worker.BrozzlerWorker(
+            frontier, service_registry, max_browsers=1,
+            chrome_exe=brozzler.suggest_default_chrome_exe(),
+            warcprox_auto=warcprox_auto, proxy=proxy)
+    browser = worker._browser_pool.acquire()
+    worker.brozzle_site(browser, site)
+    worker._browser_pool.release(browser)

-    # the site should be brozzled fairly quickly
-    start = time.time()
-    while site.status != 'FINISHED' and time.time() - start < 300:
-        time.sleep(0.5)
-        site.refresh()
+    # check proxy is set
     assert site.status == 'FINISHED'
+    if warcprox_auto:
+        assert site.proxy[-5:] == ':8000'
+    else:
+        assert not site.proxy
+    site.refresh() # check that these things were persisted
+    assert site.status == 'FINISHED'
+    if warcprox_auto:
+        assert site.proxy[-5:] == ':8000'
+    else:
+        assert not site.proxy

     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
@@ -234,12 +315,12 @@ def test_warcprox_selection(httpd):
     captures = rr.table('captures').filter({'test_id':test_id}).run()
     captures_by_url = {
             c['url']: c for c in captures if c['http_method'] != 'HEAD'}
+    if is_warcprox:
         assert robots in captures_by_url
         assert page1 in captures_by_url
         assert page2 in captures_by_url
         assert 'screenshot:%s' % page1 in captures_by_url
         assert 'thumbnail:%s' % page1 in captures_by_url
-        # no screenshots of plaintext

         # check pywb
         t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
@@ -247,13 +328,15 @@ def test_warcprox_selection(httpd):
         expected_payload = open(os.path.join(
                 os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
         assert requests.get(wb_url).content == expected_payload
+    else:
+        assert captures_by_url == {}

 def test_obey_robots(httpd):
     test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
     rr = doublethink.Rethinker('localhost', db='brozzler')
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site1/' % httpd.server_port,
-        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
+        'proxy': 'localhost:8000',
         'user_agent': 'im a badbot', # robots.txt blocks badbot
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

@@ -306,7 +389,7 @@ def test_login(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site2/' % httpd.server_port,
-        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
+        'proxy': 'localhost:8000',
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
         'username': 'test_username', 'password': 'test_password'})

@@ -347,7 +430,7 @@ def test_seed_redirect(httpd):
     seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
     site = brozzler.Site(rr, {
         'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
-        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
+        'proxy': 'localhost:8000',
         'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
     assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

@@ -314,9 +314,7 @@ def test_field_defaults():

     # site
     brozzler.Site.table_ensure(rr)
-    site = brozzler.Site(rr, {
-        'seed': 'http://example.com/', 'enable_warcprox_features': True})
-    assert site.enable_warcprox_features is True
+    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
     assert site.id is None
     assert site.scope
     assert site.scope['surt'] == 'http://(com,example,)/'
@@ -325,15 +323,12 @@ def test_field_defaults():
     assert site.scope

     tite = brozzler.Site.load(rr, site.id)
-    assert tite.enable_warcprox_features is True
     assert tite.id == site.id
     assert tite.scope == site.scope
     tite.save()
-    assert tite.enable_warcprox_features is True
     assert tite.id == site.id
     assert tite.scope == site.scope
     tite.refresh()
-    assert tite.enable_warcprox_features is True
     assert tite.id == site.id
     assert tite.scope == site.scope

@@ -33,8 +33,7 @@ Then you can run brozzler-new-site:
 ::

     (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
-            --proxy=localhost:8000 --enable-warcprox-features \
-            http://example.com/
+            --proxy=localhost:8000 http://example.com/


 Or brozzler-new-job (make sure to set the proxy to localhost:8000):
@@ -44,7 +43,6 @@ Or brozzler-new-job (make sure to set the proxy to localhost:8000):
     (brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
     id: job1
     proxy: localhost:8000 # point at warcprox for archiving
-    enable_warcprox_features: true
     seeds:
     - url: https://example.org/
     (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
@@ -77,7 +77,7 @@ def main(argv=[]):
             'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
             '/home/vagrant/brozzler-ve34/bin/python '
             '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
-            '--proxy=localhost:8000 --enable-warcprox-features %s %s') % (
+            '--proxy=localhost:8000 %s %s') % (
                 ' '.join(options), args.seed)
     subprocess.call(['vagrant', 'ssh', '--', cmd])
