mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-16 10:59:12 -04:00
Fix brozzler-easy so that warcprox features are enabled automatically (the feature already existed but was broken)
This commit is contained in:
parent
603956ec41
commit
aae810cc6e
4 changed files with 8 additions and 17 deletions
|
@ -86,7 +86,7 @@ def _add_proxy_options(arg_parser):
|
||||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--enable-warcprox-features', dest='enable_warcprox_features',
|
'--enable-warcprox-features', dest='enable_warcprox_features',
|
||||||
action='store_true', help=(
|
action='store_true', default=None, help=(
|
||||||
'enable special features that assume the configured proxy is '
|
'enable special features that assume the configured proxy is '
|
||||||
'warcprox'))
|
'warcprox'))
|
||||||
|
|
||||||
|
@ -159,14 +159,7 @@ def brozzle_page():
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--password', dest='password', default=None,
|
'--password', dest='password', default=None,
|
||||||
help='use this password to try to log in if a login form is found')
|
help='use this password to try to log in if a login form is found')
|
||||||
arg_parser.add_argument(
|
_add_proxy_options(arg_parser)
|
||||||
'--proxy', dest='proxy', default=None,
|
|
||||||
help='http proxy')
|
|
||||||
arg_parser.add_argument(
|
|
||||||
'--enable-warcprox-features', dest='enable_warcprox_features',
|
|
||||||
action='store_true', help=(
|
|
||||||
'enable special features that assume the configured proxy '
|
|
||||||
'is warcprox'))
|
|
||||||
add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
|
|
|
@ -33,8 +33,6 @@ class Site(doublethink.Document):
|
||||||
def populate_defaults(self):
|
def populate_defaults(self):
|
||||||
if not "status" in self:
|
if not "status" in self:
|
||||||
self.status = "ACTIVE"
|
self.status = "ACTIVE"
|
||||||
if not "enable_warcprox_features" in self:
|
|
||||||
self.enable_warcprox_features = False
|
|
||||||
if not "claimed" in self:
|
if not "claimed" in self:
|
||||||
self.claimed = False
|
self.claimed = False
|
||||||
if not "last_disclaimed" in self:
|
if not "last_disclaimed" in self:
|
||||||
|
|
|
@ -124,9 +124,7 @@ class BrozzlerWorker:
|
||||||
return site.proxy
|
return site.proxy
|
||||||
elif self._default_proxy:
|
elif self._default_proxy:
|
||||||
return self._default_proxy
|
return self._default_proxy
|
||||||
elif self._service_registry and (
|
elif self._service_registry and self._enable_warcprox_features(site):
|
||||||
site.enable_warcprox_features
|
|
||||||
or self._default_enable_warcprox_features):
|
|
||||||
svc = self._service_registry.available_service('warcprox')
|
svc = self._service_registry.available_service('warcprox')
|
||||||
if svc is None:
|
if svc is None:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
|
@ -142,6 +140,8 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
|
|
||||||
def _enable_warcprox_features(self, site):
|
def _enable_warcprox_features(self, site):
|
||||||
|
if not self._proxy(site):
|
||||||
|
return False
|
||||||
if site.enable_warcprox_features is not None:
|
if site.enable_warcprox_features is not None:
|
||||||
return site.enable_warcprox_features
|
return site.enable_warcprox_features
|
||||||
else:
|
else:
|
||||||
|
@ -227,7 +227,7 @@ class BrozzlerWorker:
|
||||||
info = ydl.extract_info(page.url)
|
info = ydl.extract_info(page.url)
|
||||||
self._remember_videos(page, ydl.brozzler_spy)
|
self._remember_videos(page, ydl.brozzler_spy)
|
||||||
# logging.info('XXX %s', json.dumps(info))
|
# logging.info('XXX %s', json.dumps(info))
|
||||||
if self._proxy(site) and self._enable_warcprox_features(site):
|
if self._enable_warcprox_features(site):
|
||||||
info_json = json.dumps(info, sort_keys=True, indent=4)
|
info_json = json.dumps(info, sort_keys=True, indent=4)
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
"sending WARCPROX_WRITE_RECORD request to warcprox "
|
||||||
|
@ -306,7 +306,7 @@ class BrozzlerWorker:
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_png):
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_png)
|
||||||
if self._proxy(site) and self._enable_warcprox_features(site):
|
if self._enable_warcprox_features(site):
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||||
"screenshot for %s", self._proxy(site), page)
|
"screenshot for %s", self._proxy(site), page)
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev214',
|
version='1.1b9.dev215',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue