mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
--skip-extract-outlinks, --skip-visit-hashtags
Until now, Brozzler always extracted outlinks and visited hashtags for every page it browsed. This MR makes both steps optional through two new `brozzler-worker` options, `--skip-extract-outlinks` and `--skip-visit-hashtags`. This feature is useful for tasks where we just need to retrieve a specific page and we don't need to extract outlinks to continue crawling.
This commit is contained in:
parent
261e7977ad
commit
89877670a4
@ -379,7 +379,8 @@ class Browser:
|
|||||||
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
||||||
user_agent=None, behavior_parameters=None,
|
user_agent=None, behavior_parameters=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None):
|
username=None, password=None, hashtags=None,
|
||||||
|
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
@ -447,8 +448,12 @@ class Browser:
|
|||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters)
|
page_url, behavior_parameters)
|
||||||
self.run_behavior(behavior_script, timeout=900)
|
self.run_behavior(behavior_script, timeout=900)
|
||||||
outlinks = self.extract_outlinks()
|
if skip_extract_outlinks:
|
||||||
self.visit_hashtags(page_url, hashtags, outlinks)
|
outlinks = []
|
||||||
|
else:
|
||||||
|
outlinks = self.extract_outlinks()
|
||||||
|
if not skip_visit_hashtags:
|
||||||
|
self.visit_hashtags(page_url, hashtags, outlinks)
|
||||||
final_page_url = self.url()
|
final_page_url = self.url()
|
||||||
return final_page_url, outlinks
|
return final_page_url, outlinks
|
||||||
except brozzler.ReachedLimit:
|
except brozzler.ReachedLimit:
|
||||||
|
@ -299,6 +299,12 @@ def brozzler_worker(argv=None):
|
|||||||
help=(
|
help=(
|
||||||
'when needed, choose an available instance of warcprox from '
|
'when needed, choose an available instance of warcprox from '
|
||||||
'the rethinkdb service registry'))
|
'the rethinkdb service registry'))
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||||
|
action='store_true', help='extract page outlinks by default')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||||
|
action='store_true', help='visit page hashtags by default')
|
||||||
add_common_options(arg_parser, argv)
|
add_common_options(arg_parser, argv)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
@ -331,7 +337,9 @@ def brozzler_worker(argv=None):
|
|||||||
worker = brozzler.worker.BrozzlerWorker(
|
worker = brozzler.worker.BrozzlerWorker(
|
||||||
frontier, service_registry, max_browsers=int(args.max_browsers),
|
frontier, service_registry, max_browsers=int(args.max_browsers),
|
||||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||||
warcprox_auto=args.warcprox_auto)
|
warcprox_auto=args.warcprox_auto,
|
||||||
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
|
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||||
|
|
||||||
signal.signal(signal.SIGQUIT, dump_state)
|
signal.signal(signal.SIGQUIT, dump_state)
|
||||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||||
|
@ -103,7 +103,8 @@ class BrozzlerWorker:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, frontier, service_registry=None, max_browsers=1,
|
self, frontier, service_registry=None, max_browsers=1,
|
||||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
|
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||||
|
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
@ -112,6 +113,8 @@ class BrozzlerWorker:
|
|||||||
self._proxy = proxy
|
self._proxy = proxy
|
||||||
assert not (warcprox_auto and proxy)
|
assert not (warcprox_auto and proxy)
|
||||||
self._proxy_is_warcprox = None
|
self._proxy_is_warcprox = None
|
||||||
|
self._skip_extract_outlinks = skip_extract_outlinks
|
||||||
|
self._skip_visit_hashtags = skip_visit_hashtags
|
||||||
|
|
||||||
self._browser_pool = brozzler.browser.BrowserPool(
|
self._browser_pool = brozzler.browser.BrowserPool(
|
||||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||||
@ -406,7 +409,9 @@ class BrozzlerWorker:
|
|||||||
username=site.get('username'), password=site.get('password'),
|
username=site.get('username'), password=site.get('password'),
|
||||||
user_agent=site.get('user_agent'),
|
user_agent=site.get('user_agent'),
|
||||||
on_screenshot=_on_screenshot, on_response=_on_response,
|
on_screenshot=_on_screenshot, on_response=_on_response,
|
||||||
hashtags=page.hashtags)
|
hashtags=page.hashtags,
|
||||||
|
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||||
|
skip_visit_hashtags=self._skip_visit_hashtags)
|
||||||
if final_page_url != page.url:
|
if final_page_url != page.url:
|
||||||
page.note_redirect(final_page_url)
|
page.note_redirect(final_page_url)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
Loading…
x
Reference in New Issue
Block a user