mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Merge pull request #67 from internetarchive/skip_youtube_dl
skip_youtube_dl
This commit is contained in:
commit
554dbe821b
@ -381,7 +381,7 @@ class Browser:
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
username=None, password=None, hashtags=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
page_timeout=300, behavior_timeout=900):
|
||||
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
||||
'''
|
||||
Browses page in browser.
|
||||
|
||||
|
15
brozzler/cli.py
Normal file → Executable file
15
brozzler/cli.py
Normal file → Executable file
@ -160,6 +160,9 @@ def brozzle_page(argv=None):
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -174,7 +177,8 @@ def brozzle_page(argv=None):
|
||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||
skip_youtube_dl=args.skip_youtube_dl)
|
||||
|
||||
def on_screenshot(screenshot_png):
|
||||
OK_CHARS = (string.ascii_letters + string.digits)
|
||||
@ -190,7 +194,8 @@ def brozzle_page(argv=None):
|
||||
try:
|
||||
browser.start(proxy=args.proxy)
|
||||
outlinks = worker.brozzle_page(
|
||||
browser, site, page, on_screenshot=on_screenshot)
|
||||
browser, site, page, on_screenshot=on_screenshot,
|
||||
enable_youtube_dl=not args.skip_youtube_dl)
|
||||
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
|
||||
except brozzler.ReachedLimit as e:
|
||||
logging.error('reached limit %s', e)
|
||||
@ -313,6 +318,9 @@ def brozzler_worker(argv=None):
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -347,7 +355,8 @@ def brozzler_worker(argv=None):
|
||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||
warcprox_auto=args.warcprox_auto,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||
skip_youtube_dl=args.skip_youtube_dl)
|
||||
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||
|
@ -105,7 +105,7 @@ class BrozzlerWorker:
|
||||
self, frontier, service_registry=None, max_browsers=1,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
page_timeout=300, behavior_timeout=900):
|
||||
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -116,6 +116,7 @@ class BrozzlerWorker:
|
||||
self._proxy_is_warcprox = None
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
self._skip_youtube_dl = skip_youtube_dl
|
||||
self._page_timeout = page_timeout
|
||||
self._behavior_timeout = behavior_timeout
|
||||
|
||||
@ -420,6 +421,7 @@ class BrozzlerWorker:
|
||||
on_request=on_request, hashtags=page.hashtags,
|
||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||
skip_youtube_dl=self._skip_youtube_dl,
|
||||
page_timeout=self._page_timeout,
|
||||
behavior_timeout=self._behavior_timeout)
|
||||
if final_page_url != page.url:
|
||||
@ -485,7 +487,8 @@ class BrozzlerWorker:
|
||||
page.blocked_by_robots = True
|
||||
self._frontier.completed_page(site, page)
|
||||
else:
|
||||
outlinks = self.brozzle_page(browser, site, page)
|
||||
outlinks = self.brozzle_page(browser, site, page,
|
||||
enable_youtube_dl=not self._skip_youtube_dl)
|
||||
self._frontier.completed_page(site, page)
|
||||
self._frontier.scope_and_schedule_outlinks(
|
||||
site, page, outlinks)
|
||||
|
Loading…
x
Reference in New Issue
Block a user