Merge pull request #67 from internetarchive/skip_youtube_dl

skip_youtube_dl
This commit is contained in:
Noah Levitt 2017-09-29 15:10:10 -07:00 committed by GitHub
commit 554dbe821b
3 changed files with 18 additions and 6 deletions

View File

@@ -381,7 +381,7 @@ class Browser:
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900):
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
'''
Browses page in browser.

15
brozzler/cli.py Normal file → Executable file
View File

@@ -160,6 +160,9 @@ def brozzle_page(argv=None):
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@@ -174,7 +177,8 @@ def brozzle_page(argv=None):
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
@@ -190,7 +194,8 @@ def brozzle_page(argv=None):
try:
browser.start(proxy=args.proxy)
outlinks = worker.brozzle_page(
browser, site, page, on_screenshot=on_screenshot)
browser, site, page, on_screenshot=on_screenshot,
enable_youtube_dl=not args.skip_youtube_dl)
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error('reached limit %s', e)
@@ -313,6 +318,9 @@ def brozzler_worker(argv=None):
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@@ -347,7 +355,8 @@ def brozzler_worker(argv=None):
chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())

View File

@@ -105,7 +105,7 @@ class BrozzlerWorker:
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900):
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@@ -116,6 +116,7 @@ class BrozzlerWorker:
self._proxy_is_warcprox = None
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout
@@ -420,6 +421,7 @@ class BrozzlerWorker:
on_request=on_request, hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout)
if final_page_url != page.url:
@@ -485,7 +487,8 @@ class BrozzlerWorker:
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
outlinks = self.brozzle_page(browser, site, page)
outlinks = self.brozzle_page(browser, site, page,
enable_youtube_dl=not self._skip_youtube_dl)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)