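skip_youtube_dl: add a hidden --skip-youtube-dl command-line flag and a matching skip_youtube_dl keyword argument that disables youtube-dl when brozzling a page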

This commit is contained in:
Barbara Miller 2017-09-12 17:58:28 -07:00
parent ec847e48bc
commit 5e7b3b73dd
3 changed files with 20 additions and 6 deletions

View File

@ -381,7 +381,7 @@ class Browser:
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900):
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
'''
Browses page in browser.

16
brozzler/cli.py Normal file → Executable file
View File

@ -160,6 +160,9 @@ def brozzle_page(argv=None):
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@ -174,7 +177,8 @@ def brozzle_page(argv=None):
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
@ -189,8 +193,10 @@ def brozzle_page(argv=None):
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
try:
browser.start(proxy=args.proxy)
enable_youtube_dl = False if args.skip_youtube_dl else True
outlinks = worker.brozzle_page(
browser, site, page, on_screenshot=on_screenshot)
browser, site, page, on_screenshot=on_screenshot,
enable_youtube_dl=enable_youtube_dl)
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error('reached limit %s', e)
@ -313,6 +319,9 @@ def brozzler_worker(argv=None):
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@ -347,7 +356,8 @@ def brozzler_worker(argv=None):
chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())

View File

@ -105,7 +105,7 @@ class BrozzlerWorker:
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900):
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -116,6 +116,7 @@ class BrozzlerWorker:
self._proxy_is_warcprox = None
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout
@ -420,6 +421,7 @@ class BrozzlerWorker:
on_request=on_request, hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout)
if final_page_url != page.url:
@ -485,7 +487,9 @@ class BrozzlerWorker:
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
outlinks = self.brozzle_page(browser, site, page)
enable_youtube_dl = False if self._skip_youtube_dl else True
outlinks = self.brozzle_page(browser, site, page,
enable_youtube_dl=enable_youtube_dl)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)