diff --git a/brozzler/browser.py b/brozzler/browser.py index b921126..8e6b083 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -381,7 +381,7 @@ class Browser: on_request=None, on_response=None, on_screenshot=None, username=None, password=None, hashtags=None, skip_extract_outlinks=False, skip_visit_hashtags=False, - page_timeout=300, behavior_timeout=900): + skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): ''' Browses page in browser. diff --git a/brozzler/cli.py b/brozzler/cli.py old mode 100644 new mode 100755 index 38ea689..f0e33ac --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -160,6 +160,9 @@ def brozzle_page(argv=None): arg_parser.add_argument( '--skip-visit-hashtags', dest='skip_visit_hashtags', action='store_true', help=argparse.SUPPRESS) + arg_parser.add_argument( + '--skip-youtube-dl', dest='skip_youtube_dl', + action='store_true', help=argparse.SUPPRESS) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -174,7 +177,8 @@ def brozzle_page(argv=None): page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy, skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags) + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl) def on_screenshot(screenshot_png): OK_CHARS = (string.ascii_letters + string.digits) @@ -190,7 +194,8 @@ def brozzle_page(argv=None): try: browser.start(proxy=args.proxy) outlinks = worker.brozzle_page( - browser, site, page, on_screenshot=on_screenshot) + browser, site, page, on_screenshot=on_screenshot, + enable_youtube_dl=not args.skip_youtube_dl) logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks))) except brozzler.ReachedLimit as e: logging.error('reached limit %s', e) @@ -313,6 +318,9 @@ def brozzler_worker(argv=None): arg_parser.add_argument( '--skip-visit-hashtags', dest='skip_visit_hashtags', action='store_true', help=argparse.SUPPRESS) + arg_parser.add_argument( + '--skip-youtube-dl', dest='skip_youtube_dl', + action='store_true', help=argparse.SUPPRESS) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) @@ -347,7 +355,8 @@ def brozzler_worker(argv=None): chrome_exe=args.chrome_exe, proxy=args.proxy, warcprox_auto=args.warcprox_auto, skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags) + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl) signal.signal(signal.SIGQUIT, dump_state) signal.signal(signal.SIGTERM, lambda s,f: worker.stop()) diff --git a/brozzler/worker.py b/brozzler/worker.py index 6253b63..205d730 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -105,7 +105,7 @@ class BrozzlerWorker: self, frontier, service_registry=None, max_browsers=1, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, - page_timeout=300, behavior_timeout=900): + skip_youtube_dl=False, page_timeout=300, behavior_timeout=900): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -116,6 +116,7 @@ class BrozzlerWorker: self._proxy_is_warcprox = None self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags + self._skip_youtube_dl = skip_youtube_dl self._page_timeout = page_timeout self._behavior_timeout = behavior_timeout @@ -420,6 +421,7 @@ class BrozzlerWorker: on_request=on_request, hashtags=page.hashtags, skip_extract_outlinks=self._skip_extract_outlinks, skip_visit_hashtags=self._skip_visit_hashtags, + skip_youtube_dl=self._skip_youtube_dl, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout) if final_page_url != page.url: @@ -485,7 +487,8 @@ class BrozzlerWorker: page.blocked_by_robots = True self._frontier.completed_page(site, page) else: - outlinks = self.brozzle_page(browser, site, page) + outlinks = self.brozzle_page(browser, site, page, + enable_youtube_dl=not self._skip_youtube_dl) self._frontier.completed_page(site, page) self._frontier.scope_and_schedule_outlinks( site, page, outlinks)