From 4ddd76f5428b9f70a1a6a38f172dd94e32219fdf Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 22 Jan 2018 12:47:26 -0800 Subject: [PATCH] pass canonicalized url to youtube-dl avoids this kind of error: wbgrp-svc294 2018-01-19 21:04:43,973 648 ERROR BrozzlingThread:39295 youtube_dl.to_stderr(YoutubeDL.py:514) ERROR: Unable to download webpage: <urlopen error no host given> (caused by URLError('no host given',)) wbgrp-svc294 2018-01-19 21:04:43,973 648 ERROR BrozzlingThread:39295 root.brozzle_site(worker.py:521) proxy error (site.proxy=wbgrp-svc400.us.archive.org:8002), will try to choose a healthy instance next time site is brozzled: youtube-dl hit apparent proxy error from https:/www.laphil.com/press1718 --- brozzler/worker.py | 6 +++++- setup.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index ba77ba7..21cd5ae 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -290,8 +290,12 @@ class BrozzlerWorker: def _try_youtube_dl(self, ydl, site, page): try: self.logger.info("trying youtube-dl on {}".format(page)) + with brozzler.thread_accept_exceptions(): - info = ydl.extract_info(page.url) + # we do whatwg canonicalization here to avoid "<urlopen error no host given>" resulting in ProxyError + # needs automated test + info = ydl.extract_info(urlcanon.whatwg(page.url)) self._remember_videos(page, ydl.brozzler_spy) # logging.info('XXX %s', json.dumps(info)) if self._using_warcprox(site): diff --git a/setup.py b/setup.py index 3164919..f21dbef 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev277', + version='1.1b12.dev278', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',