initial youtube-dl support, including saving youtube-dl derived json with warcprox by sending a PUTMETA request, if new option --enable-warcprox-features is enabled

2025-11-30 20:27:05 -05:00 · 2015-07-14 18:57:45 -07:00 · 2015-07-14 18:57:45 -07:00 · 9b5da57d7e
commit 9b5da57d7e
parent fd0c3322ee
1 changed files with 49 additions and 8 deletions
--- a/bin/brozzler-worker
+++ b/bin/brozzler-worker
@ -15,6 +15,8 @@ from brozzler import hq
 import pprint
 import traceback
 import youtube_dl
+import urllib.request
+import json

 arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
        description='crawl-url - browse urls, follow links',
@ -29,6 +31,8 @@ arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
        help='configure browser to use specified proxy server')
 arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
        action='store_true', help='configure browser to ignore certificate errors')
+arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
+        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
        action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@ -63,12 +67,50 @@ def disclaim_site(site):
        logging.info("putting {} on queue {}".format(site, q.queue.name))
        q.put(site.to_dict())

-ydl_extractors = youtube_dl.extractor.gen_extractors()
-def ydl_suitable(url):
-    for ie in ydl_extractors:
-        if ie.suitable(url):
-            return True
-    return False
+# Options handed to the shared YoutubeDL instance created below. The python
+# logging module itself is passed as the "logger", and downloads are written
+# to /dev/null.
+ydl_opts = dict(
+    outtmpl="/dev/null",
+    verbose=True,
+    retries=3,
+    logger=logging,
+    nocheckcertificate=True,
+    hls_prefer_native=True,
+    noprogress=True,
+    nopart=True,
+)
+if args.proxy_server:
+    # The same proxy url goes both into the youtube-dl options and into the
+    # http_proxy environment variable,
+    # see https://github.com/rg3/youtube-dl/issues/6087
+    ydl_opts["proxy"] = os.environ["http_proxy"] = "http://{}".format(
+            args.proxy_server)
+ydl = youtube_dl.YoutubeDL(ydl_opts)
+
+def putmeta(url, content_type, payload):
+    """Send a PUTMETA request for url through the configured proxy.
+
+    The request body is payload (bytes) and is labelled with the given
+    content_type. Only meant to be called when --enable-warcprox-features is
+    set, i.e. when the proxy is assumed to be warcprox.
+
+    A response status other than 204 is logged as a warning; it is not
+    raised to the caller.
+    """
+    assert args.enable_warcprox_features
+    request = urllib.request.Request(url, method="PUTMETA",
+            headers={"Content-Type": content_type}, data=payload)
+
+    # XXX evil hack to keep urllib from trying to tunnel https urls here
+    request.type = "http"
+    # Talk to the proxy given with --proxy-server; "localhost:8000" is kept
+    # only as the fallback for when no proxy server was configured.
+    request.set_proxy(args.proxy_server or "localhost:8000", "http")
+
+    try:
+        with urllib.request.urlopen(request) as response:
+            if response.status != 204:
+                logging.warning("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(response.status, response.reason))
+    except urllib.error.HTTPError as e:
+        # log the reason phrase next to the status code; e.info() would put
+        # the whole header block into the message instead
+        logging.warning("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.code, e.reason))
+
+def try_youtube_dl(site, crawl_url):
+    """Run youtube-dl on crawl_url.url and optionally archive its metadata.
+
+    When both a proxy server and --enable-warcprox-features are configured,
+    the info returned by youtube-dl is serialized to json and sent through
+    putmeta(). A url that youtube-dl reports as unsupported is silently
+    skipped. The site argument is currently unused.
+    """
+    try:
+        logging.info("trying youtube-dl on {}".format(crawl_url))
+        extracted = ydl.extract_info(crawl_url.url)
+        if not (args.proxy_server and args.enable_warcprox_features):
+            return
+        serialized = json.dumps(extracted, sort_keys=True, indent=4)
+        logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
+        media_type = "application/vnd.youtube-dl_formats+json;charset=utf-8"
+        putmeta(crawl_url.url, media_type, serialized.encode("utf-8"))
+    except youtube_dl.utils.UnsupportedError:
+        # youtube-dl does not support this url; nothing to do
+        pass

 def brozzle_site(site, browser):
    start = time.time()
@ -78,8 +120,7 @@ def brozzle_site(site, browser):
                try:
                    crawl_url = next_url(site)
                    logging.info("crawling {}".format(crawl_url))
-                    if ydl_suitable(crawl_url.url): 
-                        logging.info("youtube-dl suitable for {}".format(crawl_url))
+                    try_youtube_dl(site, crawl_url)
                    crawl_url.outlinks = browser.browse_page(crawl_url.url)
                    completed_url(site, crawl_url)
                except kombu.simple.Empty: