From 2f28f00a095c01e2476a1737febbeffcead6ce87 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 17 Jul 2015 16:52:06 -0700 Subject: [PATCH] make putmeta requests respect site configured extra_headers --- brozzler/site.py | 6 ++++-- brozzler/worker.py | 16 +++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/brozzler/site.py b/brozzler/site.py index 13ce565..c16cf03 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -31,11 +31,13 @@ class Site: if proxy: proxie = "http://{}".format(proxy) req_sesh.proxies = {"http":proxie,"https":proxie} + if extra_headers: + req_sesh.headers.update(extra_headers) self._robots_cache = reppy.cache.RobotsCache(session=req_sesh) def __repr__(self): - return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={})""".format( - self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots) + return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format( + self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers) def note_seed_redirect(self, url): new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True) diff --git a/brozzler/worker.py b/brozzler/worker.py index 5ff356d..721929d 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -34,6 +34,7 @@ class BrozzlerWorker: "noprogress": True, "nopart": True, "no_color": True, + "http_headers": site.extra_headers, } if site.proxy: ydl_opts["proxy"] = "http://{}".format(site.proxy) @@ -70,9 +71,12 @@ class BrozzlerWorker: self.logger.info("putting unfinished page {} on queue {}".format(page, q.queue.name)) q.put(page.to_dict()) - def _putmeta(self, warcprox_address, url, content_type, payload): + def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None): + headers = {"Content-Type":content_type} + if extra_headers: + headers.update(extra_headers) request = urllib.request.Request(url, method="PUTMETA", - headers={"Content-Type":content_type}, data=payload) + headers=headers, data=payload) # XXX setting request.type="http" is a hack to stop urllib from trying # to tunnel if url is https @@ -95,7 +99,8 @@ class BrozzlerWorker: self.logger.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(page)) self._putmeta(warcprox_address=site.proxy, url=page.url, content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", - payload=info_json.encode("utf-8")) + payload=info_json.encode("utf-8"), + extra_headers=site.extra_headers) except BaseException as e: if youtube_dl.utils.UnsupportedError in e.exc_info: pass @@ -107,7 +112,8 @@ class BrozzlerWorker: if site.proxy and site.enable_warcprox_features: self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page)) self._putmeta(warcprox_address=site.proxy, url=page.url, - content_type="image/png", payload=screenshot_png) + content_type="image/png", payload=screenshot_png, + extra_headers=site.extra_headers) self.logger.info("brozzling {}".format(page)) self._try_youtube_dl(ydl, site, page) @@ -153,7 +159,7 @@ class BrozzlerWorker: msg = q.get(block=True, timeout=0.5) site = brozzler.Site(**msg.payload) msg.ack() # XXX ack only after browsing finished? kinda complicated - self.logger.info("browsing site {}".format(site)) + self.logger.info("brozzling site {}".format(site)) ydl = self._youtube_dl(site) th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), name="BrowsingThread-{}".format(site.scope_surt))