Make PUTMETA requests respect site-configured extra_headers

This commit is contained in:
Noah Levitt 2015-07-17 16:52:06 -07:00
parent 2ba5bd4d4b
commit 2f28f00a09
2 changed files with 15 additions and 7 deletions

View File

@ -31,11 +31,13 @@ class Site:
if proxy: if proxy:
proxie = "http://{}".format(proxy) proxie = "http://{}".format(proxy)
req_sesh.proxies = {"http":proxie,"https":proxie} req_sesh.proxies = {"http":proxie,"https":proxie}
if extra_headers:
req_sesh.headers.update(extra_headers)
self._robots_cache = reppy.cache.RobotsCache(session=req_sesh) self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
def __repr__(self): def __repr__(self):
return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={})""".format( return """Site(seed="{}",scope_surt="{}",proxy="{}",enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots) self.seed, self.scope_surt, self.proxy, self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
def note_seed_redirect(self, url): def note_seed_redirect(self, url):
new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True) new_scope_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)

View File

@ -34,6 +34,7 @@ class BrozzlerWorker:
"noprogress": True, "noprogress": True,
"nopart": True, "nopart": True,
"no_color": True, "no_color": True,
"http_headers": site.extra_headers,
} }
if site.proxy: if site.proxy:
ydl_opts["proxy"] = "http://{}".format(site.proxy) ydl_opts["proxy"] = "http://{}".format(site.proxy)
@ -70,9 +71,12 @@ class BrozzlerWorker:
self.logger.info("putting unfinished page {} on queue {}".format(page, q.queue.name)) self.logger.info("putting unfinished page {} on queue {}".format(page, q.queue.name))
q.put(page.to_dict()) q.put(page.to_dict())
def _putmeta(self, warcprox_address, url, content_type, payload): def _putmeta(self, warcprox_address, url, content_type, payload, extra_headers=None):
headers = {"Content-Type":content_type}
if extra_headers:
headers.update(extra_headers)
request = urllib.request.Request(url, method="PUTMETA", request = urllib.request.Request(url, method="PUTMETA",
headers={"Content-Type":content_type}, data=payload) headers=headers, data=payload)
# XXX setting request.type="http" is a hack to stop urllib from trying # XXX setting request.type="http" is a hack to stop urllib from trying
# to tunnel if url is https # to tunnel if url is https
@ -95,7 +99,8 @@ class BrozzlerWorker:
self.logger.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(page)) self.logger.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(page))
self._putmeta(warcprox_address=site.proxy, url=page.url, self._putmeta(warcprox_address=site.proxy, url=page.url,
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8")) payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers)
except BaseException as e: except BaseException as e:
if youtube_dl.utils.UnsupportedError in e.exc_info: if youtube_dl.utils.UnsupportedError in e.exc_info:
pass pass
@ -107,7 +112,8 @@ class BrozzlerWorker:
if site.proxy and site.enable_warcprox_features: if site.proxy and site.enable_warcprox_features:
self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page)) self.logger.info("sending PUTMETA request to warcprox with screenshot for {}".format(page))
self._putmeta(warcprox_address=site.proxy, url=page.url, self._putmeta(warcprox_address=site.proxy, url=page.url,
content_type="image/png", payload=screenshot_png) content_type="image/png", payload=screenshot_png,
extra_headers=site.extra_headers)
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
self._try_youtube_dl(ydl, site, page) self._try_youtube_dl(ydl, site, page)
@ -153,7 +159,7 @@ class BrozzlerWorker:
msg = q.get(block=True, timeout=0.5) msg = q.get(block=True, timeout=0.5)
site = brozzler.Site(**msg.payload) site = brozzler.Site(**msg.payload)
msg.ack() # XXX ack only after browsing finished? kinda complicated msg.ack() # XXX ack only after browsing finished? kinda complicated
self.logger.info("browsing site {}".format(site)) self.logger.info("brozzling site {}".format(site))
ydl = self._youtube_dl(site) ydl = self._youtube_dl(site)
th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
name="BrowsingThread-{}".format(site.scope_surt)) name="BrowsingThread-{}".format(site.scope_surt))