From 1445aa997678870606ae0b2f52b99577cd1335c6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 5 May 2016 23:23:52 +0000 Subject: [PATCH] make Site.warcprox_meta a special thing, replacing Site.extra_headers; this way, warcprox_meta is a dictionary in rethinkdb rather than a long json string --- bin/brozzle-page | 14 +++----------- bin/brozzler-new-site | 17 +++++++---------- brozzler/job.py | 8 +------- brozzler/robots.py | 4 ++-- brozzler/site.py | 17 +++++++++-------- brozzler/worker.py | 14 +++++++------- 6 files changed, 29 insertions(+), 45 deletions(-) diff --git a/bin/brozzle-page b/bin/brozzle-page index e5e89bb..27e1a35 100755 --- a/bin/brozzle-page +++ b/bin/brozzle-page @@ -36,8 +36,6 @@ arg_parser.add_argument('url', metavar='URL', help='page url') arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', help='executable to use to invoke chrome') arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site") -arg_parser.add_argument("-H", "--extra-header", action="append", - dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)") arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', action='store_true', help='enable special features for this site that assume the configured proxy is warcprox') arg_parser.add_argument("-v", "--verbose", dest="log_level", @@ -52,15 +50,9 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) -extra_headers = {} -if args.extra_headers: - for hh in args.extra_headers: - [k,v] = re.split(r":\s*", hh, 1) - extra_headers[k] = v - -site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy, - 
enable_warcprox_features=args.enable_warcprox_features, - extra_headers=extra_headers) +site = brozzler.Site( + id=-1, seed=args.url, proxy=args.proxy, + enable_warcprox_features=args.enable_warcprox_features) page = brozzler.Page(url=args.url, site_id=site.id) worker = brozzler.BrozzlerWorker(frontier=None) ydl = worker._youtube_dl(site) diff --git a/bin/brozzler-new-site b/bin/brozzler-new-site index 511ab51..852a316 100755 --- a/bin/brozzler-new-site +++ b/bin/brozzler-new-site @@ -38,12 +38,14 @@ arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler help='rethinkdb database name') arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site") arg_parser.add_argument("--time-limit", dest="time_limit", default=None, help="time limit in seconds for this site") -arg_parser.add_argument("-H", "--extra-header", action="append", - dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)") arg_parser.add_argument("--ignore-robots", dest="ignore_robots", action="store_true", help="ignore robots.txt for this site") arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', action='store_true', help='enable special features for this site that assume the configured proxy is warcprox') +arg_parser.add_argument( + '--warcprox-meta', dest='warcprox_meta', + help='Warcprox-Meta http request header to send with each request; ' + 'must be a json blob, ignored unless warcprox features are enabled') arg_parser.add_argument("-v", "--verbose", dest="log_level", action="store_const", default=logging.INFO, const=logging.DEBUG) arg_parser.add_argument("--version", action="version", @@ -56,17 +58,12 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) warnings.simplefilter("ignore", 
category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) -extra_headers = {} -if args.extra_headers: -    for hh in args.extra_headers: -        [k,v] = re.split(r":\s*", hh, 1) -        extra_headers[k] = v - -site = brozzler.Site(seed=args.seed, proxy=args.proxy, +site = brozzler.Site( +    seed=args.seed, proxy=args.proxy, time_limit=int(args.time_limit) if args.time_limit else None, ignore_robots=args.ignore_robots, enable_warcprox_features=args.enable_warcprox_features, -    extra_headers=extra_headers) +    warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else None) r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) frontier = brozzler.RethinkDbFrontier(r) diff --git a/brozzler/job.py b/brozzler/job.py index 625e455..fb3d720 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -54,12 +54,6 @@ def new_job(frontier, job_conf): merged_conf = merge(seed_conf, job_conf) # XXX check for unknown settings, invalid url, etc -    extra_headers = None -    if "warcprox_meta" in merged_conf: -        warcprox_meta = json.dumps( -                merged_conf["warcprox_meta"], separators=(',', ':')) -        extra_headers = {"Warcprox-Meta":warcprox_meta} - site = brozzler.Site(job_id=job.id, seed=merged_conf["url"], scope=merged_conf.get("scope"), @@ -68,7 +62,7 @@ ignore_robots=merged_conf.get("ignore_robots"), enable_warcprox_features=merged_conf.get( "enable_warcprox_features"), -            extra_headers=extra_headers, +            warcprox_meta=merged_conf.get("warcprox_meta"), metadata=merged_conf.get("metadata")) sites.append(site) diff --git a/brozzler/robots.py b/brozzler/robots.py index 131fdcc..1cd7e87 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -40,8 +40,8 @@ def _robots_cache(site): if site.proxy: proxie = "http://{}".format(site.proxy) req_sesh.proxies = {"http":proxie,"https":proxie} -        if site.extra_headers: -            req_sesh.headers.update(site.extra_headers) +        if site.extra_headers(): +            req_sesh.headers.update(site.extra_headers()) _robots_caches[site.id] = 
reppy.cache.RobotsCache(session=req_sesh) return _robots_caches[site.id] diff --git a/brozzler/site.py b/brozzler/site.py index d774335..96831d4 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -87,7 +87,7 @@ class Site(brozzler.BaseDictable): def __init__( self, seed, id=None, job_id=None, scope=None, proxy=None, - ignore_robots=False, time_limit=None, extra_headers=None, + ignore_robots=False, time_limit=None, warcprox_meta=None, enable_warcprox_features=False, reached_limit=None, status="ACTIVE", claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, @@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable): self.proxy = proxy self.ignore_robots = ignore_robots self.enable_warcprox_features = bool(enable_warcprox_features) - self.extra_headers = extra_headers + self.warcprox_meta = warcprox_meta self.time_limit = time_limit self.reached_limit = reached_limit self.status = status @@ -114,12 +114,6 @@ class Site(brozzler.BaseDictable): if not "surt" in self.scope: self.scope["surt"] = Url(seed).surt - def __repr__(self): - return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format( - self.id, repr(self.seed), repr(self.scope), - repr(self.proxy), self.enable_warcprox_features, - self.ignore_robots, self.extra_headers, self.reached_limit) - def __str__(self): return "Site-%s-%s" % (self.id, self.seed) @@ -130,6 +124,13 @@ class Site(brozzler.BaseDictable): self.scope["surt"], new_scope_surt)) self.scope["surt"] = new_scope_surt + def extra_headers(self): + hdrs = {} + if self.enable_warcprox_features and self.warcprox_meta: + hdrs["Warcprox-Meta"] = json.dumps( + self.warcprox_meta, separators=(',', ':')) + return hdrs + def is_in_scope(self, url, parent_page=None): if not isinstance(url, Url): u = Url(url) diff --git a/brozzler/worker.py b/brozzler/worker.py index 1ec3bb9..f30705b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -120,8 +120,8 @@ 
class BrozzlerWorker: ## # see https://github.com/rg3/youtube-dl/issues/6087 ## os.environ["http_proxy"] = "http://{}".format(site.proxy) ydl = youtube_dl.YoutubeDL(ydl_opts) - if site.extra_headers: - ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers)) + if site.extra_headers(): + ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers())) ydl.brozzler_spy = YoutubeDLSpy() ydl._opener.add_handler(ydl.brozzler_spy) return ydl @@ -167,7 +167,7 @@ class BrozzlerWorker: url="youtube-dl:%s" % page.url, warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), - extra_headers=site.extra_headers) + extra_headers=site.extra_headers()) except BaseException as e: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: pass @@ -207,12 +207,12 @@ class BrozzlerWorker: url="screenshot:{}".format(page.url), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, - extra_headers=site.extra_headers) + extra_headers=site.extra_headers()) self._warcprox_write_record(warcprox_address=site.proxy, url="thumbnail:{}".format(page.url), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, - extra_headers=site.extra_headers) + extra_headers=site.extra_headers()) self.logger.info("brozzling {}".format(page)) ydl.brozzler_spy.reset() @@ -229,7 +229,7 @@ class BrozzlerWorker: if not browser.is_running(): browser.start(proxy=site.proxy) outlinks = browser.browse_page( - page.url, extra_headers=site.extra_headers, + page.url, extra_headers=site.extra_headers(), on_screenshot=_on_screenshot, on_url_change=page.note_redirect) return outlinks @@ -252,7 +252,7 @@ class BrozzlerWorker: self.logger.info('fetching %s', page) # response is ignored requests.get( - page.url, proxies=proxies, headers=site.extra_headers, + page.url, proxies=proxies, headers=site.extra_headers(), verify=False) def _needs_browsing(self, page, brozzler_spy):