make Site.warcprox_meta a special thing, replacing Site.extra_headers; this way, warcprox_meta is a dictionary in rethinkdb rather than a long json string

Noah Levitt 2016-05-05 23:23:52 +00:00
parent 07e15e26bd
commit 1445aa9976
6 changed files with 29 additions and 45 deletions
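
The shape of the change, condensed from the diffs below: instead of serializing the Warcprox-Meta header once at construction time and storing the resulting json string, the site keeps the structured dict and serializes it on demand. A minimal sketch (the real class, in the fifth file below, carries many more fields):

import json

class SiteSketch:
    def __init__(self, warcprox_meta=None, enable_warcprox_features=False):
        # stored as a plain dict, so rethinkdb sees structured data
        # that can be queried and updated field by field
        self.warcprox_meta = warcprox_meta
        self.enable_warcprox_features = enable_warcprox_features

    def extra_headers(self):
        # serialize only when a request is about to be made
        hdrs = {}
        if self.enable_warcprox_features and self.warcprox_meta:
            hdrs["Warcprox-Meta"] = json.dumps(
                    self.warcprox_meta, separators=(',', ':'))
        return hdrs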

View File

@@ -36,8 +36,6 @@ arg_parser.add_argument('url', metavar='URL', help='page url')
 arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
         help='executable to use to invoke chrome')
 arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
-arg_parser.add_argument("-H", "--extra-header", action="append",
-        dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)")
 arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
         action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
 arg_parser.add_argument("-v", "--verbose", dest="log_level",
@@ -52,15 +50,9 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
 
-extra_headers = {}
-if args.extra_headers:
-    for hh in args.extra_headers:
-        [k,v] = re.split(r":\s*", hh, 1)
-        extra_headers[k] = v
-
-site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy,
-        enable_warcprox_features=args.enable_warcprox_features,
-        extra_headers=extra_headers)
+site = brozzler.Site(
+        id=-1, seed=args.url, proxy=args.proxy,
+        enable_warcprox_features=args.enable_warcprox_features)
 page = brozzler.Page(url=args.url, site_id=site.id)
 worker = brozzler.BrozzlerWorker(frontier=None)
 ydl = worker._youtube_dl(site)

View File

@@ -38,12 +38,14 @@ arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler",
         help='rethinkdb database name')
 arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
 arg_parser.add_argument("--time-limit", dest="time_limit", default=None, help="time limit in seconds for this site")
-arg_parser.add_argument("-H", "--extra-header", action="append",
-        dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)")
 arg_parser.add_argument("--ignore-robots", dest="ignore_robots",
         action="store_true", help="ignore robots.txt for this site")
 arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
         action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
+arg_parser.add_argument(
+        '--warcprox-meta', dest='warcprox_meta',
+        help='Warcprox-Meta http request header to send with each request; '
+        'must be a json blob, ignored unless warcprox features are enabled')
 arg_parser.add_argument("-v", "--verbose", dest="log_level",
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument("--version", action="version",
@@ -56,17 +58,12 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
 warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
 
-extra_headers = {}
-if args.extra_headers:
-    for hh in args.extra_headers:
-        [k,v] = re.split(r":\s*", hh, 1)
-        extra_headers[k] = v
-
-site = brozzler.Site(seed=args.seed, proxy=args.proxy,
+site = brozzler.Site(
+        seed=args.seed, proxy=args.proxy,
         time_limit=int(args.time_limit) if args.time_limit else None,
         ignore_robots=args.ignore_robots,
         enable_warcprox_features=args.enable_warcprox_features,
-        extra_headers=extra_headers)
+        warcprox_meta=json.loads(args.warcprox_meta))
 
 r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
 frontier = brozzler.RethinkDbFrontier(r)

View File

@@ -54,12 +54,6 @@ def new_job(frontier, job_conf):
         merged_conf = merge(seed_conf, job_conf)
         # XXX check for unknown settings, invalid url, etc
 
-        extra_headers = None
-        if "warcprox_meta" in merged_conf:
-            warcprox_meta = json.dumps(
-                    merged_conf["warcprox_meta"], separators=(',', ':'))
-            extra_headers = {"Warcprox-Meta":warcprox_meta}
-
         site = brozzler.Site(job_id=job.id,
                 seed=merged_conf["url"],
                 scope=merged_conf.get("scope"),
@@ -68,7 +62,7 @@ def new_job(frontier, job_conf):
                 ignore_robots=merged_conf.get("ignore_robots"),
                 enable_warcprox_features=merged_conf.get(
                     "enable_warcprox_features"),
-                extra_headers=extra_headers,
+                warcprox_meta=merged_conf.get("warcprox_meta"),
                 metadata=merged_conf.get("metadata"))
         sites.append(site)
 
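With the header-building removed from new_job(), warcprox_meta flows from the merged seed/job configuration straight into the Site constructor, and json.dumps only happens later, inside Site.extra_headers(). A hypothetical merged_conf, with keys taken from the merged_conf.get(...) calls above and values invented for illustration:

merged_conf = {
    "url": "http://example.com/",                 # becomes the site's seed
    "scope": {"surt": "http://(com,example,)/"},  # hypothetical surt form
    "ignore_robots": False,
    "enable_warcprox_features": True,
    "warcprox_meta": {"warc-prefix": "job-1"},    # stays a dict end to end
    "metadata": {"operator": "example"},
}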

View File

@@ -40,8 +40,8 @@ def _robots_cache(site):
         if site.proxy:
             proxie = "http://{}".format(site.proxy)
             req_sesh.proxies = {"http":proxie,"https":proxie}
-        if site.extra_headers:
-            req_sesh.headers.update(site.extra_headers)
+        if site.extra_headers():
+            req_sesh.headers.update(site.extra_headers())
         _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
 
     return _robots_caches[site.id]
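
The parentheses in this hunk matter beyond style: extra_headers is now a method, and a bound method object is always truthy, so an un-migrated `if site.extra_headers:` would silently pass even when no headers apply. Calling it makes the check operate on the returned dict; a tiny self-contained illustration:

class Example:
    def extra_headers(self):
        return {}

site = Example()
assert site.extra_headers        # bound method object: always truthy
assert not site.extra_headers()  # empty dict: falsy, as intended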

View File

@@ -87,7 +87,7 @@ class Site(brozzler.BaseDictable):
 
     def __init__(
             self, seed, id=None, job_id=None, scope=None, proxy=None,
-            ignore_robots=False, time_limit=None, extra_headers=None,
+            ignore_robots=False, time_limit=None, warcprox_meta=None,
             enable_warcprox_features=False, reached_limit=None,
             status="ACTIVE", claimed=False, start_time=None,
             last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
@@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable):
         self.proxy = proxy
         self.ignore_robots = ignore_robots
         self.enable_warcprox_features = bool(enable_warcprox_features)
-        self.extra_headers = extra_headers
+        self.warcprox_meta = warcprox_meta
         self.time_limit = time_limit
         self.reached_limit = reached_limit
         self.status = status
@@ -114,12 +114,6 @@ class Site(brozzler.BaseDictable):
         if not "surt" in self.scope:
             self.scope["surt"] = Url(seed).surt
 
-    def __repr__(self):
-        return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
-                self.id, repr(self.seed), repr(self.scope),
-                repr(self.proxy), self.enable_warcprox_features,
-                self.ignore_robots, self.extra_headers, self.reached_limit)
-
     def __str__(self):
         return "Site-%s-%s" % (self.id, self.seed)
 
@@ -130,6 +124,13 @@ class Site(brozzler.BaseDictable):
                 self.scope["surt"], new_scope_surt))
             self.scope["surt"] = new_scope_surt
 
+    def extra_headers(self):
+        hdrs = {}
+        if self.enable_warcprox_features and self.warcprox_meta:
+            hdrs["Warcprox-Meta"] = json.dumps(
+                    self.warcprox_meta, separators=(',', ':'))
+        return hdrs
+
     def is_in_scope(self, url, parent_page=None):
         if not isinstance(url, Url):
             u = Url(url)
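
Taken together, the new Site surface looks like this in use; a hedged sketch, where the constructor arguments are the ones in the diff above and the seed URL and warcprox_meta contents are illustrative:

import brozzler

site = brozzler.Site(
        seed='http://example.com/', enable_warcprox_features=True,
        warcprox_meta={'warc-prefix': 'example'})

site.extra_headers()
# {'Warcprox-Meta': '{"warc-prefix":"example"}'}

# with enable_warcprox_features=False (the default) or warcprox_meta=None,
# extra_headers() returns {} and no Warcprox-Meta header is sent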

View File

@@ -120,8 +120,8 @@ class BrozzlerWorker:
 ##        # see https://github.com/rg3/youtube-dl/issues/6087
 ##        os.environ["http_proxy"] = "http://{}".format(site.proxy)
         ydl = youtube_dl.YoutubeDL(ydl_opts)
-        if site.extra_headers:
-            ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers))
+        if site.extra_headers():
+            ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
         ydl.brozzler_spy = YoutubeDLSpy()
         ydl._opener.add_handler(ydl.brozzler_spy)
         return ydl
@@ -167,7 +167,7 @@ class BrozzlerWorker:
                         url="youtube-dl:%s" % page.url, warc_type="metadata",
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"),
-                        extra_headers=site.extra_headers)
+                        extra_headers=site.extra_headers())
         except BaseException as e:
             if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                 pass
@@ -207,12 +207,12 @@ class BrozzlerWorker:
                         url="screenshot:{}".format(page.url),
                         warc_type="resource", content_type="image/jpeg",
                         payload=screenshot_jpeg,
-                        extra_headers=site.extra_headers)
+                        extra_headers=site.extra_headers())
                 self._warcprox_write_record(warcprox_address=site.proxy,
                         url="thumbnail:{}".format(page.url),
                         warc_type="resource", content_type="image/jpeg",
                         payload=thumbnail_jpeg,
-                        extra_headers=site.extra_headers)
+                        extra_headers=site.extra_headers())
 
         self.logger.info("brozzling {}".format(page))
         ydl.brozzler_spy.reset()
@@ -229,7 +229,7 @@ class BrozzlerWorker:
             if not browser.is_running():
                 browser.start(proxy=site.proxy)
             outlinks = browser.browse_page(
-                    page.url, extra_headers=site.extra_headers,
+                    page.url, extra_headers=site.extra_headers(),
                     on_screenshot=_on_screenshot,
                     on_url_change=page.note_redirect)
             return outlinks
@@ -252,7 +252,7 @@ class BrozzlerWorker:
         self.logger.info('fetching %s', page)
         # response is ignored
         requests.get(
-                page.url, proxies=proxies, headers=site.extra_headers,
+                page.url, proxies=proxies, headers=site.extra_headers(),
                 verify=False)
 
     def _needs_browsing(self, page, brozzler_spy):
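
Every consumer in this file now calls extra_headers() at the point of use: the youtube-dl opener, the warcprox metadata/screenshot/thumbnail records, the browser page visit, and the plain requests fetch. ExtraHeaderAdder, referenced in the first hunk, is defined elsewhere in this file and not shown in this diff; a plausible minimal reading of it (an assumption, not the committed code) is a urllib handler that folds the dict into each outgoing request:

import urllib.request

class ExtraHeaderAdder(urllib.request.BaseHandler):
    # hypothetical sketch: inject extra headers into every http(s)
    # request made through youtube-dl's opener
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers
        self.http_request = self._http_request
        self.https_request = self._http_request

    def _http_request(self, req):
        for h, v in self.extra_headers.items():
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        return req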