mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
make Site.warcprox_meta a special thing, replacing Site.extra_headers; this way, warcprox_meta is a dictionary in rethinkdb rather than a long json string
This commit is contained in:
parent
07e15e26bd
commit
1445aa9976
@ -36,8 +36,6 @@ arg_parser.add_argument('url', metavar='URL', help='page url')
|
||||
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
|
||||
help='executable to use to invoke chrome')
|
||||
arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
|
||||
arg_parser.add_argument("-H", "--extra-header", action="append",
|
||||
dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)")
|
||||
arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
|
||||
action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
|
||||
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
||||
@ -52,15 +50,9 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||
|
||||
extra_headers = {}
|
||||
if args.extra_headers:
|
||||
for hh in args.extra_headers:
|
||||
[k,v] = re.split(r":\s*", hh, 1)
|
||||
extra_headers[k] = v
|
||||
|
||||
site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy,
|
||||
enable_warcprox_features=args.enable_warcprox_features,
|
||||
extra_headers=extra_headers)
|
||||
site = brozzler.Site(
|
||||
id=-1, seed=args.url, proxy=args.proxy,
|
||||
enable_warcprox_features=args.enable_warcprox_features)
|
||||
page = brozzler.Page(url=args.url, site_id=site.id)
|
||||
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||
ydl = worker._youtube_dl(site)
|
||||
|
@ -38,12 +38,14 @@ arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler
|
||||
help='rethinkdb database name')
|
||||
arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
|
||||
arg_parser.add_argument("--time-limit", dest="time_limit", default=None, help="time limit in seconds for this site")
|
||||
arg_parser.add_argument("-H", "--extra-header", action="append",
|
||||
dest="extra_headers", default=None, help="extra http header to send with every request for this site (may be used multiple times)")
|
||||
arg_parser.add_argument("--ignore-robots", dest="ignore_robots",
|
||||
action="store_true", help="ignore robots.txt for this site")
|
||||
arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
|
||||
action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
|
||||
arg_parser.add_argument(
|
||||
'--warcprox-meta', dest='warcprox_meta',
|
||||
help='Warcprox-Meta http request header to send with each request; '
|
||||
'must be a json blob, ignored unless warcprox features are enabled')
|
||||
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
||||
action="store_const", default=logging.INFO, const=logging.DEBUG)
|
||||
arg_parser.add_argument("--version", action="version",
|
||||
@ -56,17 +58,12 @@ logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||
|
||||
extra_headers = {}
|
||||
if args.extra_headers:
|
||||
for hh in args.extra_headers:
|
||||
[k,v] = re.split(r":\s*", hh, 1)
|
||||
extra_headers[k] = v
|
||||
|
||||
# Build the Site record for the seed given on the command line.
# --warcprox-meta is optional and has no default, so argparse leaves
# args.warcprox_meta as None when the flag is omitted; json.loads(None)
# raises TypeError, so the JSON blob is parsed only when one was supplied.
site = brozzler.Site(
        seed=args.seed, proxy=args.proxy,
        time_limit=int(args.time_limit) if args.time_limit else None,
        ignore_robots=args.ignore_robots,
        enable_warcprox_features=args.enable_warcprox_features,
        warcprox_meta=(
            json.loads(args.warcprox_meta) if args.warcprox_meta else None))
|
||||
|
||||
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
|
@ -54,12 +54,6 @@ def new_job(frontier, job_conf):
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
# XXX check for unknown settings, invalid url, etc
|
||||
|
||||
extra_headers = None
|
||||
if "warcprox_meta" in merged_conf:
|
||||
warcprox_meta = json.dumps(
|
||||
merged_conf["warcprox_meta"], separators=(',', ':'))
|
||||
extra_headers = {"Warcprox-Meta":warcprox_meta}
|
||||
|
||||
site = brozzler.Site(job_id=job.id,
|
||||
seed=merged_conf["url"],
|
||||
scope=merged_conf.get("scope"),
|
||||
@ -68,7 +62,7 @@ def new_job(frontier, job_conf):
|
||||
ignore_robots=merged_conf.get("ignore_robots"),
|
||||
enable_warcprox_features=merged_conf.get(
|
||||
"enable_warcprox_features"),
|
||||
extra_headers=extra_headers,
|
||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||
metadata=merged_conf.get("metadata"))
|
||||
sites.append(site)
|
||||
|
||||
|
@ -40,8 +40,8 @@ def _robots_cache(site):
|
||||
if site.proxy:
|
||||
proxie = "http://{}".format(site.proxy)
|
||||
req_sesh.proxies = {"http":proxie,"https":proxie}
|
||||
if site.extra_headers:
|
||||
req_sesh.headers.update(site.extra_headers)
|
||||
if site.extra_headers():
|
||||
req_sesh.headers.update(site.extra_headers())
|
||||
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
||||
|
||||
return _robots_caches[site.id]
|
||||
|
@ -87,7 +87,7 @@ class Site(brozzler.BaseDictable):
|
||||
|
||||
def __init__(
|
||||
self, seed, id=None, job_id=None, scope=None, proxy=None,
|
||||
ignore_robots=False, time_limit=None, extra_headers=None,
|
||||
ignore_robots=False, time_limit=None, warcprox_meta=None,
|
||||
enable_warcprox_features=False, reached_limit=None,
|
||||
status="ACTIVE", claimed=False, start_time=None,
|
||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||
@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable):
|
||||
self.proxy = proxy
|
||||
self.ignore_robots = ignore_robots
|
||||
self.enable_warcprox_features = bool(enable_warcprox_features)
|
||||
self.extra_headers = extra_headers
|
||||
self.warcprox_meta = warcprox_meta
|
||||
self.time_limit = time_limit
|
||||
self.reached_limit = reached_limit
|
||||
self.status = status
|
||||
@ -114,12 +114,6 @@ class Site(brozzler.BaseDictable):
|
||||
if not "surt" in self.scope:
|
||||
self.scope["surt"] = Url(seed).surt
|
||||
|
||||
def __repr__(self):
|
||||
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
|
||||
self.id, repr(self.seed), repr(self.scope),
|
||||
repr(self.proxy), self.enable_warcprox_features,
|
||||
self.ignore_robots, self.extra_headers, self.reached_limit)
|
||||
|
||||
def __str__(self):
|
||||
return "Site-%s-%s" % (self.id, self.seed)
|
||||
|
||||
@ -130,6 +124,13 @@ class Site(brozzler.BaseDictable):
|
||||
self.scope["surt"], new_scope_surt))
|
||||
self.scope["surt"] = new_scope_surt
|
||||
|
||||
def extra_headers(self):
    """Build the extra HTTP request headers to send for this site.

    Returns a new dict on every call. It holds a single "Warcprox-Meta"
    entry, the compact JSON serialization of self.warcprox_meta, when
    self.enable_warcprox_features and self.warcprox_meta are both truthy;
    otherwise it is empty.
    """
    if not (self.enable_warcprox_features and self.warcprox_meta):
        return {}
    # Compact separators keep the header value free of padding whitespace.
    compact_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
    return {"Warcprox-Meta": compact_json}
|
||||
|
||||
def is_in_scope(self, url, parent_page=None):
|
||||
if not isinstance(url, Url):
|
||||
u = Url(url)
|
||||
|
@ -120,8 +120,8 @@ class BrozzlerWorker:
|
||||
## # see https://github.com/rg3/youtube-dl/issues/6087
|
||||
## os.environ["http_proxy"] = "http://{}".format(site.proxy)
|
||||
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
if site.extra_headers:
|
||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers))
|
||||
if site.extra_headers():
|
||||
ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
|
||||
ydl.brozzler_spy = YoutubeDLSpy()
|
||||
ydl._opener.add_handler(ydl.brozzler_spy)
|
||||
return ydl
|
||||
@ -167,7 +167,7 @@ class BrozzlerWorker:
|
||||
url="youtube-dl:%s" % page.url, warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers)
|
||||
extra_headers=site.extra_headers())
|
||||
except BaseException as e:
|
||||
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
|
||||
pass
|
||||
@ -207,12 +207,12 @@ class BrozzlerWorker:
|
||||
url="screenshot:{}".format(page.url),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers)
|
||||
extra_headers=site.extra_headers())
|
||||
self._warcprox_write_record(warcprox_address=site.proxy,
|
||||
url="thumbnail:{}".format(page.url),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers)
|
||||
extra_headers=site.extra_headers())
|
||||
|
||||
self.logger.info("brozzling {}".format(page))
|
||||
ydl.brozzler_spy.reset()
|
||||
@ -229,7 +229,7 @@ class BrozzlerWorker:
|
||||
if not browser.is_running():
|
||||
browser.start(proxy=site.proxy)
|
||||
outlinks = browser.browse_page(
|
||||
page.url, extra_headers=site.extra_headers,
|
||||
page.url, extra_headers=site.extra_headers(),
|
||||
on_screenshot=_on_screenshot,
|
||||
on_url_change=page.note_redirect)
|
||||
return outlinks
|
||||
@ -252,7 +252,7 @@ class BrozzlerWorker:
|
||||
self.logger.info('fetching %s', page)
|
||||
# response is ignored
|
||||
requests.get(
|
||||
page.url, proxies=proxies, headers=site.extra_headers,
|
||||
page.url, proxies=proxies, headers=site.extra_headers(),
|
||||
verify=False)
|
||||
|
||||
def _needs_browsing(self, page, brozzler_spy):
|
||||
|
Loading…
x
Reference in New Issue
Block a user