Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-02-23 16:19:49 -05:00)
Add user_agent option

Currently this doesn't apply to requests made by youtube-dl, as I couldn't see a thread-safe way of doing that.
parent 5ac67fe513
commit 743b5a4347
brozzler/browser.py

@@ -229,6 +229,7 @@ class Browser:
     def browse_page(
             self, url, extra_headers=None, behavior_parameters=None,
+            user_agent=None,
             on_request=None, on_response=None, on_screenshot=None,
             on_url_change=None):
         """
@@ -243,6 +244,7 @@ class Browser:
             raise BrowsingException("browser has not been started")
         self.url = url
         self.extra_headers = extra_headers
+        self.user_agent = user_agent
         self.on_request = on_request
         self.on_screenshot = on_screenshot
         self.on_url_change = on_url_change
@@ -460,6 +462,9 @@ __brzl_compileOutlinks(window).join(' ');
         if self.extra_headers:
             self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})

+        if self.user_agent:
+            self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})
+
         # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
         self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
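Network.setUserAgentOverride is a standard Chrome DevTools Protocol method; send_to_chrome presumably just frames the method/params pair as JSON on the browser's debugging websocket. A minimal standalone sketch of the same override, assuming Chrome was launched with --remote-debugging-port=9222 and the websocket-client package is installed; the target id and user-agent string are placeholders:

    # Sketch: send a user-agent override straight over the Chrome DevTools
    # Protocol websocket, mirroring what send_to_chrome does above.
    import json
    import websocket  # pip install websocket-client

    # hypothetical debugger endpoint; real target ids are listed at
    # http://localhost:9222/json
    ws = websocket.create_connection(
            "ws://localhost:9222/devtools/page/TARGET-ID")
    ws.send(json.dumps({
        "id": 1,
        "method": "Network.setUserAgentOverride",
        "params": {"userAgent": "my-crawler/0.1 (+https://example.org/)"},
    }))
    print(ws.recv())  # chrome acknowledges with a result keyed to "id": 1
    ws.close()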
brozzler/job.py

@@ -85,7 +85,8 @@ def new_job(frontier, job_conf):
                     "enable_warcprox_features"),
                 warcprox_meta=merged_conf.get("warcprox_meta"),
                 metadata=merged_conf.get("metadata"),
-                remember_outlinks=merged_conf.get("remember_outlinks"))
+                remember_outlinks=merged_conf.get("remember_outlinks"),
+                user_agent=merged_conf.get("user_agent"))
         sites.append(site)

     # insert all the sites into database before the job
brozzler/job_schema.yaml

@@ -64,6 +64,9 @@ id:
 metadata:
   type: dict

+user_agent:
+  type: string
+
 seeds:
   type: list
   schema:
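Given the schema addition, a job conf can now carry user_agent alongside the existing top-level options. A sketch of such a conf, parsed with PyYAML; the job id, seed URL, and user-agent string are made-up examples:

    # Sketch: a job conf exercising the new user_agent option.
    import yaml  # pip install PyYAML

    conf = yaml.safe_load("""
    id: example-job
    user_agent: my-crawler/0.1 (+https://example.org/crawl-info)
    seeds:
      - url: https://example.com/
    """)
    print(conf["user_agent"])  # my-crawler/0.1 (+https://example.org/crawl-info)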
brozzler/robots.py

@@ -42,6 +42,8 @@ def _robots_cache(site):
             req_sesh.proxies = {"http":proxie,"https":proxie}
         if site.extra_headers():
             req_sesh.headers.update(site.extra_headers())
+        if site.user_agent:
+            req_sesh.headers['User-Agent'] = site.user_agent
         _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)

     return _robots_caches[site.id]
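Because the robots cache reuses one requests.Session per site, setting the header once means every subsequent robots.txt fetch carries it. The same pattern in isolation; URL and user-agent string are placeholders:

    # Sketch: a requests.Session applies its default headers, including a
    # custom User-Agent, to every request made through it.
    import requests

    sesh = requests.Session()
    sesh.headers["User-Agent"] = "my-crawler/0.1 (+https://example.org/)"
    resp = sesh.get("https://example.com/robots.txt")
    print(resp.status_code, resp.request.headers["User-Agent"])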
brozzler/site.py

@@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
             enable_warcprox_features=False, reached_limit=None,
             status="ACTIVE", claimed=False, start_time=None,
             last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
-            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
+            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
+            cookie_db=None, user_agent=None):

         self.seed = seed
         self.id = id
@@ -111,6 +112,7 @@ class Site(brozzler.BaseDictable):
         self.metadata = metadata
         self.remember_outlinks = remember_outlinks
         self.cookie_db = cookie_db
+        self.user_agent = user_agent

         self.scope = scope or {}
         if not "surt" in self.scope:
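With the constructor parameter and attribute in place, a Site can carry the override. A hedged sketch, assuming the seed URL is still the first positional argument and that Site is exported as brozzler.Site:

    # Sketch: construct a Site carrying the new user_agent attribute.
    import brozzler

    site = brozzler.Site(
            seed="https://example.com/",  # placeholder seed URL
            user_agent="my-crawler/0.1 (+https://example.org/)")
    print(site.user_agent)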
brozzler/worker.py

@@ -260,6 +260,7 @@ class BrozzlerWorker:
             browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
             outlinks = browser.browse_page(
                     page.url, extra_headers=site.extra_headers(),
+                    user_agent=site.user_agent,
                     on_screenshot=_on_screenshot,
                     on_url_change=page.note_redirect)
             return outlinks
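End to end, the value flows job conf → Site → BrozzlerWorker → Browser → Chrome. A sketch of driving the browser layer directly; the Browser constructor argument here is an assumption (only start() and browse_page() appear in the diff above), and the URLs are placeholders:

    # Sketch: pass a user agent through Browser.browse_page directly.
    import brozzler.browser

    # assumed constructor signature; adjust to the actual Browser API
    browser = brozzler.browser.Browser(chrome_exe="chromium-browser")
    browser.start()
    try:
        outlinks = browser.browse_page(
                "https://example.com/",
                user_agent="my-crawler/0.1 (+https://example.org/)")
        print(len(outlinks), "outlinks")
    finally:
        browser.stop()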
job-conf.rst

@@ -168,6 +168,18 @@ ignore_robots
 If set to ``true``, brozzler will happily crawl pages that would otherwise be
 blocked by robots.txt rules.

+user_agent
+----------
++-----------------------+---------+----------+---------+
+| scope                 | type    | required | default |
++=======================+=========+==========+=========+
+| seed-level, top-level | string  | no       | *none*  |
++-----------------------+---------+----------+---------+
+
+The ``User-Agent`` header brozzler will send to identify itself to web servers.
+It's good etiquette to include a project URL with a notice to webmasters that
+explains why you're crawling, how to block the crawler with robots.txt, and how
+to contact the operator if the crawl is causing problems.
+
 warcprox_meta
 -------------
 +-----------------------+------------+----------+---------+
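A concrete example of the kind of value that paragraph recommends (the crawler name, URL, and contact address are all made up):

    user_agent: Mozilla/5.0 (compatible; mycrawler/1.0; +https://example.org/crawl-info; crawl-operator@example.org)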