Merge pull request #18 from ato/user-agent

Add user_agent option
Noah Levitt 2016-10-04 14:39:50 -07:00 committed by GitHub
commit f5e138c7eb
7 changed files with 28 additions and 2 deletions

View File

@@ -229,6 +229,7 @@ class Browser:
def browse_page(
self, url, extra_headers=None, behavior_parameters=None,
user_agent=None,
on_request=None, on_response=None, on_screenshot=None,
on_url_change=None):
"""
@@ -243,6 +244,7 @@
raise BrowsingException("browser has not been started")
self.url = url
self.extra_headers = extra_headers
self.user_agent = user_agent
self.on_request = on_request
self.on_screenshot = on_screenshot
self.on_url_change = on_url_change
@@ -460,6 +462,9 @@ __brzl_compileOutlinks(window).join(' ');
if self.extra_headers:
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
if self.user_agent:
self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

View File

@@ -85,7 +85,8 @@ def new_job(frontier, job_conf):
"enable_warcprox_features"),
warcprox_meta=merged_conf.get("warcprox_meta"),
metadata=merged_conf.get("metadata"),
remember_outlinks=merged_conf.get("remember_outlinks"))
remember_outlinks=merged_conf.get("remember_outlinks"),
user_agent=merged_conf.get("user_agent"))
sites.append(site)
# insert all the sites into database before the job
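
Here merged_conf is the per-seed view of the job configuration. A toy sketch of the idea, assuming (as the option table later in this change suggests) that a seed-level user_agent takes precedence over the top-level one; the plain dict merge below is a stand-in for whatever new_job() actually does, and the keys and values are placeholders.

    # Illustrative only: not brozzler's real seed/job conf merge.
    job_conf = {"user_agent": "mybot/1.0 (+https://example.org/crawl-info)"}
    seed_conf = {"url": "https://example.com/",
                 "user_agent": "mybot/1.0 special-case"}

    merged_conf = dict(job_conf)
    merged_conf.update(seed_conf)       # assumed: seed-level setting wins
    print(merged_conf["user_agent"])    # mybot/1.0 special-case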

View File

@@ -64,6 +64,9 @@ id:
metadata:
type: dict
user_agent:
type: string
seeds:
type: list
required: true

View File

@@ -42,6 +42,8 @@ def _robots_cache(site):
req_sesh.proxies = {"http":proxie,"https":proxie}
if site.extra_headers():
req_sesh.headers.update(site.extra_headers())
if site.user_agent:
req_sesh.headers['User-Agent'] = site.user_agent
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
return _robots_caches[site.id]
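
The robots.txt fetcher picks up the same override by setting the User-Agent header on the requests session handed to reppy. The same idea in isolation looks roughly like this; it is a sketch, not reppy or brozzler code, and the URL and agent string are placeholders.

    # Minimal sketch: override User-Agent on a requests.Session and fetch
    # robots.txt with it.
    import requests

    sesh = requests.Session()
    sesh.headers["User-Agent"] = "mybot/1.0 (+https://example.org/crawl-info)"
    response = sesh.get("https://example.com/robots.txt", timeout=30)
    print(response.status_code, len(response.text))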

View File

@@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
enable_warcprox_features=False, reached_limit=None,
status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
cookie_db=None, user_agent=None):
self.seed = seed
self.id = id
@@ -111,6 +112,7 @@
self.metadata = metadata
self.remember_outlinks = remember_outlinks
self.cookie_db = cookie_db
self.user_agent = user_agent
self.scope = scope or {}
if not "surt" in self.scope:

View File

@@ -260,6 +260,7 @@ class BrozzlerWorker:
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(),
user_agent=site.user_agent,
on_screenshot=_on_screenshot,
on_url_change=page.note_redirect)
return outlinks
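
The worker simply threads site.user_agent through to browse_page. Calling the browser directly with the new keyword argument might look roughly like this; the import path, constructor arguments, stop() teardown, and URLs are assumptions for illustration, and error handling is omitted.

    # Hypothetical direct use of the new browse_page(user_agent=...) argument.
    from brozzler.browser import Browser  # assumed module path

    browser = Browser(chrome_exe="chromium-browser")  # assumed constructor args
    browser.start()
    try:
        outlinks = browser.browse_page(
                "https://example.com/",
                user_agent="mybot/1.0 (+https://example.org/crawl-info)")
        print("%s outlinks" % len(outlinks))
    finally:
        browser.stop()  # assumed teardown method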

View File

@@ -168,6 +168,18 @@ ignore_robots
If set to ``true``, brozzler will happily crawl pages that would otherwise be
blocked by robots.txt rules.
user_agent
----------
+-----------------------+---------+----------+---------+
| scope                 | type    | required | default |
+=======================+=========+==========+=========+
| seed-level, top-level | string  | no       | *none*  |
+-----------------------+---------+----------+---------+
The ``User-Agent`` header brozzler will send to identify itself to web servers.
It's good etiquette to include a project URL with a notice to webmasters that
explains why you're crawling, how to block the crawler with robots.txt, and how
to contact the operator if the crawl is causing problems.
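
For example, a hypothetical job configuration might set ``user_agent`` job-wide
and override it for a single seed, since the option is accepted at both scopes
per the table above (the seed ``url`` field and all values here are
illustrative)::

    id: my-job
    user_agent: mybot/1.0 (+https://example.org/crawl-info)
    seeds:
      - url: https://example.com/
      - url: https://example.org/
        user_agent: mybot/1.0 special-case (+https://example.org/crawl-info)
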
warcprox_meta
-------------
+-----------------------+------------+----------+---------+