diff --git a/brozzler/browser.py b/brozzler/browser.py index 17d7153..1b3301b 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -229,6 +229,7 @@ class Browser: def browse_page( self, url, extra_headers=None, behavior_parameters=None, + user_agent=None, on_request=None, on_response=None, on_screenshot=None, on_url_change=None): """ @@ -243,6 +244,7 @@ class Browser: raise BrowsingException("browser has not been started") self.url = url self.extra_headers = extra_headers + self.user_agent = user_agent self.on_request = on_request self.on_screenshot = on_screenshot self.on_url_change = on_url_change @@ -460,6 +462,9 @@ __brzl_compileOutlinks(window).join(' '); if self.extra_headers: self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers}) + if self.user_agent: + self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent}) + # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused" self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"}) diff --git a/brozzler/job.py b/brozzler/job.py index 3a7b874..66dbd5d 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -85,7 +85,8 @@ def new_job(frontier, job_conf): "enable_warcprox_features"), warcprox_meta=merged_conf.get("warcprox_meta"), metadata=merged_conf.get("metadata"), - remember_outlinks=merged_conf.get("remember_outlinks")) + remember_outlinks=merged_conf.get("remember_outlinks"), + user_agent=merged_conf.get("user_agent")) sites.append(site) # insert all the sites into database before the job diff --git a/brozzler/job_schema.yml b/brozzler/job_schema.yml index e9d2078..bcaefe3 100644 --- a/brozzler/job_schema.yml +++ b/brozzler/job_schema.yml @@ -64,6 +64,9 @@ id: metadata: type: dict + user_agent: + type: string + seeds: type: list schema: diff --git a/brozzler/robots.py b/brozzler/robots.py index 
1cd7e87..b4732ae 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -42,6 +42,8 @@ def _robots_cache(site): req_sesh.proxies = {"http":proxie,"https":proxie} if site.extra_headers(): req_sesh.headers.update(site.extra_headers()) + if site.user_agent: + req_sesh.headers['User-Agent'] = site.user_agent _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh) return _robots_caches[site.id] diff --git a/brozzler/site.py b/brozzler/site.py index cb75d5e..f474d9b 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable): enable_warcprox_features=False, reached_limit=None, status="ACTIVE", claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, - last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None): + last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, + cookie_db=None, user_agent=None): self.seed = seed self.id = id @@ -111,6 +112,7 @@ class Site(brozzler.BaseDictable): self.metadata = metadata self.remember_outlinks = remember_outlinks self.cookie_db = cookie_db + self.user_agent = user_agent self.scope = scope or {} if not "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index 38e7ecb..1cb6aa3 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -260,6 +260,7 @@ class BrozzlerWorker: browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db) outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), + user_agent=site.user_agent, on_screenshot=_on_screenshot, on_url_change=page.note_redirect) return outlinks diff --git a/job-conf.rst b/job-conf.rst index 056c7ca..6773afd 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -168,6 +168,18 @@ ignore_robots If set to ``true``, brozzler will happily crawl pages that would otherwise be blocked by robots.txt rules. 
+user_agent +---------- ++-----------------------+---------+----------+---------+ +| scope | type | required | default | ++=======================+=========+==========+=========+ +| seed-level, top-level | string | no | *none* | ++-----------------------+---------+----------+---------+ +The ``User-Agent`` header brozzler will send to identify itself to web servers. +It's good etiquette to include a project URL with a notice to webmasters that +explains why you're crawling, how to block the crawler using robots.txt and how to +contact the operator if the crawl is causing problems. + warcprox_meta ------------- +-----------------------+------------+----------+---------+