mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
commit
f5e138c7eb
@ -229,6 +229,7 @@ class Browser:
|
|||||||
|
|
||||||
def browse_page(
|
def browse_page(
|
||||||
self, url, extra_headers=None, behavior_parameters=None,
|
self, url, extra_headers=None, behavior_parameters=None,
|
||||||
|
user_agent=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
on_url_change=None):
|
on_url_change=None):
|
||||||
"""
|
"""
|
||||||
@ -243,6 +244,7 @@ class Browser:
|
|||||||
raise BrowsingException("browser has not been started")
|
raise BrowsingException("browser has not been started")
|
||||||
self.url = url
|
self.url = url
|
||||||
self.extra_headers = extra_headers
|
self.extra_headers = extra_headers
|
||||||
|
self.user_agent = user_agent
|
||||||
self.on_request = on_request
|
self.on_request = on_request
|
||||||
self.on_screenshot = on_screenshot
|
self.on_screenshot = on_screenshot
|
||||||
self.on_url_change = on_url_change
|
self.on_url_change = on_url_change
|
||||||
@ -460,6 +462,9 @@ __brzl_compileOutlinks(window).join(' ');
|
|||||||
if self.extra_headers:
|
if self.extra_headers:
|
||||||
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
|
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
|
||||||
|
|
||||||
|
if self.user_agent:
|
||||||
|
self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})
|
||||||
|
|
||||||
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
||||||
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
||||||
|
|
||||||
|
@ -85,7 +85,8 @@ def new_job(frontier, job_conf):
|
|||||||
"enable_warcprox_features"),
|
"enable_warcprox_features"),
|
||||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||||
metadata=merged_conf.get("metadata"),
|
metadata=merged_conf.get("metadata"),
|
||||||
remember_outlinks=merged_conf.get("remember_outlinks"))
|
remember_outlinks=merged_conf.get("remember_outlinks"),
|
||||||
|
user_agent=merged_conf.get("user_agent"))
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
# insert all the sites into database before the job
|
# insert all the sites into database before the job
|
||||||
|
@ -64,6 +64,9 @@ id:
|
|||||||
metadata:
|
metadata:
|
||||||
type: dict
|
type: dict
|
||||||
|
|
||||||
|
user_agent:
|
||||||
|
type: string
|
||||||
|
|
||||||
seeds:
|
seeds:
|
||||||
type: list
|
type: list
|
||||||
required: true
|
required: true
|
||||||
|
@ -42,6 +42,8 @@ def _robots_cache(site):
|
|||||||
req_sesh.proxies = {"http":proxie,"https":proxie}
|
req_sesh.proxies = {"http":proxie,"https":proxie}
|
||||||
if site.extra_headers():
|
if site.extra_headers():
|
||||||
req_sesh.headers.update(site.extra_headers())
|
req_sesh.headers.update(site.extra_headers())
|
||||||
|
if site.user_agent:
|
||||||
|
req_sesh.headers['User-Agent'] = site.user_agent
|
||||||
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
||||||
|
|
||||||
return _robots_caches[site.id]
|
return _robots_caches[site.id]
|
||||||
|
@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
|
|||||||
enable_warcprox_features=False, reached_limit=None,
|
enable_warcprox_features=False, reached_limit=None,
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||||
|
cookie_db=None, user_agent=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
@ -111,6 +112,7 @@ class Site(brozzler.BaseDictable):
|
|||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
self.remember_outlinks = remember_outlinks
|
self.remember_outlinks = remember_outlinks
|
||||||
self.cookie_db = cookie_db
|
self.cookie_db = cookie_db
|
||||||
|
self.user_agent = user_agent
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
@ -260,6 +260,7 @@ class BrozzlerWorker:
|
|||||||
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
||||||
outlinks = browser.browse_page(
|
outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
|
user_agent=site.user_agent,
|
||||||
on_screenshot=_on_screenshot,
|
on_screenshot=_on_screenshot,
|
||||||
on_url_change=page.note_redirect)
|
on_url_change=page.note_redirect)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
12
job-conf.rst
12
job-conf.rst
@ -168,6 +168,18 @@ ignore_robots
|
|||||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||||
blocked by robots.txt rules.
|
blocked by robots.txt rules.
|
||||||
|
|
||||||
|
user_agent
|
||||||
|
----------
|
||||||
|
+-----------------------+---------+----------+---------+
|
||||||
|
| scope | type | required | default |
|
||||||
|
+=======================+=========+==========+=========+
|
||||||
|
| seed-level, top-level | string | no | *none* |
|
||||||
|
+-----------------------+---------+----------+---------+
|
||||||
|
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
||||||
|
It's good etiquette to include a project URL with a notice to webmasters that
|
||||||
|
explains why you're crawling, how to block the crawler via robots.txt, and how to
|
||||||
|
contact the operator if the crawl is causing problems.
|
||||||
|
|
||||||
warcprox_meta
|
warcprox_meta
|
||||||
-------------
|
-------------
|
||||||
+-----------------------+------------+----------+---------+
|
+-----------------------+------------+----------+---------+
|
||||||
|
Loading…
x
Reference in New Issue
Block a user