mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-03 03:56:24 -04:00
support adding extra http request headers
This commit is contained in:
parent
c178ed1950
commit
2ba5bd4d4b
6 changed files with 21 additions and 13 deletions
|
@ -30,6 +30,8 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
|
|
||||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
|
# the way we're using it, amqp is too verbose at debug level
|
||||||
|
logging.getLogger("amqp").setLevel(logging.INFO)
|
||||||
|
|
||||||
def sigterm(signum, frame):
|
def sigterm(signum, frame):
|
||||||
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
||||||
|
|
|
@ -110,7 +110,8 @@ class Browser:
|
||||||
def abort_browse_page(self):
|
def abort_browse_page(self):
|
||||||
self._abort_browse_page = True
|
self._abort_browse_page = True
|
||||||
|
|
||||||
def browse_page(self, url, on_request=None, on_screenshot=None, on_url_change=None):
|
def browse_page(self, url, extra_headers=None, on_request=None,
|
||||||
|
on_screenshot=None, on_url_change=None):
|
||||||
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
||||||
|
|
||||||
Raises BrowsingException if browsing the page fails in a non-critical
|
Raises BrowsingException if browsing the page fails in a non-critical
|
||||||
|
@ -119,17 +120,16 @@ class Browser:
|
||||||
Returns extracted outlinks.
|
Returns extracted outlinks.
|
||||||
"""
|
"""
|
||||||
self.url = url
|
self.url = url
|
||||||
|
self.extra_headers = extra_headers
|
||||||
self.on_request = on_request
|
self.on_request = on_request
|
||||||
|
|
||||||
self.on_screenshot = on_screenshot
|
self.on_screenshot = on_screenshot
|
||||||
self._waiting_on_screenshot_msg_id = None
|
self.on_url_change = on_url_change
|
||||||
|
|
||||||
|
self._waiting_on_screenshot_msg_id = None
|
||||||
|
self._waiting_on_document_url_msg_id = None
|
||||||
self._waiting_on_outlinks_msg_id = None
|
self._waiting_on_outlinks_msg_id = None
|
||||||
self._outlinks = None
|
self._outlinks = None
|
||||||
|
|
||||||
self.on_url_change = on_url_change
|
|
||||||
self._waiting_on_document_url_msg_id = None
|
|
||||||
|
|
||||||
self._websock = websocket.WebSocketApp(self._websocket_url,
|
self._websock = websocket.WebSocketApp(self._websocket_url,
|
||||||
on_open=self._visit_page, on_message=self._handle_message)
|
on_open=self._visit_page, on_message=self._handle_message)
|
||||||
|
|
||||||
|
@ -197,6 +197,9 @@ class Browser:
|
||||||
self.send_to_chrome(method="Debugger.enable")
|
self.send_to_chrome(method="Debugger.enable")
|
||||||
self.send_to_chrome(method="Runtime.enable")
|
self.send_to_chrome(method="Runtime.enable")
|
||||||
|
|
||||||
|
if self.extra_headers:
|
||||||
|
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
|
||||||
|
|
||||||
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
||||||
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
||||||
|
|
||||||
|
|
|
@ -71,7 +71,7 @@ class BrozzlerHQDb:
|
||||||
def schedule_page(self, page, priority=0):
|
def schedule_page(self, page, priority=0):
|
||||||
cursor = self._conn.cursor()
|
cursor = self._conn.cursor()
|
||||||
cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)",
|
cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)",
|
||||||
(page.site_id, priority, page.canonical(), page.to_json()))
|
(page.site_id, priority, page.canon_url(), page.to_json()))
|
||||||
self._conn.commit()
|
self._conn.commit()
|
||||||
|
|
||||||
def sites(self):
|
def sites(self):
|
||||||
|
@ -88,7 +88,7 @@ class BrozzlerHQDb:
|
||||||
def update_page(self, page):
|
def update_page(self, page):
|
||||||
cursor = self._conn.cursor()
|
cursor = self._conn.cursor()
|
||||||
# CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text
|
# CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text
|
||||||
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canonical()))
|
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canon_url()))
|
||||||
row = cursor.fetchone()
|
row = cursor.fetchone()
|
||||||
if row:
|
if row:
|
||||||
# (id, priority, existing_page) = row
|
# (id, priority, existing_page) = row
|
||||||
|
@ -99,7 +99,7 @@ class BrozzlerHQDb:
|
||||||
cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0]))
|
cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0]))
|
||||||
self._conn.commit()
|
self._conn.commit()
|
||||||
else:
|
else:
|
||||||
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canonical()))
|
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canon_url()))
|
||||||
|
|
||||||
class BrozzlerHQ:
|
class BrozzlerHQ:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
|
@ -11,13 +11,15 @@ class Site:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, seed, id=None, scope_surt=None, proxy=None,
|
def __init__(self, seed, id=None, scope_surt=None, proxy=None,
|
||||||
ignore_robots=False, enable_warcprox_features=False, time_limit=None):
|
ignore_robots=False, time_limit=None, extra_headers=None,
|
||||||
|
enable_warcprox_features=False):
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.ignore_robots = ignore_robots
|
self.ignore_robots = ignore_robots
|
||||||
self.enable_warcprox_features = enable_warcprox_features
|
self.enable_warcprox_features = enable_warcprox_features
|
||||||
self.time_limit = time_limit
|
self.time_limit = time_limit
|
||||||
|
self.extra_headers = extra_headers
|
||||||
|
|
||||||
if scope_surt:
|
if scope_surt:
|
||||||
self.scope_surt = scope_surt
|
self.scope_surt = scope_surt
|
||||||
|
@ -82,10 +84,10 @@ class Page:
|
||||||
def calc_priority(self):
|
def calc_priority(self):
|
||||||
priority = 0
|
priority = 0
|
||||||
priority += max(0, 10 - self.hops_from_seed)
|
priority += max(0, 10 - self.hops_from_seed)
|
||||||
priority += max(0, 6 - self.canonical().count("/"))
|
priority += max(0, 6 - self.canon_url().count("/"))
|
||||||
return priority
|
return priority
|
||||||
|
|
||||||
def canonical(self):
|
def canon_url(self):
|
||||||
if self._canon_hurl is None:
|
if self._canon_hurl is None:
|
||||||
self._canon_hurl = surt.handyurl.parse(self.url)
|
self._canon_hurl = surt.handyurl.parse(self.url)
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||||
|
|
|
@ -113,6 +113,7 @@ class BrozzlerWorker:
|
||||||
self._try_youtube_dl(ydl, site, page)
|
self._try_youtube_dl(ydl, site, page)
|
||||||
|
|
||||||
page.outlinks = browser.browse_page(page.url,
|
page.outlinks = browser.browse_page(page.url,
|
||||||
|
extra_headers=site.extra_headers,
|
||||||
on_screenshot=on_screenshot,
|
on_screenshot=on_screenshot,
|
||||||
on_url_change=page.note_redirect)
|
on_url_change=page.note_redirect)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
kombu
|
kombu
|
||||||
websocket-client-py3==0.13.1
|
websocket-client
|
||||||
argparse
|
argparse
|
||||||
PyYAML
|
PyYAML
|
||||||
git+https://github.com/ikreymer/surt.git@py3
|
git+https://github.com/ikreymer/surt.git@py3
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue