mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Support adding extra HTTP request headers
This commit is contained in:
parent
c178ed1950
commit
2ba5bd4d4b
@ -30,6 +30,8 @@ args = arg_parser.parse_args(args=sys.argv[1:])
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
# the way we're using it, amqp is too verbose at debug level
|
||||
logging.getLogger("amqp").setLevel(logging.INFO)
|
||||
|
||||
def sigterm(signum, frame):
|
||||
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
||||
|
@ -110,7 +110,8 @@ class Browser:
|
||||
def abort_browse_page(self):
|
||||
self._abort_browse_page = True
|
||||
|
||||
def browse_page(self, url, on_request=None, on_screenshot=None, on_url_change=None):
|
||||
def browse_page(self, url, extra_headers=None, on_request=None,
|
||||
on_screenshot=None, on_url_change=None):
|
||||
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
||||
|
||||
Raises BrowsingException if browsing the page fails in a non-critical
|
||||
@ -119,17 +120,16 @@ class Browser:
|
||||
Returns extracted outlinks.
|
||||
"""
|
||||
self.url = url
|
||||
self.extra_headers = extra_headers
|
||||
self.on_request = on_request
|
||||
|
||||
self.on_screenshot = on_screenshot
|
||||
self._waiting_on_screenshot_msg_id = None
|
||||
self.on_url_change = on_url_change
|
||||
|
||||
self._waiting_on_screenshot_msg_id = None
|
||||
self._waiting_on_document_url_msg_id = None
|
||||
self._waiting_on_outlinks_msg_id = None
|
||||
self._outlinks = None
|
||||
|
||||
self.on_url_change = on_url_change
|
||||
self._waiting_on_document_url_msg_id = None
|
||||
|
||||
self._websock = websocket.WebSocketApp(self._websocket_url,
|
||||
on_open=self._visit_page, on_message=self._handle_message)
|
||||
|
||||
@ -197,6 +197,9 @@ class Browser:
|
||||
self.send_to_chrome(method="Debugger.enable")
|
||||
self.send_to_chrome(method="Runtime.enable")
|
||||
|
||||
if self.extra_headers:
|
||||
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
|
||||
|
||||
# Disable Google Analytics by setting a breakpoint in analytics.js; the resulting "Debugger.paused" event is caught in _handle_message().
|
||||
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
||||
|
||||
|
@ -71,7 +71,7 @@ class BrozzlerHQDb:
|
||||
def schedule_page(self, page, priority=0):
|
||||
cursor = self._conn.cursor()
|
||||
cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)",
|
||||
(page.site_id, priority, page.canonical(), page.to_json()))
|
||||
(page.site_id, priority, page.canon_url(), page.to_json()))
|
||||
self._conn.commit()
|
||||
|
||||
def sites(self):
|
||||
@ -88,7 +88,7 @@ class BrozzlerHQDb:
|
||||
def update_page(self, page):
|
||||
cursor = self._conn.cursor()
|
||||
# CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text
|
||||
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canonical()))
|
||||
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canon_url()))
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
# (id, priority, existing_page) = row
|
||||
@ -99,7 +99,7 @@ class BrozzlerHQDb:
|
||||
cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0]))
|
||||
self._conn.commit()
|
||||
else:
|
||||
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canonical()))
|
||||
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canon_url()))
|
||||
|
||||
class BrozzlerHQ:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
@ -11,13 +11,15 @@ class Site:
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, seed, id=None, scope_surt=None, proxy=None,
|
||||
ignore_robots=False, enable_warcprox_features=False, time_limit=None):
|
||||
ignore_robots=False, time_limit=None, extra_headers=None,
|
||||
enable_warcprox_features=False):
|
||||
self.seed = seed
|
||||
self.id = id
|
||||
self.proxy = proxy
|
||||
self.ignore_robots = ignore_robots
|
||||
self.enable_warcprox_features = enable_warcprox_features
|
||||
self.time_limit = time_limit
|
||||
self.extra_headers = extra_headers
|
||||
|
||||
if scope_surt:
|
||||
self.scope_surt = scope_surt
|
||||
@ -82,10 +84,10 @@ class Page:
|
||||
def calc_priority(self):
|
||||
priority = 0
|
||||
priority += max(0, 10 - self.hops_from_seed)
|
||||
priority += max(0, 6 - self.canonical().count("/"))
|
||||
priority += max(0, 6 - self.canon_url().count("/"))
|
||||
return priority
|
||||
|
||||
def canonical(self):
|
||||
def canon_url(self):
|
||||
if self._canon_hurl is None:
|
||||
self._canon_hurl = surt.handyurl.parse(self.url)
|
||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||
|
@ -113,6 +113,7 @@ class BrozzlerWorker:
|
||||
self._try_youtube_dl(ydl, site, page)
|
||||
|
||||
page.outlinks = browser.browse_page(page.url,
|
||||
extra_headers=site.extra_headers,
|
||||
on_screenshot=on_screenshot,
|
||||
on_url_change=page.note_redirect)
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
kombu
|
||||
websocket-client-py3==0.13.1
|
||||
websocket-client
|
||||
argparse
|
||||
PyYAML
|
||||
git+https://github.com/ikreymer/surt.git@py3
|
||||
|
Loading…
x
Reference in New Issue
Block a user