Support adding extra HTTP request headers

This commit is contained in:
Noah Levitt 2015-07-17 13:45:27 -07:00
parent c178ed1950
commit 2ba5bd4d4b
6 changed files with 21 additions and 13 deletions

View file

@ -30,6 +30,8 @@ args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level, logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
# the way we're using it, amqp is too verbose at debug level
logging.getLogger("amqp").setLevel(logging.INFO)
def sigterm(signum, frame): def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)') raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')

View file

@ -110,7 +110,8 @@ class Browser:
def abort_browse_page(self): def abort_browse_page(self):
self._abort_browse_page = True self._abort_browse_page = True
def browse_page(self, url, on_request=None, on_screenshot=None, on_url_change=None): def browse_page(self, url, extra_headers=None, on_request=None,
on_screenshot=None, on_url_change=None):
"""Synchronously loads a page, takes a screenshot, and runs behaviors. """Synchronously loads a page, takes a screenshot, and runs behaviors.
Raises BrowsingException if browsing the page fails in a non-critical Raises BrowsingException if browsing the page fails in a non-critical
@ -119,17 +120,16 @@ class Browser:
Returns extracted outlinks. Returns extracted outlinks.
""" """
self.url = url self.url = url
self.extra_headers = extra_headers
self.on_request = on_request self.on_request = on_request
self.on_screenshot = on_screenshot self.on_screenshot = on_screenshot
self._waiting_on_screenshot_msg_id = None self.on_url_change = on_url_change
self._waiting_on_screenshot_msg_id = None
self._waiting_on_document_url_msg_id = None
self._waiting_on_outlinks_msg_id = None self._waiting_on_outlinks_msg_id = None
self._outlinks = None self._outlinks = None
self.on_url_change = on_url_change
self._waiting_on_document_url_msg_id = None
self._websock = websocket.WebSocketApp(self._websocket_url, self._websock = websocket.WebSocketApp(self._websocket_url,
on_open=self._visit_page, on_message=self._handle_message) on_open=self._visit_page, on_message=self._handle_message)
@ -197,6 +197,9 @@ class Browser:
self.send_to_chrome(method="Debugger.enable") self.send_to_chrome(method="Debugger.enable")
self.send_to_chrome(method="Runtime.enable") self.send_to_chrome(method="Runtime.enable")
if self.extra_headers:
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused" # disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"}) self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

View file

@ -71,7 +71,7 @@ class BrozzlerHQDb:
def schedule_page(self, page, priority=0): def schedule_page(self, page, priority=0):
cursor = self._conn.cursor() cursor = self._conn.cursor()
cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)", cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)",
(page.site_id, priority, page.canonical(), page.to_json())) (page.site_id, priority, page.canon_url(), page.to_json()))
self._conn.commit() self._conn.commit()
def sites(self): def sites(self):
@ -88,7 +88,7 @@ class BrozzlerHQDb:
def update_page(self, page): def update_page(self, page):
cursor = self._conn.cursor() cursor = self._conn.cursor()
# CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text # CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canonical())) cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canon_url()))
row = cursor.fetchone() row = cursor.fetchone()
if row: if row:
# (id, priority, existing_page) = row # (id, priority, existing_page) = row
@ -99,7 +99,7 @@ class BrozzlerHQDb:
cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0])) cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0]))
self._conn.commit() self._conn.commit()
else: else:
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canonical())) raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canon_url()))
class BrozzlerHQ: class BrozzlerHQ:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)

View file

@ -11,13 +11,15 @@ class Site:
logger = logging.getLogger(__module__ + "." + __qualname__) logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, seed, id=None, scope_surt=None, proxy=None, def __init__(self, seed, id=None, scope_surt=None, proxy=None,
ignore_robots=False, enable_warcprox_features=False, time_limit=None): ignore_robots=False, time_limit=None, extra_headers=None,
enable_warcprox_features=False):
self.seed = seed self.seed = seed
self.id = id self.id = id
self.proxy = proxy self.proxy = proxy
self.ignore_robots = ignore_robots self.ignore_robots = ignore_robots
self.enable_warcprox_features = enable_warcprox_features self.enable_warcprox_features = enable_warcprox_features
self.time_limit = time_limit self.time_limit = time_limit
self.extra_headers = extra_headers
if scope_surt: if scope_surt:
self.scope_surt = scope_surt self.scope_surt = scope_surt
@ -82,10 +84,10 @@ class Page:
def calc_priority(self): def calc_priority(self):
priority = 0 priority = 0
priority += max(0, 10 - self.hops_from_seed) priority += max(0, 10 - self.hops_from_seed)
priority += max(0, 6 - self.canonical().count("/")) priority += max(0, 6 - self.canon_url().count("/"))
return priority return priority
def canonical(self): def canon_url(self):
if self._canon_hurl is None: if self._canon_hurl is None:
self._canon_hurl = surt.handyurl.parse(self.url) self._canon_hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl) surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)

View file

@ -113,6 +113,7 @@ class BrozzlerWorker:
self._try_youtube_dl(ydl, site, page) self._try_youtube_dl(ydl, site, page)
page.outlinks = browser.browse_page(page.url, page.outlinks = browser.browse_page(page.url,
extra_headers=site.extra_headers,
on_screenshot=on_screenshot, on_screenshot=on_screenshot,
on_url_change=page.note_redirect) on_url_change=page.note_redirect)

View file

@ -1,5 +1,5 @@
kombu kombu
websocket-client-py3==0.13.1 websocket-client
argparse argparse
PyYAML PyYAML
git+https://github.com/ikreymer/surt.git@py3 git+https://github.com/ikreymer/surt.git@py3