support adding extra http request headers

Noah Levitt 2015-07-17 13:45:27 -07:00
parent c178ed1950
commit 2ba5bd4d4b
6 changed files with 21 additions and 13 deletions

View File

@@ -30,6 +30,8 @@ args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
# the way we're using it, amqp is too verbose at debug level
logging.getLogger("amqp").setLevel(logging.INFO)
def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
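The two added lines in this hunk keep the process at whatever log level the operator chose while capping the chatty amqp library at INFO. A minimal standalone sketch of the same pattern (the format string and messages here are illustrative, not copied from the script):

import logging
import sys

# Application-wide logging at DEBUG, mirroring the basicConfig call above.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
        format="%(asctime)s %(levelname)s %(name)s %(message)s")

# Cap one noisy dependency at INFO without touching anything else.
logging.getLogger("amqp").setLevel(logging.INFO)

logging.getLogger(__name__).debug("still emitted at DEBUG")
logging.getLogger("amqp").debug("suppressed")  # below the amqp logger's level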

View File

@@ -110,7 +110,8 @@ class Browser:
def abort_browse_page(self):
self._abort_browse_page = True
def browse_page(self, url, on_request=None, on_screenshot=None, on_url_change=None):
def browse_page(self, url, extra_headers=None, on_request=None,
on_screenshot=None, on_url_change=None):
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
Raises BrowsingException if browsing the page fails in a non-critical
@@ -119,17 +120,16 @@ class Browser:
Returns extracted outlinks.
"""
self.url = url
self.extra_headers = extra_headers
self.on_request = on_request
self.on_screenshot = on_screenshot
self._waiting_on_screenshot_msg_id = None
self.on_url_change = on_url_change
self._waiting_on_screenshot_msg_id = None
self._waiting_on_document_url_msg_id = None
self._waiting_on_outlinks_msg_id = None
self._outlinks = None
self.on_url_change = on_url_change
self._waiting_on_document_url_msg_id = None
self._websock = websocket.WebSocketApp(self._websocket_url,
on_open=self._visit_page, on_message=self._handle_message)
@@ -197,6 +197,9 @@ class Browser:
self.send_to_chrome(method="Debugger.enable")
self.send_to_chrome(method="Runtime.enable")
if self.extra_headers:
self.send_to_chrome(method="Network.setExtraHTTPHeaders", params={"headers":self.extra_headers})
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})

View File

@@ -71,7 +71,7 @@ class BrozzlerHQDb:
def schedule_page(self, page, priority=0):
cursor = self._conn.cursor()
cursor.execute("insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) values (?, ?, ?, ?, 0)",
(page.site_id, priority, page.canonical(), page.to_json()))
(page.site_id, priority, page.canon_url(), page.to_json()))
self._conn.commit()
def sites(self):
@@ -88,7 +88,7 @@ class BrozzlerHQDb:
def update_page(self, page):
cursor = self._conn.cursor()
# CREATE TABLE brozzler_pages ( id integer primary key, site_id integer, priority integer, in_progress boolean, canon_url varchar(4000), page_json text
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canonical()))
cursor.execute("select id, priority, page_json from brozzler_pages where site_id=? and canon_url=?", (page.site_id, page.canon_url()))
row = cursor.fetchone()
if row:
# (id, priority, existing_page) = row
@@ -99,7 +99,7 @@ class BrozzlerHQDb:
cursor.execute("update brozzler_pages set priority=?, page_json=? where id=?", (new_priority, existing_page.to_json(), row[0]))
self._conn.commit()
else:
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canonical()))
raise KeyError("page not in brozzler_pages site_id={} canon_url={}".format(page.site_id, page.canon_url()))
class BrozzlerHQ:
logger = logging.getLogger(__module__ + "." + __qualname__)
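The canonical() to canon_url() rename touches every place the canonical URL serves as the lookup key in the brozzler_pages table. A pared-down sqlite sketch of that bookkeeping, with the schema copied from the comment in the hunk above and everything else simplified:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    create table brozzler_pages (
        id integer primary key, site_id integer, priority integer,
        in_progress boolean, canon_url varchar(4000), page_json text)""")

def schedule_page(site_id, canon_url, page_json, priority=0):
    # canon_url (the output of Page.canon_url()) is what later lookups key on
    conn.execute(
            "insert into brozzler_pages (site_id, priority, canon_url, page_json, in_progress) "
            "values (?, ?, ?, ?, 0)",
            (site_id, priority, canon_url, page_json))
    conn.commit()

schedule_page(1, "http://example.com/", '{"url": "http://example.com/"}')
print(conn.execute("select count(*) from brozzler_pages").fetchone()[0])  # 1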

View File

@@ -11,13 +11,15 @@ class Site:
logger = logging.getLogger(__module__ + "." + __qualname__)
def __init__(self, seed, id=None, scope_surt=None, proxy=None,
ignore_robots=False, enable_warcprox_features=False, time_limit=None):
ignore_robots=False, time_limit=None, extra_headers=None,
enable_warcprox_features=False):
self.seed = seed
self.id = id
self.proxy = proxy
self.ignore_robots = ignore_robots
self.enable_warcprox_features = enable_warcprox_features
self.time_limit = time_limit
self.extra_headers = extra_headers
if scope_surt:
self.scope_surt = scope_surt
@@ -82,10 +84,10 @@ class Page:
def calc_priority(self):
priority = 0
priority += max(0, 10 - self.hops_from_seed)
priority += max(0, 6 - self.canonical().count("/"))
priority += max(0, 6 - self.canon_url().count("/"))
return priority
def canonical(self):
def canon_url(self):
if self._canon_hurl is None:
self._canon_hurl = surt.handyurl.parse(self.url)
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
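Putting the two renamed pieces together: canon_url() runs the URL through surt's Google canonicalizer, and calc_priority() favors pages that are close to the seed and shallow in the path hierarchy. A free-standing sketch of that logic; the geturl() serialization call is an assumption (the diff cuts off before the return statement), and exact canonical output depends on the surt version:

import surt  # the py3 branch pinned in requirements.txt

def canon_url(url):
    # same steps as Page.canon_url() above; geturl() is assumed from context
    hurl = surt.handyurl.parse(url)
    surt.GoogleURLCanonicalizer.canonicalize(hurl)
    return hurl.geturl()

def calc_priority(url, hops_from_seed):
    # mirrors Page.calc_priority(): fewer hops and shallower paths score higher
    priority = 0
    priority += max(0, 10 - hops_from_seed)
    priority += max(0, 6 - canon_url(url).count("/"))
    return priority

print(calc_priority("http://example.com/", 0))         # seed page, high priority
print(calc_priority("http://example.com/a/b/c/d", 3))  # deeper page, lower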

View File

@@ -113,6 +113,7 @@ class BrozzlerWorker:
self._try_youtube_dl(ydl, site, page)
page.outlinks = browser.browse_page(page.url,
extra_headers=site.extra_headers,
on_screenshot=on_screenshot,
on_url_change=page.note_redirect)
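With the Site change above, per-site headers only need to be configured once; the worker then forwards them on every browse_page() call as shown in this hunk. A hedged usage sketch, assuming Site is exposed at the package top level and with an illustrative Warcprox-Meta value:

import brozzler

# Assumption: brozzler.Site is the class shown in the diff above; the header
# value is illustrative and depends on the warcprox setup.
site = brozzler.Site("http://example.com/",
        extra_headers={"Warcprox-Meta": '{"warc-prefix": "example-job"}'},
        enable_warcprox_features=True)

# The worker passes these straight through to the browser, i.e.
#   browser.browse_page(page.url, extra_headers=site.extra_headers, ...)
print(site.extra_headers)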

View File

@@ -1,5 +1,5 @@
kombu
websocket-client-py3==0.13.1
websocket-client
argparse
PyYAML
git+https://github.com/ikreymer/surt.git@py3
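The pin on the py3-specific websocket-client fork is dropped in favor of whatever current release pip resolves. A quick sanity check that an unpinned install still provides the interface the Browser class relies on (WebSocketApp with on_open/on_message callbacks); the debugger URL is a placeholder:

import websocket  # pip install websocket-client

print(websocket.__version__)

# Same constructor shape the Browser class uses; run_forever() is omitted
# because it blocks waiting for a live Chrome debugger endpoint.
ws = websocket.WebSocketApp(
        "ws://localhost:9222/devtools/page/TARGET-ID",
        on_open=lambda ws: None,
        on_message=lambda ws, message: None)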