From eabb0fb114e5af634183db0a52a29f3caa9483b6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 21 Dec 2016 18:35:55 -0800
Subject: [PATCH 1/5] restore support for on_response and on_request, with an
 automated test for on_response

---
 brozzler/browser.py             | 29 ++++++++++++++++++++-
 tests/htdocs/site3/brozzler.svg |  1 +
 tests/htdocs/site3/page.html    |  9 +++++
 tests/test_brozzling.py         | 62 +++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+), 1 deletion(-)
 create mode 120000 tests/htdocs/site3/brozzler.svg
 create mode 100644 tests/htdocs/site3/page.html

diff --git a/brozzler/browser.py b/brozzler/browser.py
index a947044..9765ab1 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -120,6 +120,9 @@ class WebsockReceiverThread(threading.Thread):
         self.is_open = False
         self.got_page_load_event = None

+        self.on_request = None
+        self.on_response = None
+
         self._result_messages = {}

     def expect_result(self, msg_id):
@@ -178,14 +181,32 @@
             # resume execution
             self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))

+    def _network_response_received(self, message):
+        # if (not self._reached_limit
+        #         and message['params']['response']['status'] == 420
+        #         and 'Warcprox-Meta' in CaseInsensitiveDict(
+        #             message['params']['response']['headers'])):
+        #     warcprox_meta = json.loads(CaseInsensitiveDict(
+        #         message['params']['response']['headers'])['Warcprox-Meta'])
+        #     self._reached_limit = brozzler.ReachedLimit(
+        #             warcprox_meta=warcprox_meta)
+        #     self.logger.info('reached limit %s', self._reached_limit)
+        if self.on_response:
+            self.on_response(message)
+
     def _handle_message(self, websock, json_message):
         message = json.loads(json_message)
         if 'method' in message:
             if message['method'] == 'Page.loadEventFired':
                 self.got_page_load_event = datetime.datetime.utcnow()
+            elif message['method'] == 'Network.responseReceived':
+                self._network_response_received(message)
+            elif message['method'] == 'Network.requestWillBeSent':
+                if self.on_request:
+                    self.on_request(message)
             elif message['method'] == 'Debugger.paused':
                 self._debugger_paused(message)
-            elif message["method"] == "Inspector.targetCrashed":
+            elif message['method'] == 'Inspector.targetCrashed':
                 self.logger.error(
                         '''chrome tab went "aw snap" or "he's dead jim"!''')
                 brozzler.thread_raise(self.calling_thread, BrowsingException)
@@ -375,6 +396,10 @@ class Browser:
         if self.is_browsing:
             raise BrowsingException('browser is already busy browsing a page')
         self.is_browsing = True
+        if on_request:
+            self.websock_thread.on_request = on_request
+        if on_response:
+            self.websock_thread.on_response = on_response
         try:
             self.navigate_to_page(
                     page_url, extra_headers=extra_headers,
@@ -402,6 +427,8 @@
             raise BrowsingException(e)
         finally:
             self.is_browsing = False
+            self.websock_thread.on_request = None
+            self.websock_thread.on_response = None

     def navigate_to_page(
             self, page_url, extra_headers=None, user_agent=None, timeout=300):
diff --git a/tests/htdocs/site3/brozzler.svg b/tests/htdocs/site3/brozzler.svg
new file mode 120000
index 0000000..5069ef8
--- /dev/null
+++ b/tests/htdocs/site3/brozzler.svg
@@ -0,0 +1 @@
+../../../brozzler/dashboard/static/brozzler.svg
\ No newline at end of file
diff --git a/tests/htdocs/site3/page.html b/tests/htdocs/site3/page.html
new file mode 100644
index 0000000..64d4e1a
--- /dev/null
+++ b/tests/htdocs/site3/page.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+<title>some simple html</title>
+</head>
+<body>
+<div>an image</div>
+<img src="brozzler.svg">
+</body>
+</html>
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index b41278d..ef7b0d5 100644
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -19,9 +19,71 @@ limitations under the License.

 import pytest
 import brozzler
+import logging
+import os
+import http.server
+import threading
+import argparse
+
+args = argparse.Namespace()
+args.log_level = logging.INFO
+brozzler.cli._configure_logging(args)
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_httpd(httpd):
+    '''
+    Tests that our http server is working as expected, and that two fetches
+    of the same url return the same payload, proving it can be used to test
+    deduplication.
+    '''
+    payload1 = content2 = None
+    url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
+    with urllib.request.urlopen(url) as response:
+        assert response.status == 200
+        payload1 = response.read()
+        assert payload1
+
+    with urllib.request.urlopen(url) as response:
+        assert response.status == 200
+        payload2 = response.read()
+        assert payload2
+
+    assert payload1 == payload2

 def test_aw_snap_hes_dead_jim():
     chrome_exe = brozzler.suggest_default_chrome_exe()
     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
         with pytest.raises(brozzler.BrowsingException):
             browser.browse_page('chrome://crash')
+
+def test_on_response(httpd):
+    response_urls = []
+    def on_response(msg):
+        response_urls.append(msg['params']['response']['url'])
+
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    url = 'http://localhost:%s/site3/page.html' % httpd.server_port
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        browser.browse_page(url, on_response=on_response)
+        browser.browse_page(url)
+    assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
+    assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
+    assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port

From e5fb6cb4b9633c05afc956759b843b1a6693f90d Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 21 Dec 2016 19:19:34 -0800
Subject: [PATCH 2/5] add import missing from test

---
 setup.py                | 2 +-
 tests/test_brozzling.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 618c0c8..c464b8a 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev154',
+        version='1.1b9.dev155',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index ef7b0d5..870ff02 100644
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -24,6 +24,7 @@ import os
 import http.server
 import threading
 import argparse
+import urllib

 args = argparse.Namespace()
 args.log_level = logging.INFO
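
Note on usage: the on_request and on_response hooks restored by patches 1-2 are plain callables that receive the raw Chrome DevTools protocol messages for Network.requestWillBeSent and Network.responseReceived events. A minimal sketch of how calling code might use them outside the test suite; the target URL and the fields pulled out of each message are illustrative, not something these patches prescribe:

    import brozzler

    def on_request(msg):
        # msg is a raw DevTools 'Network.requestWillBeSent' event
        print('requested:', msg['params']['request']['url'])

    def on_response(msg):
        # msg is a raw DevTools 'Network.responseReceived' event
        response = msg['params']['response']
        print('received: %s %s' % (response['status'], response['url']))

    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
                'http://example.com/',  # illustrative url
                on_request=on_request, on_response=on_response)

Because browse_page() clears both callbacks in its finally block, the hooks apply only to the page passed in that call; a later browse_page() on the same Browser runs without them unless they are passed again.
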
From 70b67942a543bf0b61a7953e09226607cb489fa2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 22 Dec 2016 13:44:09 -0800
Subject: [PATCH 3/5] restore handling of 420 Reached limit, with a rudimentary
 test

---
 brozzler/browser.py     | 30 +++++++++++++++++++++---------
 setup.py                |  2 +-
 tests/test_brozzling.py | 42 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 9765ab1..214ea4f 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -119,6 +119,7 @@ class WebsockReceiverThread(threading.Thread):
         self.is_open = False
         self.got_page_load_event = None
+        self.reached_limit = None

         self.on_request = None
         self.on_response = None
@@ -182,15 +183,22 @@
             self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))

     def _network_response_received(self, message):
-        # if (not self._reached_limit
-        #         and message['params']['response']['status'] == 420
-        #         and 'Warcprox-Meta' in CaseInsensitiveDict(
-        #             message['params']['response']['headers'])):
-        #     warcprox_meta = json.loads(CaseInsensitiveDict(
-        #         message['params']['response']['headers'])['Warcprox-Meta'])
-        #     self._reached_limit = brozzler.ReachedLimit(
-        #             warcprox_meta=warcprox_meta)
-        #     self.logger.info('reached limit %s', self._reached_limit)
+        if (message['params']['response']['status'] == 420
+                and 'Warcprox-Meta' in CaseInsensitiveDict(
+                    message['params']['response']['headers'])):
+            if not self.reached_limit:
+                warcprox_meta = json.loads(CaseInsensitiveDict(
+                    message['params']['response']['headers'])['Warcprox-Meta'])
+                self.reached_limit = brozzler.ReachedLimit(
+                        warcprox_meta=warcprox_meta)
+                self.logger.info('reached limit %s', self.reached_limit)
+                brozzler.thread_raise(
+                        self.calling_thread, brozzler.ReachedLimit)
+            else:
+                self.logger.info(
+                        'reached limit but self.reached_limit is already set, '
+                        'assuming the calling thread is already handling this: %s',
+                        self.reached_limit)
         if self.on_response:
             self.on_response(message)
@@ -422,6 +430,10 @@ class Browser:
                 ## outlinks += retrieve_outlinks (60 sec)
                 final_page_url = self.url()
                 return final_page_url, outlinks
+            except brozzler.ReachedLimit:
+                # websock_thread has stashed the ReachedLimit exception with
+                # more information, raise that one
+                raise self.websock_thread.reached_limit
             except websocket.WebSocketConnectionClosedException as e:
                 self.logger.error('websocket closed, did chrome die?')
                 raise BrowsingException(e)
diff --git a/setup.py b/setup.py
index c464b8a..8f88b8c 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev155',
+        version='1.1b9.dev156',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py
index 870ff02..563f106 100644
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@@ -25,18 +25,44 @@
 import http.server
 import threading
 import argparse
 import urllib
+import json

 args = argparse.Namespace()
 args.log_level = logging.INFO
 brozzler.cli._configure_logging(args)

+WARCPROX_META_420 = {
+    'stats': {
+        'test_limits_bucket': {
+            'total': {'urls': 0, 'wire_bytes': 0},
+            'new': {'urls': 0, 'wire_bytes': 0},
+            'revisit': {'urls': 0, 'wire_bytes': 0},
+            'bucket': 'test_limits_bucket'
+        }
+    },
+    'reached-limit': {'test_limits_bucket/total/urls': 0}
+}
+
 @pytest.fixture(scope='module')
 def httpd(request):
+    class RequestHandler(http.server.SimpleHTTPRequestHandler):
+        def do_GET(self):
+            if self.path == '/420':
+                self.send_response(420, 'Reached limit')
+                self.send_header('Connection', 'close')
+                self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
+                payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
+                self.send_header('Content-Type', 'text/plain;charset=utf-8')
+                self.send_header('Content-Length', len(payload))
+                self.end_headers()
+                self.wfile.write(payload)
+            else:
+                super().do_GET()
+
     # SimpleHTTPRequestHandler always uses CWD so we have to chdir
     os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))

-    httpd = http.server.HTTPServer(
-            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
     httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
     httpd_thread.start()
@@ -68,6 +94,11 @@ def test_httpd(httpd):

     assert payload1 == payload2

+    url = 'http://localhost:%s/420' % httpd.server_port
+    with pytest.raises(urllib.error.HTTPError) as excinfo:
+        urllib.request.urlopen(url)
+    assert excinfo.value.getcode() == 420
+
 def test_aw_snap_hes_dead_jim():
     chrome_exe = brozzler.suggest_default_chrome_exe()
     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
         with pytest.raises(brozzler.BrowsingException):
             browser.browse_page('chrome://crash')
@@ -88,3 +119,10 @@
     assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
     assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port

+def test_420(httpd):
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    url = 'http://localhost:%s/420' % httpd.server_port
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        with pytest.raises(brozzler.ReachedLimit) as excinfo:
+            browser.browse_page(url)
+    assert excinfo.value.warcprox_meta == WARCPROX_META_420
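
With the patch above applied, a 420 response carrying a Warcprox-Meta header makes the websocket receiver thread stash a brozzler.ReachedLimit built from that header, raise it into the browsing thread, and browse_page() then re-raises the stashed exception. A rough sketch of how calling code might handle it, assuming Chrome is being proxied through a warcprox instance that enforces limits (that setup is not shown, and the URL is illustrative):

    import brozzler

    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        try:
            browser.browse_page('http://example.com/some/page')
        except brozzler.ReachedLimit as e:
            # e.warcprox_meta is the parsed Warcprox-Meta response header;
            # its 'reached-limit' section names the exhausted bucket
            print('hit a warcprox limit:', e.warcprox_meta.get('reached-limit'))
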
From c2704b18be37b23746e9731c1ba22189538158d2 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 4 Jan 2017 14:57:34 -0800
Subject: [PATCH 4/5] restore BrozzlerWorker built-in support for managing its
 own thread

---
 brozzler/worker.py | 28 ++++++++++++++++++++++++++++
 setup.py           |  2 +-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/brozzler/worker.py b/brozzler/worker.py
index 90a6442..5eb3128 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -109,6 +109,9 @@ class BrozzlerWorker:
         self._browsing_threads = set()
         self._browsing_threads_lock = threading.Lock()

+        self._thread = None
+        self._start_stop_lock = threading.Lock()
+
     def _proxy(self, site):
         if site.proxy:
             return site.proxy
@@ -458,3 +461,28 @@
             for th in thredz:
                 th.join()

+    def start(self):
+        with self._start_stop_lock:
+            if self._thread:
+                self.logger.warn(
+                        'ignoring start request because self._thread is '
+                        'not None')
+                return
+            self._thread = threading.Thread(
+                    target=self.run, name="BrozzlerWorker")
+            self._thread.start()
+
+    def shutdown_now(self):
+        self.stop()
+
+    def stop(self):
+        with self._start_stop_lock:
+            if self._thread and self._thread.is_alive():
+                self.logger.info("brozzler worker shutting down")
+                brozzler.thread_raise(self._thread, brozzler.ShutdownRequested)
+                self._thread.join()
+                self._thread = None
+
+    def is_alive(self):
+        return self._thread and self._thread.is_alive()
diff --git a/setup.py b/setup.py
index 8f88b8c..54341fa 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev156',
+        version='1.1b9.dev157',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
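
The start(), stop(), shutdown_now() and is_alive() methods restored above let an embedding application manage the worker's thread rather than calling run() directly. A sketch of that usage, assuming a BrozzlerWorker has already been constructed elsewhere with a frontier and whatever other arguments the deployment needs (the constructor is untouched by this patch):

    import time

    def brozzle_for_a_while(worker, seconds=60):
        # worker: an already-constructed brozzler.worker.BrozzlerWorker
        worker.start()    # spawns the "BrozzlerWorker" thread running worker.run
        try:
            time.sleep(seconds)
        finally:
            # raises ShutdownRequested inside the worker thread and joins it
            worker.stop()
        assert not worker.is_alive()
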
From 76b658747e890992a35dc92c175ea208c59a0bf6 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 6 Jan 2017 13:03:09 -0800
Subject: [PATCH 5/5] fix oversight including username/password in site config
 when starting a new job

---
 brozzler/job.py | 6 +++---
 setup.py        | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/brozzler/job.py b/brozzler/job.py
index a178884..ede81b8 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -75,8 +75,6 @@
     sites = []
     for seed_conf in job_conf["seeds"]:
         merged_conf = merge(seed_conf, job_conf)
-        if "login" in merged_conf and "metadata" in merged_conf:
-            merged_conf["metadata"]["login"] = merged_conf["login"]
         site = brozzler.Site(
                 job_id=job.id, seed=merged_conf["url"],
                 scope=merged_conf.get("scope"),
@@ -89,7 +87,9 @@
                 metadata=merged_conf.get("metadata"),
                 remember_outlinks=merged_conf.get("remember_outlinks"),
                 user_agent=merged_conf.get("user_agent"),
-                behavior_parameters=merged_conf.get("behavior_parameters"))
+                behavior_parameters=merged_conf.get("behavior_parameters"),
+                username=merged_conf.get("username"),
+                password=merged_conf.get("password"))
         sites.append(site)

     # insert all the sites into database before the job
diff --git a/setup.py b/setup.py
index 54341fa..2bbb018 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev157',
+        version='1.1b9.dev158',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
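
With the final patch applied, new_job() passes username and password from the merged job/seed configuration straight into each brozzler.Site, rather than routing the old login block through metadata. A sketch of a job configuration that would exercise this, written as the Python dict new_job() receives once the job's YAML has been parsed; the job id, seed URL and credential values are made up, and the frontier is assumed to be constructed elsewhere:

    import brozzler.job

    job_conf = {
        'id': 'login-test-job',      # illustrative
        'username': 'someuser',      # job-level settings are merged into every seed
        'password': 'secret',
        'seeds': [
            {'url': 'https://login.example.com/'},
            # credentials can also be set per seed, since each seed's own
            # settings are merged with the job-level ones
        ],
    }

    # frontier = ...  # e.g. a RethinkDbFrontier, set up elsewhere
    # brozzler.job.new_job(frontier, job_conf)
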