diff --git a/brozzler/__init__.py b/brozzler/__init__.py index af6ec3d..46a0704 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -44,7 +44,8 @@ class ReachedLimit(Exception): self.http_payload = http_payload def __repr__(self): - return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload)) + return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % ( + repr(self.warcprox_meta), repr(self.http_payload)) def __str__(self): return self.__repr__() @@ -169,6 +170,16 @@ def sleep(duration): break time.sleep(min(duration - elapsed, 0.5)) +_jinja2_env = None +def jinja2_environment(): + global _jinja2_env + if not _jinja2_env: + import jinja2, json + _jinja2_env = jinja2.Environment( + loader=jinja2.PackageLoader('brozzler', 'js-templates')) + _jinja2_env.filters['json'] = json.dumps + return _jinja2_env + from brozzler.site import Page, Site from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots diff --git a/brozzler/browser.py b/brozzler/browser.py index d412188..db54130 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -179,7 +179,6 @@ class WebsockReceiverThread(threading.Thread): self.websock.send(json.dumps(dict(id=0, method='Debugger.resume'))) def _handle_message(self, websock, json_message): - self.logger.debug("%s", json_message) message = json.loads(json_message) if 'method' in message: if message['method'] == 'Page.loadEventFired': @@ -191,16 +190,15 @@ class WebsockReceiverThread(threading.Thread): '%s console.%s %s', self.websock.url, message['params']['message']['level'], message['params']['message']['text']) - # else: - # self.logger.debug("%s %s", message["method"], json_message) + # else: + # self.logger.debug("%s %s", message["method"], json_message) elif 'result' in message: if message['id'] in self._result_messages: self._result_messages[message['id']] = message - # else: - # self.logger.debug("%s", json_message) - # else: - # 
self.logger.debug("%s", json_message) - + # else: + # self.logger.debug("%s", json_message) + # else: + # self.logger.debug("%s", json_message) class Browser: ''' @@ -235,7 +233,6 @@ class Browser: ''' start = time.time() while True: - brozzler.sleep(0.5) if callback(): return elapsed = time.time() - start @@ -243,6 +240,7 @@ class Browser: raise BrowsingTimeout( 'timed out after %.1fs waiting for: %s' % ( elapsed, callback)) + brozzler.sleep(0.5) def send_to_chrome(self, suppress_logging=False, **kwargs): msg_id = next(self._command_id) @@ -329,7 +327,8 @@ class Browser: def browse_page( self, page_url, ignore_cert_errors=False, extra_headers=None, user_agent=None, behavior_parameters=None, - on_request=None, on_response=None, on_screenshot=None): + on_request=None, on_response=None, on_screenshot=None, + username=None, password=None): ''' Browses page in browser. @@ -373,9 +372,11 @@ class Browser: raise BrowsingException('browser is already busy browsing a page') self.is_browsing = True try: - self.navigate_to_page(page_url, timeout=300) - ## if login_credentials: - ## self.try_login(login_credentials) (5 min?) 
+ self.navigate_to_page( + page_url, extra_headers=extra_headers, + user_agent=user_agent, timeout=300) + if password: + self.try_login(username, password, timeout=300) behavior_script = brozzler.behavior_script( page_url, behavior_parameters) self.run_behavior(behavior_script, timeout=900) @@ -450,7 +451,7 @@ __brzl_compileOutlinks(window).join('\n'); if message['result']['result']['value']: return frozenset(message['result']['result']['value'].split('\n')) else: - self._outlinks = frozenset() + return frozenset() def screenshot(self, timeout=30): self.logger.info('taking screenshot') @@ -523,6 +524,101 @@ __brzl_compileOutlinks(window).join('\n'); except BrowsingTimeout: pass + TRY_LOGIN_JS_J2 = ''' +var __brzl_tryLoginState = 'trying'; + +var __brzl_tryLogin = function() { + for (var i = 0; i < document.forms.length; i++) { + var form = document.forms[i]; + if (form.method != 'post') { + continue; + } + var usernameField, passwordField; + for (var j = 0; j < form.elements.length; j++) { + var field = form.elements[j]; + if (field.type == 'text') { + if (!usernameField) { + usernameField = field; + } else { + usernameField = undefined; + break; + } + } else if (field.type == 'password') { + if (!passwordField) { + passwordField = field; + } else { + passwordField = undefined; + break; + } + } else if (field.type == 'textarea') { + usernameField = undefined; + passwordField = undefined; + break; + } + + } + if (usernameField && passwordField) { + usernameField.value = {{username|json}}; + passwordField.value = {{password|json}}; + form.submit() + __brzl_tryLoginState = 'submitted-form'; + return + } + } + __brzl_tryLoginState = 'login-form-not-found'; +}; + +__brzl_tryLogin(); +''' + def try_login(self, username, password, timeout=300): + try_login_js = brozzler.jinja2_environment().from_string( + self.TRY_LOGIN_JS_J2).render( + username=username, password=password) + + self.websock_thread.got_page_load_event = None + self.send_to_chrome( + 
method='Runtime.evaluate', suppress_logging=True, + params={'expression': try_login_js}) + + # wait for tryLogin to finish trying (should be very very quick) + start = time.time() + while True: + self.websock_thread.expect_result(self._command_id.peek()) + msg_id = self.send_to_chrome( + method='Runtime.evaluate', + params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'}) + try: + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=5) + msg = self.websock_thread.pop_result(msg_id) + if (msg and 'result' in msg + and 'result' in msg['result']): + result = msg['result']['result']['value'] + if result == 'login-form-not-found': + # we're done + return + elif result in ('submitted-form', 'maybe-submitted-form'): + # wait for page load event below + self.logger.info( + 'submitted a login form, waiting for another ' + 'page load event') + break + # else try again to get __brzl_tryLoginState + + except BrowsingTimeout: + pass + + if time.time() - start > 30: + raise BrowsingException( + 'timed out trying to check if tryLogin finished') + + # if we get here, we submitted a form, now we wait for another page + # load event + self._wait_for( + lambda: self.websock_thread.got_page_load_event, + timeout=timeout) + class Counter: def __init__(self): self.next_value = 0 diff --git a/brozzler/cli.py b/brozzler/cli.py index 7de298e..02bca24 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -126,6 +126,12 @@ def brozzle_page(): 'json blob of parameters to populate the javascript behavior ' 'template, e.g. 
{"parameter_username":"x",' '"parameter_password":"y"}')) + arg_parser.add_argument( + '--username', dest='username', default=None, + help='use this username to try to log in if a login form is found') + arg_parser.add_argument( + '--password', dest='password', default=None, + help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') @@ -145,7 +151,8 @@ def brozzle_page(): site = brozzler.Site( id=-1, seed=args.url, proxy=args.proxy, enable_warcprox_features=args.enable_warcprox_features, - behavior_parameters=behavior_parameters) + behavior_parameters=behavior_parameters, username=args.username, + password=args.password) page = brozzler.Page(url=args.url, site_id=site.id) worker = brozzler.BrozzlerWorker(frontier=None) @@ -230,6 +237,12 @@ def brozzler_new_site(): 'json blob of parameters to populate the javascript behavior ' 'template, e.g. {"parameter_username":"x",' '"parameter_password":"y"}')) + arg_parser.add_argument( + '--username', dest='username', default=None, + help='use this username to try to log in if a login form is found') + arg_parser.add_argument( + '--password', dest='password', default=None, + help='use this password to try to log in if a login form is found') _add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) @@ -243,7 +256,8 @@ def brozzler_new_site(): warcprox_meta=json.loads( args.warcprox_meta) if args.warcprox_meta else None, behavior_parameters=json.loads( - args.behavior_parameters) if args.behavior_parameters else None) + args.behavior_parameters) if args.behavior_parameters else None, + username=args.username, password=args.password) r = rethinkstuff.Rethinker( args.rethinkdb_servers.split(","), args.rethinkdb_db) diff --git a/brozzler/site.py b/brozzler/site.py index 8ff692a..d0c0f48 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -95,7 +95,8 @@ class Site(brozzler.BaseDictable): status="ACTIVE", 
claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, - cookie_db=None, user_agent=None, behavior_parameters=None): + cookie_db=None, user_agent=None, behavior_parameters=None, + username=None, password=None): self.seed = seed self.id = id @@ -117,6 +118,8 @@ class Site(brozzler.BaseDictable): self.cookie_db = cookie_db self.user_agent = user_agent self.behavior_parameters = behavior_parameters + self.username = username + self.password = password self.scope = scope or {} if not "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index 49626ad..90a6442 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -277,6 +277,7 @@ class BrozzlerWorker: final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.behavior_parameters, + username=site.username, password=site.password, user_agent=site.user_agent, on_screenshot=_on_screenshot) if final_page_url != page.url: diff --git a/setup.py b/setup.py index de4b25b..555505d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev145', + version='1.1b9.dev146', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -69,6 +69,7 @@ setuptools.setup( 'rethinkdb>=2.3,<2.4', 'psutil==4.3.0', 'cerberus==1.0.1', + 'jinja2', ], extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], @@ -80,6 +81,7 @@ setuptools.setup( 'Environment :: Console', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Internet :: WWW/HTTP', 'Topic :: System :: Archiving', ]) diff --git a/tests/htdocs/file1.txt b/tests/htdocs/file1.txt deleted file mode 100644 index d4a2f1c..0000000 --- a/tests/htdocs/file1.txt +++ /dev/null @@ 
-1 +0,0 @@ -I'm a plain text file. diff --git a/tests/test_cluster.py b/tests/test_cluster.py index ef4c51a..829790b 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -62,14 +62,13 @@ def test_httpd(httpd): deduplication. ''' payload1 = content2 = None - with urllib.request.urlopen( - 'http://localhost:%s/file1.txt' % httpd.server_port) as response: + url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port + with urllib.request.urlopen(url) as response: assert response.status == 200 payload1 = response.read() assert payload1 - with urllib.request.urlopen( - 'http://localhost:%s/file1.txt' % httpd.server_port) as response: + with urllib.request.urlopen(url) as response: assert response.status == 200 payload2 = response.read() assert payload2 @@ -101,13 +100,14 @@ def test_services_up(): def test_brozzle_site(httpd): test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat() site = brozzler.Site( - seed='http://localhost:%s/' % httpd.server_port, + seed='http://localhost:%s/site1/' % httpd.server_port, proxy='localhost:8000', enable_warcprox_features=True, warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}}) # the two pages we expect to be crawled - page1 = 'http://localhost:%s/' % httpd.server_port - page2 = 'http://localhost:%s/file1.txt' % httpd.server_port + page1 = 'http://localhost:%s/site1/' % httpd.server_port + page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port + robots = 'http://localhost:%s/robots.txt' % httpd.server_port # so we can examine rethinkdb before it does anything try: @@ -131,19 +131,18 @@ def test_brozzle_site(httpd): # check that we got the two pages we expected pages = list(frontier.site_pages(site.id)) - assert len(pages) == 3 + assert len(pages) == 2 assert {page.url for page in pages} == { - 'http://localhost:%s/' % httpd.server_port, - 'http://localhost:%s/robots.txt' % httpd.server_port, - 'http://localhost:%s/file1.txt' % httpd.server_port} + 
'http://localhost:%s/site1/' % httpd.server_port, + 'http://localhost:%s/site1/file1.txt' % httpd.server_port} time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table captures = r.table('captures').filter({'test_id':test_id}).run() captures_by_url = { c['url']: c for c in captures if c['http_method'] != 'HEAD'} + assert robots in captures_by_url assert page1 in captures_by_url - assert '%srobots.txt' % page1 in captures_by_url assert page2 in captures_by_url assert 'screenshot:%s' % page1 in captures_by_url assert 'thumbnail:%s' % page1 in captures_by_url @@ -153,7 +152,7 @@ def test_brozzle_site(httpd): t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2) expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read() + os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() assert requests.get(wb_url).content == expected_payload def test_warcprox_selection(httpd): @@ -163,11 +162,12 @@ def test_warcprox_selection(httpd): test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat() # the two pages we expect to be crawled - page1 = 'http://localhost:%s/' % httpd.server_port - page2 = 'http://localhost:%s/file1.txt' % httpd.server_port + page1 = 'http://localhost:%s/site1/' % httpd.server_port + page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port + robots = 'http://localhost:%s/robots.txt' % httpd.server_port site = brozzler.Site( - seed='http://localhost:%s/' % httpd.server_port, + seed='http://localhost:%s/site1/' % httpd.server_port, enable_warcprox_features=True, warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}}) @@ -199,19 +199,18 @@ def test_warcprox_selection(httpd): # check that we got the two pages we expected pages = list(frontier.site_pages(site.id)) - assert len(pages) == 3 + assert len(pages) == 2 assert {page.url for page in 
pages} == { - 'http://localhost:%s/' % httpd.server_port, - 'http://localhost:%s/robots.txt' % httpd.server_port, - 'http://localhost:%s/file1.txt' % httpd.server_port} + 'http://localhost:%s/site1/' % httpd.server_port, + 'http://localhost:%s/site1/file1.txt' % httpd.server_port} time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table captures = r.table('captures').filter({'test_id':test_id}).run() captures_by_url = { - c['url']:c for c in captures if c['http_method'] != 'HEAD'} + c['url']: c for c in captures if c['http_method'] != 'HEAD'} + assert robots in captures_by_url assert page1 in captures_by_url - assert '%srobots.txt' % page1 in captures_by_url assert page2 in captures_by_url assert 'screenshot:%s' % page1 in captures_by_url assert 'thumbnail:%s' % page1 in captures_by_url @@ -221,14 +220,13 @@ def test_warcprox_selection(httpd): t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S') wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2) expected_payload = open(os.path.join( - os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read() - assert requests.get( - wb_url, allow_redirects=False).content == expected_payload + os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read() + assert requests.get(wb_url).content == expected_payload def test_obey_robots(httpd): test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() site = brozzler.Site( - seed='http://localhost:%s/' % httpd.server_port, + seed='http://localhost:%s/site1/' % httpd.server_port, proxy='localhost:8000', enable_warcprox_features=True, user_agent='im a badbot', # robots.txt blocks badbot warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}}) @@ -256,11 +254,11 @@ def test_obey_robots(httpd): site = frontier.site(site.id) assert site.status == 'FINISHED' - # check that we got the two pages we expected + # check that only the one page is in rethinkdb pages = 
list(frontier.site_pages(site.id)) assert len(pages) == 1 assert {page.url for page in pages} == { - 'http://localhost:%s/' % httpd.server_port} + 'http://localhost:%s/site1/' % httpd.server_port} # take a look at the captures table time.sleep(2) # in case warcprox hasn't finished processing urls @@ -276,3 +274,44 @@ os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read() assert requests.get( wb_url, allow_redirects=False).content == expected_payload + +def test_login(httpd): + test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() + site = brozzler.Site( + seed='http://localhost:%s/site2/' % httpd.server_port, + proxy='localhost:8000', enable_warcprox_features=True, + warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}}, + username='test_username', password='test_password') + + r = rethinkstuff.Rethinker('localhost', db='brozzler') + frontier = brozzler.RethinkDbFrontier(r) + brozzler.new_site(frontier, site) + + # the site should be brozzled fairly quickly + start = time.time() + while site.status != 'FINISHED' and time.time() - start < 300: + time.sleep(0.5) + site = frontier.site(site.id) + assert site.status == 'FINISHED' + + # take a look at the captures table + time.sleep(2) # in case warcprox hasn't finished processing urls + robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port + captures = list(r.table('captures').filter( + {'test_id':test_id}).order_by('timestamp').run()) + meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures] + + # there are several forms in htdocs/site2/login.html but only one + # that brozzler's heuristic should match and try to submit, and it has + # action='00', so we can check for that here + assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url + + # sanity check the rest of the crawl + assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url + assert ('GET http://localhost:%s/site2/' % 
httpd.server_port) in meth_url + assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url + assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url + assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url + assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url + assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url +