mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
generalized support for login doing automatic detection of login form on a page
This commit is contained in:
parent
bc6e0d243f
commit
86ac48d6c3
8 changed files with 213 additions and 48 deletions
|
@ -44,7 +44,8 @@ class ReachedLimit(Exception):
|
||||||
self.http_payload = http_payload
|
self.http_payload = http_payload
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))
|
return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % (
|
||||||
|
repr(self.warcprox_meta), repr(self.http_payload))
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__repr__()
|
return self.__repr__()
|
||||||
|
@ -169,6 +170,16 @@ def sleep(duration):
|
||||||
break
|
break
|
||||||
time.sleep(min(duration - elapsed, 0.5))
|
time.sleep(min(duration - elapsed, 0.5))
|
||||||
|
|
||||||
|
_jinja2_env = None
|
||||||
|
def jinja2_environment():
|
||||||
|
global _jinja2_env
|
||||||
|
if not _jinja2_env:
|
||||||
|
import jinja2, json
|
||||||
|
_jinja2_env = jinja2.Environment(
|
||||||
|
loader=jinja2.PackageLoader('brozzler', 'js-templates'))
|
||||||
|
_jinja2_env.filters['json'] = json.dumps
|
||||||
|
return _jinja2_env
|
||||||
|
|
||||||
from brozzler.site import Page, Site
|
from brozzler.site import Page, Site
|
||||||
from brozzler.worker import BrozzlerWorker
|
from brozzler.worker import BrozzlerWorker
|
||||||
from brozzler.robots import is_permitted_by_robots
|
from brozzler.robots import is_permitted_by_robots
|
||||||
|
|
|
@ -179,7 +179,6 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
||||||
|
|
||||||
def _handle_message(self, websock, json_message):
|
def _handle_message(self, websock, json_message):
|
||||||
self.logger.debug("%s", json_message)
|
|
||||||
message = json.loads(json_message)
|
message = json.loads(json_message)
|
||||||
if 'method' in message:
|
if 'method' in message:
|
||||||
if message['method'] == 'Page.loadEventFired':
|
if message['method'] == 'Page.loadEventFired':
|
||||||
|
@ -191,16 +190,15 @@ class WebsockReceiverThread(threading.Thread):
|
||||||
'%s console.%s %s', self.websock.url,
|
'%s console.%s %s', self.websock.url,
|
||||||
message['params']['message']['level'],
|
message['params']['message']['level'],
|
||||||
message['params']['message']['text'])
|
message['params']['message']['text'])
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("%s %s", message["method"], json_message)
|
# self.logger.debug("%s %s", message["method"], json_message)
|
||||||
elif 'result' in message:
|
elif 'result' in message:
|
||||||
if message['id'] in self._result_messages:
|
if message['id'] in self._result_messages:
|
||||||
self._result_messages[message['id']] = message
|
self._result_messages[message['id']] = message
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("%s", json_message)
|
# self.logger.debug("%s", json_message)
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("%s", json_message)
|
# self.logger.debug("%s", json_message)
|
||||||
|
|
||||||
|
|
||||||
class Browser:
|
class Browser:
|
||||||
'''
|
'''
|
||||||
|
@ -235,7 +233,6 @@ class Browser:
|
||||||
'''
|
'''
|
||||||
start = time.time()
|
start = time.time()
|
||||||
while True:
|
while True:
|
||||||
brozzler.sleep(0.5)
|
|
||||||
if callback():
|
if callback():
|
||||||
return
|
return
|
||||||
elapsed = time.time() - start
|
elapsed = time.time() - start
|
||||||
|
@ -243,6 +240,7 @@ class Browser:
|
||||||
raise BrowsingTimeout(
|
raise BrowsingTimeout(
|
||||||
'timed out after %.1fs waiting for: %s' % (
|
'timed out after %.1fs waiting for: %s' % (
|
||||||
elapsed, callback))
|
elapsed, callback))
|
||||||
|
brozzler.sleep(0.5)
|
||||||
|
|
||||||
def send_to_chrome(self, suppress_logging=False, **kwargs):
|
def send_to_chrome(self, suppress_logging=False, **kwargs):
|
||||||
msg_id = next(self._command_id)
|
msg_id = next(self._command_id)
|
||||||
|
@ -329,7 +327,8 @@ class Browser:
|
||||||
def browse_page(
|
def browse_page(
|
||||||
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
||||||
user_agent=None, behavior_parameters=None,
|
user_agent=None, behavior_parameters=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None):
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
|
username=None, password=None):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
|
@ -373,9 +372,11 @@ class Browser:
|
||||||
raise BrowsingException('browser is already busy browsing a page')
|
raise BrowsingException('browser is already busy browsing a page')
|
||||||
self.is_browsing = True
|
self.is_browsing = True
|
||||||
try:
|
try:
|
||||||
self.navigate_to_page(page_url, timeout=300)
|
self.navigate_to_page(
|
||||||
## if login_credentials:
|
page_url, extra_headers=extra_headers,
|
||||||
## self.try_login(login_credentials) (5 min?)
|
user_agent=user_agent, timeout=300)
|
||||||
|
if password:
|
||||||
|
self.try_login(username, password, timeout=300)
|
||||||
behavior_script = brozzler.behavior_script(
|
behavior_script = brozzler.behavior_script(
|
||||||
page_url, behavior_parameters)
|
page_url, behavior_parameters)
|
||||||
self.run_behavior(behavior_script, timeout=900)
|
self.run_behavior(behavior_script, timeout=900)
|
||||||
|
@ -450,7 +451,7 @@ __brzl_compileOutlinks(window).join('\n');
|
||||||
if message['result']['result']['value']:
|
if message['result']['result']['value']:
|
||||||
return frozenset(message['result']['result']['value'].split('\n'))
|
return frozenset(message['result']['result']['value'].split('\n'))
|
||||||
else:
|
else:
|
||||||
self._outlinks = frozenset()
|
return frozenset()
|
||||||
|
|
||||||
def screenshot(self, timeout=30):
|
def screenshot(self, timeout=30):
|
||||||
self.logger.info('taking screenshot')
|
self.logger.info('taking screenshot')
|
||||||
|
@ -523,6 +524,101 @@ __brzl_compileOutlinks(window).join('\n');
|
||||||
except BrowsingTimeout:
|
except BrowsingTimeout:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
TRY_LOGIN_JS_J2 = '''
|
||||||
|
var __brzl_tryLoginState = 'trying';
|
||||||
|
|
||||||
|
var __brzl_tryLogin = function() {
|
||||||
|
for (var i = 0; i < document.forms.length; i++) {
|
||||||
|
var form = document.forms[i];
|
||||||
|
if (form.method != 'post') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
var usernameField, passwordField;
|
||||||
|
for (var j = 0; j < form.elements.length; j++) {
|
||||||
|
var field = form.elements[j];
|
||||||
|
if (field.type == 'text') {
|
||||||
|
if (!usernameField) {
|
||||||
|
usernameField = field;
|
||||||
|
} else {
|
||||||
|
usernameField = undefined;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else if (field.type == 'password') {
|
||||||
|
if (!passwordField) {
|
||||||
|
passwordField = field;
|
||||||
|
} else {
|
||||||
|
passwordField = undefined;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else if (field.type == 'textarea') {
|
||||||
|
usernameField = undefined;
|
||||||
|
passwordField = undefined;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if (usernameField && passwordField) {
|
||||||
|
usernameField.value = {{username|json}};
|
||||||
|
passwordField.value = {{password|json}};
|
||||||
|
form.submit()
|
||||||
|
__brzl_tryLoginState = 'submitted-form';
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__brzl_tryLoginState = 'login-form-not-found';
|
||||||
|
};
|
||||||
|
|
||||||
|
__brzl_tryLogin();
|
||||||
|
'''
|
||||||
|
def try_login(self, username, password, timeout=300):
|
||||||
|
try_login_js = brozzler.jinja2_environment().from_string(
|
||||||
|
self.TRY_LOGIN_JS_J2).render(
|
||||||
|
username=username, password=password)
|
||||||
|
|
||||||
|
self.websock_thread.got_page_load_event = None
|
||||||
|
self.send_to_chrome(
|
||||||
|
method='Runtime.evaluate', suppress_logging=True,
|
||||||
|
params={'expression': try_login_js})
|
||||||
|
|
||||||
|
# wait for tryLogin to finish trying (should be very very quick)
|
||||||
|
start = time.time()
|
||||||
|
while True:
|
||||||
|
self.websock_thread.expect_result(self._command_id.peek())
|
||||||
|
msg_id = self.send_to_chrome(
|
||||||
|
method='Runtime.evaluate',
|
||||||
|
params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
|
||||||
|
try:
|
||||||
|
self._wait_for(
|
||||||
|
lambda: self.websock_thread.received_result(msg_id),
|
||||||
|
timeout=5)
|
||||||
|
msg = self.websock_thread.pop_result(msg_id)
|
||||||
|
if (msg and 'result' in msg
|
||||||
|
and 'result' in msg['result']):
|
||||||
|
result = msg['result']['result']['value']
|
||||||
|
if result == 'login-form-not-found':
|
||||||
|
# we're done
|
||||||
|
return
|
||||||
|
elif result in ('submitted-form', 'maybe-submitted-form'):
|
||||||
|
# wait for page load event below
|
||||||
|
self.logger.info(
|
||||||
|
'submitted a login form, waiting for another '
|
||||||
|
'page load event')
|
||||||
|
break
|
||||||
|
# else try again to get __brzl_tryLoginState
|
||||||
|
|
||||||
|
except BrowsingTimeout:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if time.time() - start > 30:
|
||||||
|
raise BrowsingException(
|
||||||
|
'timed out trying to check if tryLogin finished')
|
||||||
|
|
||||||
|
# if we get here, we submitted a form, now we wait for another page
|
||||||
|
# load event
|
||||||
|
self._wait_for(
|
||||||
|
lambda: self.websock_thread.got_page_load_event,
|
||||||
|
timeout=timeout)
|
||||||
|
|
||||||
class Counter:
|
class Counter:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.next_value = 0
|
self.next_value = 0
|
||||||
|
|
|
@ -126,6 +126,12 @@ def brozzle_page():
|
||||||
'json blob of parameters to populate the javascript behavior '
|
'json blob of parameters to populate the javascript behavior '
|
||||||
'template, e.g. {"parameter_username":"x",'
|
'template, e.g. {"parameter_username":"x",'
|
||||||
'"parameter_password":"y"}'))
|
'"parameter_password":"y"}'))
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--username', dest='username', default=None,
|
||||||
|
help='use this username to try to log in if a login form is found')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--password', dest='password', default=None,
|
||||||
|
help='use this password to try to log in if a login form is found')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None,
|
'--proxy', dest='proxy', default=None,
|
||||||
help='http proxy')
|
help='http proxy')
|
||||||
|
@ -145,7 +151,8 @@ def brozzle_page():
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
id=-1, seed=args.url, proxy=args.proxy,
|
id=-1, seed=args.url, proxy=args.proxy,
|
||||||
enable_warcprox_features=args.enable_warcprox_features,
|
enable_warcprox_features=args.enable_warcprox_features,
|
||||||
behavior_parameters=behavior_parameters)
|
behavior_parameters=behavior_parameters, username=args.username,
|
||||||
|
password=args.password)
|
||||||
page = brozzler.Page(url=args.url, site_id=site.id)
|
page = brozzler.Page(url=args.url, site_id=site.id)
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None)
|
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||||
|
|
||||||
|
@ -230,6 +237,12 @@ def brozzler_new_site():
|
||||||
'json blob of parameters to populate the javascript behavior '
|
'json blob of parameters to populate the javascript behavior '
|
||||||
'template, e.g. {"parameter_username":"x",'
|
'template, e.g. {"parameter_username":"x",'
|
||||||
'"parameter_password":"y"}'))
|
'"parameter_password":"y"}'))
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--username', dest='username', default=None,
|
||||||
|
help='use this username to try to log in if a login form is found')
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--password', dest='password', default=None,
|
||||||
|
help='use this password to try to log in if a login form is found')
|
||||||
_add_common_options(arg_parser)
|
_add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
|
@ -243,7 +256,8 @@ def brozzler_new_site():
|
||||||
warcprox_meta=json.loads(
|
warcprox_meta=json.loads(
|
||||||
args.warcprox_meta) if args.warcprox_meta else None,
|
args.warcprox_meta) if args.warcprox_meta else None,
|
||||||
behavior_parameters=json.loads(
|
behavior_parameters=json.loads(
|
||||||
args.behavior_parameters) if args.behavior_parameters else None)
|
args.behavior_parameters) if args.behavior_parameters else None,
|
||||||
|
username=args.username, password=args.password)
|
||||||
|
|
||||||
r = rethinkstuff.Rethinker(
|
r = rethinkstuff.Rethinker(
|
||||||
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
args.rethinkdb_servers.split(","), args.rethinkdb_db)
|
||||||
|
|
|
@ -95,7 +95,8 @@ class Site(brozzler.BaseDictable):
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||||
cookie_db=None, user_agent=None, behavior_parameters=None):
|
cookie_db=None, user_agent=None, behavior_parameters=None,
|
||||||
|
username=None, password=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@ -117,6 +118,8 @@ class Site(brozzler.BaseDictable):
|
||||||
self.cookie_db = cookie_db
|
self.cookie_db = cookie_db
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
self.behavior_parameters = behavior_parameters
|
self.behavior_parameters = behavior_parameters
|
||||||
|
self.username = username
|
||||||
|
self.password = password
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
|
|
@ -277,6 +277,7 @@ class BrozzlerWorker:
|
||||||
final_page_url, outlinks = browser.browse_page(
|
final_page_url, outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
behavior_parameters=site.behavior_parameters,
|
behavior_parameters=site.behavior_parameters,
|
||||||
|
username=site.username, password=site.password,
|
||||||
user_agent=site.user_agent,
|
user_agent=site.user_agent,
|
||||||
on_screenshot=_on_screenshot)
|
on_screenshot=_on_screenshot)
|
||||||
if final_page_url != page.url:
|
if final_page_url != page.url:
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev145',
|
version='1.1b9.dev146',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -69,6 +69,7 @@ setuptools.setup(
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'psutil==4.3.0',
|
'psutil==4.3.0',
|
||||||
'cerberus==1.0.1',
|
'cerberus==1.0.1',
|
||||||
|
'jinja2',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||||
|
@ -80,6 +81,7 @@ setuptools.setup(
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
'Programming Language :: Python :: 3.4',
|
'Programming Language :: Python :: 3.4',
|
||||||
|
'Programming Language :: Python :: 3.5',
|
||||||
'Topic :: Internet :: WWW/HTTP',
|
'Topic :: Internet :: WWW/HTTP',
|
||||||
'Topic :: System :: Archiving',
|
'Topic :: System :: Archiving',
|
||||||
])
|
])
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
I'm a plain text file.
|
|
|
@ -62,14 +62,13 @@ def test_httpd(httpd):
|
||||||
deduplication.
|
deduplication.
|
||||||
'''
|
'''
|
||||||
payload1 = content2 = None
|
payload1 = content2 = None
|
||||||
with urllib.request.urlopen(
|
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
||||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
with urllib.request.urlopen(url) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload1 = response.read()
|
payload1 = response.read()
|
||||||
assert payload1
|
assert payload1
|
||||||
|
|
||||||
with urllib.request.urlopen(
|
with urllib.request.urlopen(url) as response:
|
||||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload2 = response.read()
|
payload2 = response.read()
|
||||||
assert payload2
|
assert payload2
|
||||||
|
@ -101,13 +100,14 @@ def test_services_up():
|
||||||
def test_brozzle_site(httpd):
|
def test_brozzle_site(httpd):
|
||||||
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
seed='http://localhost:%s/' % httpd.server_port,
|
seed='http://localhost:%s/site1/' % httpd.server_port,
|
||||||
proxy='localhost:8000', enable_warcprox_features=True,
|
proxy='localhost:8000', enable_warcprox_features=True,
|
||||||
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
||||||
|
|
||||||
# the two pages we expect to be crawled
|
# the two pages we expect to be crawled
|
||||||
page1 = 'http://localhost:%s/' % httpd.server_port
|
page1 = 'http://localhost:%s/site1/' % httpd.server_port
|
||||||
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
|
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
||||||
|
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
|
||||||
|
|
||||||
# so we can examine rethinkdb before it does anything
|
# so we can examine rethinkdb before it does anything
|
||||||
try:
|
try:
|
||||||
|
@ -131,19 +131,18 @@ def test_brozzle_site(httpd):
|
||||||
|
|
||||||
# check that we got the two pages we expected
|
# check that we got the two pages we expected
|
||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 3
|
assert len(pages) == 2
|
||||||
assert {page.url for page in pages} == {
|
assert {page.url for page in pages} == {
|
||||||
'http://localhost:%s/' % httpd.server_port,
|
'http://localhost:%s/site1/' % httpd.server_port,
|
||||||
'http://localhost:%s/robots.txt' % httpd.server_port,
|
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
|
||||||
'http://localhost:%s/file1.txt' % httpd.server_port}
|
|
||||||
|
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
captures = r.table('captures').filter({'test_id':test_id}).run()
|
captures = r.table('captures').filter({'test_id':test_id}).run()
|
||||||
captures_by_url = {
|
captures_by_url = {
|
||||||
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
|
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
|
||||||
|
assert robots in captures_by_url
|
||||||
assert page1 in captures_by_url
|
assert page1 in captures_by_url
|
||||||
assert '%srobots.txt' % page1 in captures_by_url
|
|
||||||
assert page2 in captures_by_url
|
assert page2 in captures_by_url
|
||||||
assert 'screenshot:%s' % page1 in captures_by_url
|
assert 'screenshot:%s' % page1 in captures_by_url
|
||||||
assert 'thumbnail:%s' % page1 in captures_by_url
|
assert 'thumbnail:%s' % page1 in captures_by_url
|
||||||
|
@ -153,7 +152,7 @@ def test_brozzle_site(httpd):
|
||||||
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||||
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
||||||
expected_payload = open(os.path.join(
|
expected_payload = open(os.path.join(
|
||||||
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
|
||||||
assert requests.get(wb_url).content == expected_payload
|
assert requests.get(wb_url).content == expected_payload
|
||||||
|
|
||||||
def test_warcprox_selection(httpd):
|
def test_warcprox_selection(httpd):
|
||||||
|
@ -163,11 +162,12 @@ def test_warcprox_selection(httpd):
|
||||||
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
|
|
||||||
# the two pages we expect to be crawled
|
# the two pages we expect to be crawled
|
||||||
page1 = 'http://localhost:%s/' % httpd.server_port
|
page1 = 'http://localhost:%s/site1/' % httpd.server_port
|
||||||
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
|
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
||||||
|
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
|
||||||
|
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
seed='http://localhost:%s/' % httpd.server_port,
|
seed='http://localhost:%s/site1/' % httpd.server_port,
|
||||||
enable_warcprox_features=True,
|
enable_warcprox_features=True,
|
||||||
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
||||||
|
|
||||||
|
@ -199,19 +199,18 @@ def test_warcprox_selection(httpd):
|
||||||
|
|
||||||
# check that we got the two pages we expected
|
# check that we got the two pages we expected
|
||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 3
|
assert len(pages) == 2
|
||||||
assert {page.url for page in pages} == {
|
assert {page.url for page in pages} == {
|
||||||
'http://localhost:%s/' % httpd.server_port,
|
'http://localhost:%s/site1/' % httpd.server_port,
|
||||||
'http://localhost:%s/robots.txt' % httpd.server_port,
|
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
|
||||||
'http://localhost:%s/file1.txt' % httpd.server_port}
|
|
||||||
|
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
captures = r.table('captures').filter({'test_id':test_id}).run()
|
captures = r.table('captures').filter({'test_id':test_id}).run()
|
||||||
captures_by_url = {
|
captures_by_url = {
|
||||||
c['url']:c for c in captures if c['http_method'] != 'HEAD'}
|
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
|
||||||
|
assert robots in captures_by_url
|
||||||
assert page1 in captures_by_url
|
assert page1 in captures_by_url
|
||||||
assert '%srobots.txt' % page1 in captures_by_url
|
|
||||||
assert page2 in captures_by_url
|
assert page2 in captures_by_url
|
||||||
assert 'screenshot:%s' % page1 in captures_by_url
|
assert 'screenshot:%s' % page1 in captures_by_url
|
||||||
assert 'thumbnail:%s' % page1 in captures_by_url
|
assert 'thumbnail:%s' % page1 in captures_by_url
|
||||||
|
@ -221,14 +220,13 @@ def test_warcprox_selection(httpd):
|
||||||
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||||
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
||||||
expected_payload = open(os.path.join(
|
expected_payload = open(os.path.join(
|
||||||
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
|
||||||
assert requests.get(
|
assert requests.get(wb_url).content == expected_payload
|
||||||
wb_url, allow_redirects=False).content == expected_payload
|
|
||||||
|
|
||||||
def test_obey_robots(httpd):
|
def test_obey_robots(httpd):
|
||||||
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
seed='http://localhost:%s/' % httpd.server_port,
|
seed='http://localhost:%s/site1/' % httpd.server_port,
|
||||||
proxy='localhost:8000', enable_warcprox_features=True,
|
proxy='localhost:8000', enable_warcprox_features=True,
|
||||||
user_agent='im a badbot', # robots.txt blocks badbot
|
user_agent='im a badbot', # robots.txt blocks badbot
|
||||||
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
||||||
|
@ -256,11 +254,11 @@ def test_obey_robots(httpd):
|
||||||
site = frontier.site(site.id)
|
site = frontier.site(site.id)
|
||||||
assert site.status == 'FINISHED'
|
assert site.status == 'FINISHED'
|
||||||
|
|
||||||
# check that we got the two pages we expected
|
# check that only the one page is in rethinkdb
|
||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 1
|
assert len(pages) == 1
|
||||||
assert {page.url for page in pages} == {
|
assert {page.url for page in pages} == {
|
||||||
'http://localhost:%s/' % httpd.server_port}
|
'http://localhost:%s/site1/' % httpd.server_port}
|
||||||
|
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
|
@ -276,3 +274,44 @@ def test_obey_robots(httpd):
|
||||||
os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
|
os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
|
||||||
assert requests.get(
|
assert requests.get(
|
||||||
wb_url, allow_redirects=False).content == expected_payload
|
wb_url, allow_redirects=False).content == expected_payload
|
||||||
|
|
||||||
|
def test_login(httpd):
|
||||||
|
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
|
site = brozzler.Site(
|
||||||
|
seed='http://localhost:%s/site2/' % httpd.server_port,
|
||||||
|
proxy='localhost:8000', enable_warcprox_features=True,
|
||||||
|
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}},
|
||||||
|
username='test_username', password='test_password')
|
||||||
|
|
||||||
|
r = rethinkstuff.Rethinker('localhost', db='brozzler')
|
||||||
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
|
brozzler.new_site(frontier, site)
|
||||||
|
|
||||||
|
# the site should be brozzled fairly quickly
|
||||||
|
start = time.time()
|
||||||
|
while site.status != 'FINISHED' and time.time() - start < 300:
|
||||||
|
time.sleep(0.5)
|
||||||
|
site = frontier.site(site.id)
|
||||||
|
assert site.status == 'FINISHED'
|
||||||
|
|
||||||
|
# take a look at the captures table
|
||||||
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
|
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
|
||||||
|
captures = list(r.table('captures').filter(
|
||||||
|
{'test_id':test_id}).order_by('timestamp').run())
|
||||||
|
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
|
||||||
|
|
||||||
|
# there are several forms in htdocs/site2/login.html but only one
|
||||||
|
# that brozzler's heuristic should match and try to submit, and it has
|
||||||
|
# action='00', so we can check for that here
|
||||||
|
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url
|
||||||
|
|
||||||
|
# sanity check the rest of the crawl
|
||||||
|
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
|
||||||
|
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
||||||
|
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
||||||
|
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
||||||
|
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||||
|
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||||
|
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue