generalized support for login doing automatic detection of login form on a page

This commit is contained in:
Noah Levitt 2016-12-19 17:30:09 -08:00
parent bc6e0d243f
commit 86ac48d6c3
8 changed files with 213 additions and 48 deletions

View file

@ -62,14 +62,13 @@ def test_httpd(httpd):
deduplication.
'''
payload1 = content2 = None
with urllib.request.urlopen(
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload1 = response.read()
assert payload1
with urllib.request.urlopen(
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload2 = response.read()
assert payload2
@ -101,13 +100,14 @@ def test_services_up():
def test_brozzle_site(httpd):
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
# so we can examine rethinkdb before it does anything
try:
@ -131,19 +131,18 @@ def test_brozzle_site(httpd):
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 3
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert robots in captures_by_url
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
@ -153,7 +152,7 @@ def test_brozzle_site(httpd):
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
@ -163,11 +162,12 @@ def test_warcprox_selection(httpd):
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
@ -199,19 +199,18 @@ def test_warcprox_selection(httpd):
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 3
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {
c['url']:c for c in captures if c['http_method'] != 'HEAD'}
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert robots in captures_by_url
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
@ -221,14 +220,13 @@ def test_warcprox_selection(httpd):
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
user_agent='im a badbot', # robots.txt blocks badbot
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
@ -256,11 +254,11 @@ def test_obey_robots(httpd):
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the two pages we expected
# check that only the one page is in rethinkdb
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port}
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
@ -276,3 +274,44 @@ def test_obey_robots(httpd):
os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload
def test_login(httpd):
    '''
    Brozzles site2 with a username and password configured on the site and
    checks, via the warcprox captures table, that the expected login form
    was submitted and that the rest of the crawl was captured.

    `httpd` is the test fixture serving the htdocs tree; only its
    `server_port` attribute is used here.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    # every url asserted on below hangs off this prefix
    base_url = 'http://localhost:%s' % httpd.server_port
    site = brozzler.Site(
            seed='%s/site2/' % base_url,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}},
            username='test_username', password='test_password')

    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly; poll for up to 5 minutes
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    captures = list(r.table('captures').filter(
            {'test_id':test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST %s/site2/00' % base_url) in meth_url

    # sanity check the rest of the crawl
    expected_captures = [
        'GET %s/robots.txt' % base_url,
        'GET %s/site2/' % base_url,
        'WARCPROX_WRITE_RECORD screenshot:%s/site2/' % base_url,
        'WARCPROX_WRITE_RECORD thumbnail:%s/site2/' % base_url,
        'GET %s/site2/login.html' % base_url,
        'WARCPROX_WRITE_RECORD screenshot:%s/site2/login.html' % base_url,
        'WARCPROX_WRITE_RECORD thumbnail:%s/site2/login.html' % base_url,
    ]
    for expected in expected_captures:
        assert expected in meth_url