working on basic integration tests

Noah Levitt 2016-10-13 17:12:35 -07:00
parent ed8b937277
commit 56e651baeb
3 changed files with 83 additions and 18 deletions


@@ -241,6 +241,15 @@ class RethinkDbFrontier:
         else:
             return None
 
+    def site(self, id):
+        if id is None:
+            return None
+        result = self.r.table("sites").get(id).run()
+        if result:
+            return brozzler.Site(**result)
+        else:
+            return None
+
     def honor_stop_request(self, job_id):
         """Raises brozzler.CrawlJobStopped if stop has been requested."""
         job = self.job(job_id)
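
The new site() accessor mirrors the existing job()-style lookups and is what lets the integration test below poll crawl progress. A minimal usage sketch, assuming a local rethinkdb with the brozzler db and the same rethinkstuff/brozzler APIs used elsewhere in this diff (site_id is a placeholder):

import time
import brozzler
import rethinkstuff

r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)

site = frontier.site(site_id)          # site_id stands in for a real site id
while site and site.status != 'FINISHED':
    time.sleep(0.5)
    site = frontier.site(site.id)      # re-read the current row from rethinkdb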


@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b7.dev95',
+        version='1.1b7.dev96',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',


@@ -1,8 +1,7 @@
 #!/usr/bin/env python
 '''
-cluster-integration-tests.py - integration tests for a brozzler cluster,
-expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
-running already
+test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
+warcprox, pywb, rethinkdb and other dependencies to be running already
 
 Copyright (C) 2016 Internet Archive
@@ -26,6 +25,10 @@ import urllib.request
 import os
 import socket
 import rethinkstuff
+import time
+import brozzler
+import datetime
+import requests
 
 @pytest.fixture(scope='module')
 def httpd(request):
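
The httpd fixture appears above only as context; its implementation is not part of this hunk. As a rough sketch of what such a fixture might look like, assuming a module-scoped http.server serving tests/htdocs on an ephemeral port (the real fixture in the repository may differ):

import http.server
import os
import threading
import pytest

@pytest.fixture(scope='module')
def httpd(request):
    # SimpleHTTPRequestHandler serves the current working directory
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
    request.addfinalizer(fin)

    return httpd    # tests build urls from httpd.server_port
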
@@ -53,13 +56,13 @@ def test_httpd(httpd):
     '''
     payload1 = content2 = None
     with urllib.request.urlopen(
-            'http://localhost:%s/' % httpd.server_port) as response:
+            'http://localhost:%s/file1.txt' % httpd.server_port) as response:
         assert response.status == 200
         payload1 = response.read()
         assert payload1
 
     with urllib.request.urlopen(
-            'http://localhost:%s/' % httpd.server_port) as response:
+            'http://localhost:%s/file1.txt' % httpd.server_port) as response:
         assert response.status == 200
         payload2 = response.read()
         assert payload2
@@ -68,21 +71,74 @@ def test_httpd(httpd):
 def test_services_up():
     '''Check that the expected services are up and running.'''
 
-    # check that warcprox is listening
-    with socket.socket() as s:
-        # if the connect fails an exception is raised and the test fails
-        s.connect(('localhost', 8000))
-
-    ### # check that pywb is listening
-    ### with socket.socket() as s:
-    ###     # if the connect fails an exception is raised and the test fails
-    ###     s.connect(('localhost', 8880))
-
     # check that rethinkdb is listening and looks sane
     r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
     tbls = r.table_list().run()
     assert len(tbls) > 10
 
-def test_brozzle_site(httpd):
-    pass
+    # check that warcprox is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8000))
+
+    # check that pywb is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8880))
+
+    # check that brozzler webconsole is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8881))
+
+def test_brozzle_site(httpd):
+    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # the two pages we expect to be crawled
+    page1 = 'http://localhost:%s/' % httpd.server_port
+    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
+
+    assert site.id is None
+    r = rethinkstuff.Rethinker('localhost', db='brozzler')
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.new_site(frontier, site)
+    assert site.id is not None
+    assert len(list(frontier.site_pages(site.id))) == 1
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 120:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 2
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/file1.txt' % httpd.server_port}
+
+    # take a look at the captures table
+    captures = r.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {c['url']: c for c in captures if c['method'] != 'HEAD'}
+    assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
+    assert page2 in captures_by_url
+    assert 'youtube-dl:%s' % page1 in captures_by_url
+    assert 'youtube-dl:%s' % page2 in captures_by_url
+    assert 'screenshot:%s' % page1 in captures_by_url
+    assert 'screenshot:%s' % page2 in captures_by_url
+    assert 'thumbnail:%s' % page1 in captures_by_url
+    assert 'thumbnail:%s' % page2 in captures_by_url
+
+    # check pywb
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
+    expected_payload = open(os.path.join(
+        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
+    assert requests.get(wb_url).content == expected_payload
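
The final assertion replays page2 through pywb. The replay URL follows pywb's usual <collection>/<14-digit timestamp>/<url> pattern; a small illustration with made-up values (the real test derives the timestamp from the captures table and the port from the httpd fixture):

import datetime

capture_time = datetime.datetime(2016, 10, 13, 17, 12, 35)    # hypothetical capture time
page2 = 'http://localhost:8888/file1.txt'          # 8888 stands in for httpd.server_port

t14 = capture_time.strftime('%Y%m%d%H%M%S')        # '20161013171235'
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
# http://localhost:8880/brozzler/20161013171235/http://localhost:8888/file1.txt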