mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 15:25:59 -04:00
working on basic integration tests
This commit is contained in:
parent
ed8b937277
commit
56e651baeb
@ -241,6 +241,15 @@ class RethinkDbFrontier:
|
||||
else:
|
||||
return None
|
||||
|
||||
def site(self, id):
    """Fetch a single site from the "sites" table.

    Returns a brozzler.Site built from the stored document, or None when
    `id` is None or the lookup yields a falsy result.
    """
    if id is None:
        return None
    doc = self.r.table("sites").get(id).run()
    return brozzler.Site(**doc) if doc else None
|
||||
|
||||
def honor_stop_request(self, job_id):
|
||||
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
||||
job = self.job(job_id)
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b7.dev95',
|
||||
version='1.1b7.dev96',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
cluster-integration-tests.py - integration tests for a brozzler cluster,
|
||||
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
|
||||
running already
|
||||
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
|
||||
warcprox, pywb, rethinkdb and other dependencies to be running already
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
@ -26,6 +25,10 @@ import urllib.request
|
||||
import os
|
||||
import socket
|
||||
import rethinkstuff
|
||||
import time
|
||||
import brozzler
|
||||
import datetime
|
||||
import requests
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def httpd(request):
|
||||
@ -53,13 +56,13 @@ def test_httpd(httpd):
|
||||
'''
|
||||
payload1 = content2 = None
|
||||
with urllib.request.urlopen(
|
||||
'http://localhost:%s/' % httpd.server_port) as response:
|
||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||
assert response.status == 200
|
||||
payload1 = response.read()
|
||||
assert payload1
|
||||
|
||||
with urllib.request.urlopen(
|
||||
'http://localhost:%s/' % httpd.server_port) as response:
|
||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||
assert response.status == 200
|
||||
payload2 = response.read()
|
||||
assert payload2
|
||||
@ -68,21 +71,74 @@ def test_httpd(httpd):
|
||||
|
||||
def test_services_up():
    '''Check that the expected services are up and running.'''
    # check that rethinkdb is listening and looks sane
    r = rethinkstuff.Rethinker(db='rethinkdb')  # built-in db
    tbls = r.table_list().run()
    assert len(tbls) > 10

    # check that warcprox (8000), pywb (8880) and the brozzler webconsole
    # (8881) are each accepting tcp connections
    for port in (8000, 8880, 8881):
        with socket.socket() as s:
            # if the connect fails an exception is raised and the test fails
            s.connect(('localhost', port))
|
||||
|
||||
def test_brozzle_site(httpd):
    '''
    Brozzle a tiny site served by the `httpd` fixture and check the results.

    Creates a site through the frontier, waits up to 120 seconds for its
    status to become FINISHED, then checks the crawled pages, the rows in
    the "captures" table tagged with this run's test_id, and finally that
    pywb plays back file1.txt with the same bytes as the copy on disk.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 120:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {c['url']:c for c in captures if c['method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'youtube-dl:%s' % page1 in captures_by_url
    assert 'youtube-dl:%s' % page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'screenshot:%s' % page2 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page2 in captures_by_url

    # check pywb
    # captures are subscripted like dicts above, so read the timestamp the
    # same way rather than as an attribute
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_path = os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt')
    with open(expected_path, 'rb') as f:
        expected_payload = f.read()
    # fetch the playback url (the original call passed no url at all)
    assert requests.get(wb_url).content == expected_payload
|
||||
|
Loading…
x
Reference in New Issue
Block a user