Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-10-11 12:58:39 -04:00
toward getting initial tests to pass

parent 5a373466a3
commit 27452990ee

3 changed files with 6 additions and 9 deletions
@@ -1,4 +1,4 @@
-archive_paths: {{warcs_dir}}
+archive_paths: {{warcs_dir}}/ # pywb will fail without a trailing slash
 collections:
   brozzler:
     index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
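
The new inline comment on archive_paths points at a real gotcha: if pywb builds the full WARC path by plain string concatenation of archive_paths and the per-record filename (an assumption about pywb's path handling implied by the comment, not shown in this diff), then a directory value without a trailing slash yields a mangled path. A minimal sketch of that failure mode:

# Illustrative sketch only: assumes the archive path and WARC filename are
# joined by simple string concatenation, as the trailing-slash comment implies.
warcs_dir_no_slash = '/var/tmp/brozzler/warcs'      # hypothetical {{warcs_dir}} value
warcs_dir_slash = '/var/tmp/brozzler/warcs/'
warc_name = 'brozzler-20160101000000-0000.warc.gz'  # made-up WARC filename

print(warcs_dir_no_slash + warc_name)  # /var/tmp/brozzler/warcsbrozzler-... (wrong path)
print(warcs_dir_slash + warc_name)     # /var/tmp/brozzler/warcs/brozzler-... (correct)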

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b7.dev98',
+        version='1.1b7.dev99',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -111,7 +111,7 @@ def test_brozzle_site(httpd):
 
     # the site should be brozzled fairly quickly
     start = time.time()
-    while site.status != 'FINISHED' and time.time() - start < 120:
+    while site.status != 'FINISHED' and time.time() - start < 300:
         time.sleep(0.5)
         site = frontier.site(site.id)
     assert site.status == 'FINISHED'
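
The loop above is a plain poll-until-done pattern; the commit only raises the cutoff from 120 to 300 seconds, presumably because brozzling a site end to end (browser behaviors, youtube-dl, warcprox writing WARCs) can take longer than two minutes on a slow test machine. A generic sketch of the same pattern, where wait_for and condition are illustrative names, not anything in brozzler's test suite:

import time

def wait_for(condition, timeout=300, poll_interval=0.5):
    """Poll condition() until it returns truthy or timeout seconds elapse."""
    start = time.time()
    while time.time() - start < timeout:
        if condition():
            return True
        time.sleep(poll_interval)
    return False

# usage in the spirit of the test above:
# assert wait_for(lambda: frontier.site(site.id).status == 'FINISHED', timeout=300)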
@@ -125,19 +125,16 @@ def test_brozzle_site(httpd):
 
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['method'] != 'HEAD'}
+    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
-    assert 'youtube-dl:%s' % page1 in captures_by_url
-    assert 'youtube-dl:%s' % page2 in captures_by_url
     assert 'screenshot:%s' % page1 in captures_by_url
-    assert 'screenshot:%s' % page2 in captures_by_url
     assert 'thumbnail:%s' % page1 in captures_by_url
-    assert 'thumbnail:%s' % page2 in captures_by_url
+    # no screenshots of plaintext
 
     # check pywb
-    t14 = captures_by_url[page2].timestamp.strftime('%Y%m%d%H%M%S')
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
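
Both fixes in this hunk treat a row from the RethinkDB captures table as a plain dict: the request method lives under the 'http_method' key, and the timestamp is reached with ['timestamp'] rather than attribute access. A self-contained sketch with made-up capture rows (the field names follow the diff; the URLs and values are invented):

import datetime

# made-up capture rows shaped like the fields the test relies on
captures = [
    {'url': 'http://localhost:8888/site1/', 'http_method': 'GET',
     'timestamp': datetime.datetime(2016, 1, 1, 0, 0, 0)},
    {'url': 'http://localhost:8888/site1/', 'http_method': 'HEAD',
     'timestamp': datetime.datetime(2016, 1, 1, 0, 0, 1)},
]

# same shape as the comprehension in the diff: ignore HEAD requests
captures_by_url = {c['url']: c for c in captures if c['http_method'] != 'HEAD'}

# rows are dicts, so the timestamp is c['timestamp'], not c.timestamp
t14 = captures_by_url['http://localhost:8888/site1/']['timestamp'].strftime('%Y%m%d%H%M%S')
print(t14)  # 20160101000000

The resulting 14-digit t14 string is what the test splices into the pywb replay URL on the following line.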