mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 15:25:59 -04:00
toward getting initial tests to pass
This commit is contained in:
parent
5a373466a3
commit
27452990ee
@ -1,4 +1,4 @@
|
||||
archive_paths: {{warcs_dir}}
|
||||
archive_paths: {{warcs_dir}}/ # pywb will fail without a trailing slash
|
||||
collections:
|
||||
brozzler:
|
||||
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b7.dev98',
|
||||
version='1.1b7.dev99',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -111,7 +111,7 @@ def test_brozzle_site(httpd):
|
||||
|
||||
# the site should be brozzled fairly quickly
|
||||
start = time.time()
|
||||
while site.status != 'FINISHED' and time.time() - start < 120:
|
||||
while site.status != 'FINISHED' and time.time() - start < 300:
|
||||
time.sleep(0.5)
|
||||
site = frontier.site(site.id)
|
||||
assert site.status == 'FINISHED'
|
||||
@ -125,19 +125,16 @@ def test_brozzle_site(httpd):
|
||||
|
||||
# take a look at the captures table
|
||||
captures = r.table('captures').filter({'test_id':test_id}).run()
|
||||
captures_by_url = {c['url']:c for c in captures if c['method'] != 'HEAD'}
|
||||
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
|
||||
assert page1 in captures_by_url
|
||||
assert '%srobots.txt' % page1 in captures_by_url
|
||||
assert page2 in captures_by_url
|
||||
assert 'youtube-dl:%s' % page1 in captures_by_url
|
||||
assert 'youtube-dl:%s' % page2 in captures_by_url
|
||||
assert 'screenshot:%s' % page1 in captures_by_url
|
||||
assert 'screenshot:%s' % page2 in captures_by_url
|
||||
assert 'thumbnail:%s' % page1 in captures_by_url
|
||||
assert 'thumbnail:%s' % page2 in captures_by_url
|
||||
# no screenshots of plaintext
|
||||
|
||||
# check pywb
|
||||
t14 = captures_by_url[page2].timestamp.strftime('%Y%m%d%H%M%S')
|
||||
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
||||
expected_payload = open(os.path.join(
|
||||
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
||||
|
Loading…
x
Reference in New Issue
Block a user