toward getting initial tests to pass

2025-04-19 15:25:59 -04:00 · 2016-10-14 18:26:48 -07:00 · 2016-10-14 18:26:48 -07:00 · 27452990ee
commit 27452990ee
parent 5a373466a3
3 changed files with 6 additions and 9 deletions
--- a/ansible/roles/pywb/templates/pywb.yml.j2
+++ b/ansible/roles/pywb/templates/pywb.yml.j2
@@ -1,4 +1,4 @@
-archive_paths: {{warcs_dir}}
+archive_paths: {{warcs_dir}}/  # pywb will fail without a trailing slash
 collections:
  brozzler:
    index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b7.dev98',
+        version='1.1b7.dev99',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -111,7 +111,7 @@ def test_brozzle_site(httpd):

    # the site should be brozzled fairly quickly
    start = time.time()
-    while site.status != 'FINISHED' and time.time() - start < 120:
+    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'
@@ -125,19 +125,16 @@ def test_brozzle_site(httpd):

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['method'] != 'HEAD'}
+    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
-    assert 'youtube-dl:%s' % page1 in captures_by_url
-    assert 'youtube-dl:%s' % page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
-    assert 'screenshot:%s' % page2 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
-    assert 'thumbnail:%s' % page2 in captures_by_url
+    # no screenshots of plaintext

    # check pywb
-    t14 = captures_by_url[page2].timestamp.strftime('%Y%m%d%H%M%S')
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()