Use warcprox if enable_warcprox_features is true

2025-08-11 07:50:24 -04:00 · 2016-10-18 17:39:33 -07:00 · 2016-10-18 17:39:33 -07:00 · 2215aaab21
commit 2215aaab21
parent a370e7b987
2 changed files with 79 additions and 5 deletions
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -121,7 +121,68 @@ def test_brozzle_site(httpd):
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/' % httpd.server_port,
-            'http://localhost:%s/file1.txt' % httpd.server_port }
+            'http://localhost:%s/file1.txt' % httpd.server_port}
+
+    # take a look at the captures table
+    captures = r.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
+    assert page2 in captures_by_url
+    assert 'screenshot:%s' % page1 in captures_by_url
+    assert 'thumbnail:%s' % page1 in captures_by_url
+    # no screenshots of plaintext
+
+    # check pywb
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
+    expected_payload = open(os.path.join(
+        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
+    assert requests.get(wb_url).content == expected_payload
+
+
+def test_warcprox_selection(httpd):
+    ''' When enable_warcprox_features is true, brozzler is expected to choose
+    and instance of warcprox '''
+
+    test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
+
+    # the two pages we expect to be crawled
+    page1 = 'http://localhost:%s/' % httpd.server_port
+    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
+
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            enable_warcprox_features=True,
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    assert site.id is None
+    r = rethinkstuff.Rethinker('localhost', db='brozzler')
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.new_site(frontier, site)
+    assert site.id is not None
+    assert len(list(frontier.site_pages(site.id))) == 1
+
+    # check proxy is set in rethink
+    start = time.time()
+    while not site.proxy and time.time() - start < 20:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.proxy[-5:] == ':8000'
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 2
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/file1.txt' % httpd.server_port}

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()