working on basic integration tests

This commit is contained in:
Noah Levitt 2016-10-13 17:12:35 -07:00
parent ed8b937277
commit 56e651baeb
3 changed files with 83 additions and 18 deletions

View File

@@ -241,6 +241,15 @@ class RethinkDbFrontier:
else:
return None
def site(self, id):
    '''
    Look up a site by primary key in the rethinkdb "sites" table.

    Returns a brozzler.Site built from the stored row, or None when `id`
    is None or no matching row exists.
    '''
    if id is None:
        return None
    row = self.r.table("sites").get(id).run()
    return brozzler.Site(**row) if row else None
def honor_stop_request(self, job_id):
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
job = self.job(job_id)

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev95',
version='1.1b7.dev96',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@@ -1,8 +1,7 @@
#!/usr/bin/env python
'''
cluster-integration-tests.py - integration tests for a brozzler cluster,
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
running already
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already
Copyright (C) 2016 Internet Archive
@@ -26,6 +25,10 @@ import urllib.request
import os
import socket
import rethinkstuff
import time
import brozzler
import datetime
import requests
@pytest.fixture(scope='module')
def httpd(request):
@@ -53,13 +56,13 @@ def test_httpd(httpd):
'''
payload1 = content2 = None
with urllib.request.urlopen(
'http://localhost:%s/' % httpd.server_port) as response:
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
assert response.status == 200
payload1 = response.read()
assert payload1
with urllib.request.urlopen(
'http://localhost:%s/' % httpd.server_port) as response:
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
assert response.status == 200
payload2 = response.read()
assert payload2
@@ -68,21 +71,74 @@ def test_httpd(httpd):
# NOTE(review): this span is rendered diff output with the +/- gutter and
# indentation stripped, so the pre-change and post-change bodies of
# test_services_up are interleaved below.  The first warcprox stanza, the
# "###"-commented pywb stanza, the rethinkdb check and the bare
# "def test_brozzle_site(httpd): pass" stub appear to be the OLD version
# being removed; the three socket checks at the bottom appear to be the NEW
# version.  Reconstruct from the upstream commit before treating this as
# runnable code — TODO confirm which stanzas survive in the new version.
def test_services_up():
'''Check that the expected services are up and running.'''
# check that warcprox is listening
with socket.socket() as s:
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8000))
### # check that pywb is listening
### with socket.socket() as s:
### # if the connect fails an exception is raised and the test fails
### s.connect(('localhost', 8880))
# check that rethinkdb is listening and looks sane
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
tbls = r.table_list().run()
assert len(tbls) > 10
# NOTE(review): old placeholder, superseded by the full test_brozzle_site
# defined later in this diff
def test_brozzle_site(httpd):
pass
# check that warcprox is listening
with socket.socket() as s:
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8000))
# check that pywb is listening
with socket.socket() as s:
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8880))
# check that brozzler webconsole is listening
with socket.socket() as s:
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8881))
def test_brozzle_site(httpd):
    '''
    End-to-end test: enqueue a two-page site in the frontier, wait for a
    running brozzler worker to crawl it, then verify the rethinkdb
    "captures" table and pywb playback.

    Expects brozzler, warcprox, pywb and rethinkdb to already be running
    (see test_services_up).
    '''
    # unique id lets us pick this run's records out of the shared captures table
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly; poll until FINISHED or timeout
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 120:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table; ignore warcprox's HEAD requests
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {c['url']: c for c in captures if c['method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'youtube-dl:%s' % page1 in captures_by_url
    assert 'youtube-dl:%s' % page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'screenshot:%s' % page2 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page2 in captures_by_url

    # check pywb can play the capture back
    # bugfix: captures rows are dicts (indexed c['url'] / c['method'] above),
    # so the timestamp must be read with ['timestamp'], not attribute access
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    # bugfix: close the fixture file, and actually pass wb_url to requests.get()
    # (it was previously called with no argument, and wb_url was never used)
    with open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb') as f:
        expected_payload = f.read()
    assert requests.get(wb_url).content == expected_payload