working on basic integration tests

Noah Levitt 2016-10-13 17:12:35 -07:00
parent ed8b937277
commit 56e651baeb
3 changed files with 83 additions and 18 deletions


@@ -241,6 +241,15 @@ class RethinkDbFrontier:
         else:
             return None
 
+    def site(self, id):
+        if id is None:
+            return None
+        result = self.r.table("sites").get(id).run()
+        if result:
+            return brozzler.Site(**result)
+        else:
+            return None
+
     def honor_stop_request(self, job_id):
         """Raises brozzler.CrawlJobStopped if stop has been requested."""
         job = self.job(job_id)
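
The new site() accessor mirrors the existing job()-style lookups and is what lets the integration test below poll crawl progress. A minimal usage sketch, assuming a local rethinkdb with the brozzler db and the same rethinkstuff/brozzler APIs used elsewhere in this diff (site_id is a placeholder):

import time
import brozzler
import rethinkstuff

r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)

site = frontier.site(site_id)          # site_id stands in for a real site id
while site and site.status != 'FINISHED':
    time.sleep(0.5)
    site = frontier.site(site.id)      # re-read the current row from rethinkdb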


@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b7.dev95',
+        version='1.1b7.dev96',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',


@@ -1,8 +1,7 @@
 #!/usr/bin/env python
 '''
-cluster-integration-tests.py - integration tests for a brozzler cluster,
-expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
-running already
+test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
+warcprox, pywb, rethinkdb and other dependencies to be running already
 
 Copyright (C) 2016 Internet Archive
@@ -26,6 +25,10 @@ import urllib.request
 import os
 import socket
 import rethinkstuff
+import time
+import brozzler
+import datetime
+import requests
 
 @pytest.fixture(scope='module')
 def httpd(request):
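
The httpd fixture appears above only as context; its implementation is not part of this hunk. As a rough sketch of what such a fixture might look like, assuming a module-scoped http.server serving tests/htdocs on an ephemeral port (the real fixture in the repository may differ):

import http.server
import os
import threading
import pytest

@pytest.fixture(scope='module')
def httpd(request):
    # SimpleHTTPRequestHandler serves the current working directory
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
    request.addfinalizer(fin)

    return httpd    # tests build urls from httpd.server_port
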
@@ -53,13 +56,13 @@ def test_httpd(httpd):
     '''
     payload1 = content2 = None
     with urllib.request.urlopen(
-            'http://localhost:%s/' % httpd.server_port) as response:
+            'http://localhost:%s/file1.txt' % httpd.server_port) as response:
         assert response.status == 200
         payload1 = response.read()
         assert payload1
 
     with urllib.request.urlopen(
-            'http://localhost:%s/' % httpd.server_port) as response:
+            'http://localhost:%s/file1.txt' % httpd.server_port) as response:
         assert response.status == 200
         payload2 = response.read()
         assert payload2
@@ -68,21 +71,74 @@ def test_httpd(httpd):
 def test_services_up():
     '''Check that the expected services are up and running.'''
 
-    # check that warcprox is listening
-    with socket.socket() as s:
-        # if the connect fails an exception is raised and the test fails
-        s.connect(('localhost', 8000))
-
-    ### # check that pywb is listening
-    ### with socket.socket() as s:
-    ###     # if the connect fails an exception is raised and the test fails
-    ###     s.connect(('localhost', 8880))
-
     # check that rethinkdb is listening and looks sane
     r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
     tbls = r.table_list().run()
     assert len(tbls) > 10
 
-def test_brozzle_site(httpd):
-    pass
+    # check that warcprox is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8000))
+
+    # check that pywb is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8880))
+
+    # check that brozzler webconsole is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8881))
+
+def test_brozzle_site(httpd):
+    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # the two pages we expect to be crawled
+    page1 = 'http://localhost:%s/' % httpd.server_port
+    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
+
+    assert site.id is None
+    r = rethinkstuff.Rethinker('localhost', db='brozzler')
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.new_site(frontier, site)
+    assert site.id is not None
+    assert len(list(frontier.site_pages(site.id))) == 1
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 120:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 2
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/file1.txt' % httpd.server_port}
+
+    # take a look at the captures table
+    captures = r.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {c['url']: c for c in captures if c['method'] != 'HEAD'}
+    assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
+    assert page2 in captures_by_url
+    assert 'youtube-dl:%s' % page1 in captures_by_url
+    assert 'youtube-dl:%s' % page2 in captures_by_url
+    assert 'screenshot:%s' % page1 in captures_by_url
+    assert 'screenshot:%s' % page2 in captures_by_url
+    assert 'thumbnail:%s' % page1 in captures_by_url
+    assert 'thumbnail:%s' % page2 in captures_by_url
+
+    # check pywb
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
+    expected_payload = open(os.path.join(
+        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
+    assert requests.get(wb_url).content == expected_payload
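
The final assertion replays page2 through pywb. The replay URL follows pywb's usual <collection>/<14-digit timestamp>/<url> pattern; a small illustration with made-up values (the real test derives the timestamp from the captures table and the port from the httpd fixture):

import datetime

capture_time = datetime.datetime(2016, 10, 13, 17, 12, 35)    # hypothetical capture time
page2 = 'http://localhost:8888/file1.txt'          # 8888 stands in for httpd.server_port

t14 = capture_time.strftime('%Y%m%d%H%M%S')        # '20161013171235'
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
# http://localhost:8880/brozzler/20161013171235/http://localhost:8888/file1.txt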