fix robots.txt proxy down test by setting site.id (cached robots is stored by site.id, and other tests that ran earlier with no site.id were interfering); and test another kind of connection error, for whatever that's worth

2025-09-23 22:24:52 -04:00 · 2017-04-18 12:00:23 -07:00 · 2017-04-18 12:00:23 -07:00 · ac972d399f
commit ac972d399f
parent dc43794363
3 changed files with 45 additions and 42 deletions
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b11.dev233',
+        version='1.1b11.dev234',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@ -27,6 +27,7 @@ import argparse
 import urllib
 import json
 import threading
+import socket

 args = argparse.Namespace()
 args.log_level = logging.INFO
@ -192,23 +193,23 @@ def test_proxy_down():
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.
+
+    Tests two different kinds of connection error:
+    - nothing listening on the port (nobody listens on port 4 :))
+    - port bound but not accepting connections
    '''
-    site = brozzler.Site(None, {'seed':'http://example.com/'})
-    page = brozzler.Page(None, {'url': 'http://example.com/'})
+    sock = socket.socket()
+    sock.bind(('127.0.0.1', 0))
+    for not_listening_proxy in (
+            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
+        site = brozzler.Site(None, {'seed':'http://example.com/'})
+        page = brozzler.Page(None, {'url': 'http://example.com/'})

-    # nobody listens on port 4 :)
-    not_listening_proxy = '127.0.0.1:4'
+        worker = brozzler.BrozzlerWorker(
+                frontier=None, proxy=not_listening_proxy)
+        chrome_exe = brozzler.suggest_default_chrome_exe()

-    ### binding and not listening produces another type of connection
-    ### error, which we could test, but it takes a while
-    # sock = socket.socket()
-    # sock.bind(('127.0.0.1', 0))
-    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
-
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
-    chrome_exe = brozzler.suggest_default_chrome_exe()
-
-    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
-        with pytest.raises(brozzler.ProxyError):
-            worker.brozzle_page(browser, site, page)
+        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+            with pytest.raises(brozzler.ProxyError):
+                worker.brozzle_page(browser, site, page)

--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -28,6 +28,8 @@ import yaml
 import datetime
 import requests
 import tempfile
+import uuid
+import socket

@pytest.fixture(scope='module')
 def httpd(request):
@ -114,35 +116,35 @@ def test_proxy_down():

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.
+
+    Tests two different kinds of connection error:
+    - nothing listening on the port (nobody listens on port 4 :))
+    - port bound but not accepting connections
    '''
-    # nobody listens on port 4 :)
-    not_listening_proxy = '127.0.0.1:4'
+    sock = socket.socket()
+    sock.bind(('127.0.0.1', 0))
+    for not_listening_proxy in (
+            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
+        worker = brozzler.BrozzlerWorker(
+                frontier=None, proxy=not_listening_proxy)
+        site = brozzler.Site(None, {
+            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
+        page = brozzler.Page(None, {'url': 'http://example.com/'})

-    ### binding and not listening produces another type of connection
-    ### error, which we could test, but it takes a while
-    # sock = socket.socket()
-    # sock.bind(('127.0.0.1', 0))
-    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
-
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
-
-    site = brozzler.Site(None, {'seed':'http://example.com/'})
-    page = brozzler.Page(None, {'url': 'http://example.com/'})
-
-    # robots.txt fetch
-    with pytest.raises(brozzler.ProxyError):
-        brozzler.is_permitted_by_robots(
-                site, 'http://example.com/', proxy=not_listening_proxy)
-
-    # youtube-dl fetch
-    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
-        ydl = worker._youtube_dl(tempdir, site)
+        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
-            worker._try_youtube_dl(ydl, site, page)
+            brozzler.is_permitted_by_robots(
+                    site, 'http://example.com/', proxy=not_listening_proxy)

-    # raw fetch
-    with pytest.raises(brozzler.ProxyError):
-        worker._fetch_url(site, page)
+        # youtube-dl fetch
+        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+            ydl = worker._youtube_dl(tempdir, site)
+            with pytest.raises(brozzler.ProxyError):
+                worker._try_youtube_dl(ydl, site, page)
+
+        # raw fetch
+        with pytest.raises(brozzler.ProxyError):
+            worker._fetch_url(site, page)

 def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {'seed': 'http://example.com/'})