fix robots.txt proxy-down test by setting site.id (the cached robots.txt is stored by site.id, and earlier tests that ran with no site.id were interfering); also test another kind of connection error, for whatever that's worth

Noah Levitt 2017-04-18 12:00:23 -07:00
parent dc43794363
commit ac972d399f
3 changed files with 45 additions and 42 deletions
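
For context on the fix: brozzler memoizes per-site robots.txt state keyed by site.id. A minimal sketch of the failure mode, assuming a module-level dict cache (the names here are illustrative, not brozzler's exact internals):

    import uuid

    class Site:
        '''Stand-in for brozzler.Site; only the id attribute matters here.'''
        def __init__(self, id=None):
            self.id = id

    _robots_caches = {}  # site.id -> cached robots.txt state for that site

    def robots_cache(site):
        # Sites built without an id all have site.id == None, so they share
        # one cache slot, and an earlier test's cached robots.txt leaks into
        # later tests. A unique id per test site isolates the slots.
        if site.id not in _robots_caches:
            _robots_caches[site.id] = {}  # placeholder for fetched state
        return _robots_caches[site.id]

    # what the fixed test does: a fresh uuid gets a fresh cache entry
    assert robots_cache(Site(id=str(uuid.uuid4()))) is not robots_cache(Site())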

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
     name='brozzler',
-    version='1.1b11.dev233',
+    version='1.1b11.dev234',
     description='Distributed web crawling with browsers',
     url='https://github.com/internetarchive/brozzler',
     author='Noah Levitt',

tests/test_brozzling.py

@@ -27,6 +27,7 @@ import argparse
 import urllib
 import json
 import threading
+import socket
 
 args = argparse.Namespace()
 args.log_level = logging.INFO
@@ -192,20 +193,20 @@ def test_proxy_down():
     Test that browsing raises `brozzler.ProxyError` when proxy is down.
 
     See also `test_proxy_down` in test_units.py.
+
+    Tests two different kinds of connection error:
+    - nothing listening on the port (nobody listens on port 4 :))
+    - port bound but not accepting connections
     '''
-    site = brozzler.Site(None, {'seed':'http://example.com/'})
-    page = brozzler.Page(None, {'url': 'http://example.com/'})
-
-    # nobody listens on port 4 :)
-    not_listening_proxy = '127.0.0.1:4'
-    ### binding and not listening produces another type of connection
-    ### error, which we could test, but it takes a while
-    # sock = socket.socket()
-    # sock.bind(('127.0.0.1', 0))
-    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
-    chrome_exe = brozzler.suggest_default_chrome_exe()
-    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+    sock = socket.socket()
+    sock.bind(('127.0.0.1', 0))
+    for not_listening_proxy in (
+            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
+        site = brozzler.Site(None, {'seed':'http://example.com/'})
+        page = brozzler.Page(None, {'url': 'http://example.com/'})
+        worker = brozzler.BrozzlerWorker(
+                frontier=None, proxy=not_listening_proxy)
+        chrome_exe = brozzler.suggest_default_chrome_exe()
+        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
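
A standalone sketch of the two connection failure modes the new loop exercises, using plain sockets with no brozzler involved (exact exception details can vary by platform):

    import socket

    # Mode 1: nothing listening on the port. TCP port 4 is unused in
    # practice, so connect() is typically refused outright.
    try:
        socket.create_connection(('127.0.0.1', 4), timeout=2)
    except OSError as e:
        print('nothing listening:', e)

    # Mode 2: port bound but never listen()ed on. The kernel refuses this
    # connection too, but it can surface as a different error to HTTP
    # clients, which is why the test now loops over both proxy addresses.
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))  # grab an ephemeral port, never listen()
    try:
        socket.create_connection(sock.getsockname(), timeout=2)
    except OSError as e:
        print('bound but not listening:', e)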

tests/test_units.py

@@ -28,6 +28,8 @@ import yaml
 import datetime
 import requests
 import tempfile
+import uuid
+import socket
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -114,19 +116,19 @@ def test_proxy_down():
     This test needs to cover every possible fetch through the proxy other than
     fetches from the browser. For that, see test_brozzling.py.
+
+    Tests two different kinds of connection error:
+    - nothing listening on the port (nobody listens on port 4 :))
+    - port bound but not accepting connections
     '''
-    # nobody listens on port 4 :)
-    not_listening_proxy = '127.0.0.1:4'
-
-    ### binding and not listening produces another type of connection
-    ### error, which we could test, but it takes a while
-    # sock = socket.socket()
-    # sock.bind(('127.0.0.1', 0))
-    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
-    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
-    site = brozzler.Site(None, {'seed':'http://example.com/'})
-    page = brozzler.Page(None, {'url': 'http://example.com/'})
+    sock = socket.socket()
+    sock.bind(('127.0.0.1', 0))
+    for not_listening_proxy in (
+            '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
+        worker = brozzler.BrozzlerWorker(
+                frontier=None, proxy=not_listening_proxy)
+        site = brozzler.Site(None, {
+            'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
+        page = brozzler.Page(None, {'url': 'http://example.com/'})
 
         # robots.txt fetch
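
The hunk is truncated at the `# robots.txt fetch` comment. For orientation only, the check that plausibly follows is along these lines, sketched with brozzler's `is_permitted_by_robots` helper (the verbatim continuation is not shown in this diff):

    # Sketch, not the exact continuation of the truncated hunk: with the
    # proxy down, the robots.txt fetch itself should raise ProxyError.
    with pytest.raises(brozzler.ProxyError):
        brozzler.is_permitted_by_robots(
                site, 'http://example.com/', proxy=not_listening_proxy)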