mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 04:44:12 -04:00
fix robots.txt proxy down test by setting site.id (cached robots is stored by site.id, and other tests that ran earlier with no site.id were interfering); and test another kind of connection error, for whatever that's worth
This commit is contained in:
parent
dc43794363
commit
ac972d399f
3 changed files with 45 additions and 42 deletions
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b11.dev233',
|
version='1.1b11.dev234',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@ -27,6 +27,7 @@ import argparse
|
||||||
import urllib
|
import urllib
|
||||||
import json
|
import json
|
||||||
import threading
|
import threading
|
||||||
|
import socket
|
||||||
|
|
||||||
args = argparse.Namespace()
|
args = argparse.Namespace()
|
||||||
args.log_level = logging.INFO
|
args.log_level = logging.INFO
|
||||||
|
@ -192,23 +193,23 @@ def test_proxy_down():
|
||||||
Test that browsing raises `brozzler.ProxyError` when proxy is down.
|
Test that browsing raises `brozzler.ProxyError` when proxy is down.
|
||||||
|
|
||||||
See also `test_proxy_down` in test_units.py.
|
See also `test_proxy_down` in test_units.py.
|
||||||
|
|
||||||
|
Tests two different kinds of connection error:
|
||||||
|
- nothing listening on the port (nobody listens on port 4 :))
|
||||||
|
- port bound but not accepting connections
|
||||||
'''
|
'''
|
||||||
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
sock = socket.socket()
|
||||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
sock.bind(('127.0.0.1', 0))
|
||||||
|
for not_listening_proxy in (
|
||||||
|
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
|
||||||
|
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
||||||
|
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||||
|
|
||||||
# nobody listens on port 4 :)
|
worker = brozzler.BrozzlerWorker(
|
||||||
not_listening_proxy = '127.0.0.1:4'
|
frontier=None, proxy=not_listening_proxy)
|
||||||
|
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||||
|
|
||||||
### binding and not listening produces another type of connection
|
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||||
### error, which we could test, but it takes a while
|
with pytest.raises(brozzler.ProxyError):
|
||||||
# sock = socket.socket()
|
worker.brozzle_page(browser, site, page)
|
||||||
# sock.bind(('127.0.0.1', 0))
|
|
||||||
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
|
||||||
|
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
|
||||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
|
||||||
|
|
||||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
|
||||||
with pytest.raises(brozzler.ProxyError):
|
|
||||||
worker.brozzle_page(browser, site, page)
|
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,8 @@ import yaml
|
||||||
import datetime
|
import datetime
|
||||||
import requests
|
import requests
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import uuid
|
||||||
|
import socket
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
|
@ -114,35 +116,35 @@ def test_proxy_down():
|
||||||
|
|
||||||
This test needs to cover every possible fetch through the proxy other than
|
This test needs to cover every possible fetch through the proxy other than
|
||||||
fetches from the browser. For that, see test_brozzling.py.
|
fetches from the browser. For that, see test_brozzling.py.
|
||||||
|
|
||||||
|
Tests two different kinds of connection error:
|
||||||
|
- nothing listening on the port (nobody listens on port 4 :))
|
||||||
|
- port bound but not accepting connections
|
||||||
'''
|
'''
|
||||||
# nobody listens on port 4 :)
|
sock = socket.socket()
|
||||||
not_listening_proxy = '127.0.0.1:4'
|
sock.bind(('127.0.0.1', 0))
|
||||||
|
for not_listening_proxy in (
|
||||||
|
'127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
|
||||||
|
worker = brozzler.BrozzlerWorker(
|
||||||
|
frontier=None, proxy=not_listening_proxy)
|
||||||
|
site = brozzler.Site(None, {
|
||||||
|
'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
|
||||||
|
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
||||||
|
|
||||||
### binding and not listening produces another type of connection
|
# robots.txt fetch
|
||||||
### error, which we could test, but it takes a while
|
|
||||||
# sock = socket.socket()
|
|
||||||
# sock.bind(('127.0.0.1', 0))
|
|
||||||
# not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
|
|
||||||
|
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
|
||||||
|
|
||||||
site = brozzler.Site(None, {'seed':'http://example.com/'})
|
|
||||||
page = brozzler.Page(None, {'url': 'http://example.com/'})
|
|
||||||
|
|
||||||
# robots.txt fetch
|
|
||||||
with pytest.raises(brozzler.ProxyError):
|
|
||||||
brozzler.is_permitted_by_robots(
|
|
||||||
site, 'http://example.com/', proxy=not_listening_proxy)
|
|
||||||
|
|
||||||
# youtube-dl fetch
|
|
||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
|
||||||
ydl = worker._youtube_dl(tempdir, site)
|
|
||||||
with pytest.raises(brozzler.ProxyError):
|
with pytest.raises(brozzler.ProxyError):
|
||||||
worker._try_youtube_dl(ydl, site, page)
|
brozzler.is_permitted_by_robots(
|
||||||
|
site, 'http://example.com/', proxy=not_listening_proxy)
|
||||||
|
|
||||||
# raw fetch
|
# youtube-dl fetch
|
||||||
with pytest.raises(brozzler.ProxyError):
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
worker._fetch_url(site, page)
|
ydl = worker._youtube_dl(tempdir, site)
|
||||||
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
worker._try_youtube_dl(ydl, site, page)
|
||||||
|
|
||||||
|
# raw fetch
|
||||||
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
worker._fetch_url(site, page)
|
||||||
|
|
||||||
def test_start_stop_backwards_compat():
|
def test_start_stop_backwards_compat():
|
||||||
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
site = brozzler.Site(None, {'seed': 'http://example.com/'})
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue