Raise brozzler.ProxyError when a proxy error occurs while fetching robots.txt, running youtube-dl, or doing a raw fetch

2025-06-06 14:22:14 -04:00 · 2017-04-17 18:15:22 -07:00 · 2017-04-17 18:15:22 -07:00 · dc43794363
commit dc43794363
parent 349b41ab32
4 changed files with 61 additions and 26 deletions
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@ -101,14 +101,14 @@ def is_permitted_by_robots(site, url, proxy=None):
            result = _robots_cache(site, proxy).allowed(
                    url, site.user_agent or "brozzler")
            return result
-        except BaseException as e:
+        except Exception as e:
-            if (isinstance(e, reppy.exceptions.ServerError)
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    and isinstance(e.args[0], brozzler.ReachedLimit)) or (
+                    e.args[0], brozzler.ReachedLimit):
                            isinstance(e, reppy.exceptions.ConnectionException)
                            and isinstance(
                                e.args[0], requests.exceptions.ProxyError)):
                # reppy has wrapped an exception that we want to bubble up
                raise e.args[0]
            elif hasattr(e, 'args') and isinstance(
                    e.args[0], requests.exceptions.ProxyError):
                # reppy has wrapped an exception that we want to bubble up
                raise brozzler.ProxyError(e)
            else:
                if tries_left > 0:
                    logging.warn(
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -251,10 +251,17 @@ class BrozzlerWorker:
        except BaseException as e:
            if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                pass
-            elif (hasattr(e, "exc_info") and e.exc_info[0] ==
+            elif (hasattr(e, "exc_info")
-                    urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
+                    and e.exc_info[0] == urllib.error.HTTPError
                    and hasattr(e.exc_info[1], "code")
                    and e.exc_info[1].code == 420):
                raise brozzler.ReachedLimit(e.exc_info[1])
            elif (hasattr(e, 'exc_info')
                    and e.exc_info[0] == urllib.error.URLError
                    and self._proxy_for(site)):
                # connection problem when using a proxy == proxy error (XXX?)
                raise brozzler.ProxyError(
                        'youtube-dl hit apparent proxy error', e)
            else:
                raise
@ -285,6 +292,8 @@ class BrozzlerWorker:
            raise
        except brozzler.ShutdownRequested:
            raise
        except brozzler.ProxyError:
            raise
        except Exception as e:
            if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                    and hasattr(e.exc_info[1], 'code')
@ -294,7 +303,7 @@ class BrozzlerWorker:
                        e.exc_info[1].code, e.exc_info[1].msg, page.url)
            else:
                self.logger.error(
-                        "youtube_dl raised exception on %s", page,
+                        'youtube_dl raised exception on %s', page,
                        exc_info=True)
        if self._needs_browsing(page, ydl_spy):
@ -379,10 +388,13 @@ class BrozzlerWorker:
            }
        self.logger.info('fetching %s', page)
-        # response is ignored
+        try:
-        requests.get(
+            # response is ignored
-                page.url, proxies=proxies, headers=site.extra_headers(),
+            requests.get(
-                verify=False)
+                    page.url, proxies=proxies, headers=site.extra_headers(),
                    verify=False)
        except requests.exceptions.ProxyError as e:
            raise brozzler.ProxyError(e)
    def _needs_browsing(self, page, brozzler_spy):
        final_bounces = brozzler_spy.final_bounces(page.url)
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
        name='brozzler',
-        version='1.1b11.dev232',
+        version='1.1b11.dev233',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_units.py
+++ b/tests/test_units.py
@ -23,11 +23,11 @@ import threading
 import os
 import brozzler
 import brozzler.chrome
 import socket
 import logging
 import yaml
 import datetime
 import requests
 import tempfile
@pytest.fixture(scope='module')
 def httpd(request):
@ -108,18 +108,41 @@ blocks:
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
-def test_robots_proxy_down(httpd):
+def test_proxy_down():
    '''
-    Test that exception fetching robots.txt bubbles up if proxy is down.
+    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(None, {'seed':url,'user_agent':'im/a/GoOdbot/yep'})
-    sock = socket.socket()
+    This test needs to cover every possible fetch through the proxy other than
-    sock.bind(('127.0.0.1', 0))
+    fetches from the browser. For that, see test_brozzling.py.
-    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+    '''
-    with pytest.raises(requests.exceptions.ProxyError):
+    # nobody listens on port 4 :)
-        brozzler.is_permitted_by_robots(site, url, proxy=not_listening_proxy)
+    not_listening_proxy = '127.0.0.1:4'
    ### binding and not listening produces another type of connection
    ### error, which we could test, but it takes a while
    # sock = socket.socket()
    # sock.bind(('127.0.0.1', 0))
    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
    site = brozzler.Site(None, {'seed':'http://example.com/'})
    page = brozzler.Page(None, {'url': 'http://example.com/'})
    # robots.txt fetch
    with pytest.raises(brozzler.ProxyError):
        brozzler.is_permitted_by_robots(
                site, 'http://example.com/', proxy=not_listening_proxy)
    # youtube-dl fetch
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
        ydl = worker._youtube_dl(tempdir, site)
        with pytest.raises(brozzler.ProxyError):
            worker._try_youtube_dl(ydl, site, page)
    # raw fetch
    with pytest.raises(brozzler.ProxyError):
        worker._fetch_url(site, page)
 def test_start_stop_backwards_compat():
    site = brozzler.Site(None, {'seed': 'http://example.com/'})