Merge branch 'master' into qa

* master:
  raise brozzler.ProxyError in case of proxy error fetching robots.txt, doing youtube-dl, or doing raw fetch
  raise new exception brozzler.ProxyError in case of proxy error browsing a page
  make brozzle-page respect --proxy (no test for this!)
  oops, version bump for previous commit
  bubble up proxy errors fetching robots.txt, with unit test, and documentation
Noah Levitt 2017-04-17 18:15:32 -07:00
commit 6844cb5bcb
8 changed files with 126 additions and 15 deletions

brozzler/__init__.py

@@ -29,6 +29,9 @@ class NothingToClaim(Exception):
 class CrawlStopped(Exception):
     pass
 
+class ProxyError(Exception):
+    pass
+
 class ReachedLimit(Exception):
     def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
         import json
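
The new exception class gives callers a way to tell a proxy outage apart from an ordinary crawl failure. A minimal sketch of the distinction (not part of the commit; assumes worker, browser, site, and page objects like the ones constructed in the tests further down):

    import logging
    import brozzler

    try:
        worker.brozzle_page(browser, site, page)
    except brozzler.ProxyError:
        # proxy is down or unreachable: back off and retry later,
        # rather than recording the page as permanently failed
        logging.warning('proxy error brozzling %s, will retry', page.url)
    except brozzler.ReachedLimit:
        # warcprox responded 420 Reached Limit: stop crawling this site
        logging.info('reached limit on site %s', site.seed)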

brozzler/browser.py

@@ -239,6 +239,10 @@ class WebsockReceiverThread(threading.Thread):
                     message['params']['message']['text'])
             elif message['method'] == 'Page.javascriptDialogOpening':
                 self._javascript_dialog_opening(message)
+            elif (message['method'] == 'Network.loadingFailed'
+                    and 'params' in message and 'errorText' in message['params']
+                    and message['params']['errorText'] == 'net::ERR_PROXY_CONNECTION_FAILED'):
+                brozzler.thread_raise(self.calling_thread, brozzler.ProxyError)
             # else:
             #     self.logger.debug("%s %s", message["method"], json_message)
         elif 'result' in message:
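
Worth noting: the websocket receiver runs on its own thread, so it cannot simply raise into the browsing code; brozzler.thread_raise injects the exception into the thread driving the browse. A hedged sketch of how such a cross-thread raise can be done in CPython (brozzler's actual implementation may differ in details):

    import ctypes

    def thread_raise(thread, exctype):
        # ask the interpreter to raise exctype asynchronously in `thread`;
        # PyThreadState_SetAsyncExc returns the number of thread states modified
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
                ctypes.c_long(thread.ident), ctypes.py_object(exctype))
        if ret == 0:
            raise ValueError('invalid thread id %s' % thread.ident)
        elif ret > 1:
            # affected more than one thread state: undo and complain
            ctypes.pythonapi.PyThreadState_SetAsyncExc(
                    ctypes.c_long(thread.ident), None)
            raise SystemError('PyThreadState_SetAsyncExc failed')

The exception is only delivered when the target thread next executes Python bytecode, which suits this case, since the thread running the browse waits in a Python-level loop.
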
@@ -411,7 +415,8 @@ class Browser:
             outlinks: a list of navigational links extracted from the page
 
         Raises:
-            BrowsingException: if browsing the page fails
+            brozzler.ProxyError: in case of proxy connection error
+            BrowsingException: if browsing the page fails in some other way
         '''
         if not self.is_running():
             raise BrowsingException('browser has not been started')

brozzler/cli.py

@@ -166,7 +166,7 @@ def brozzle_page(argv=None):
             'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
             'username': args.username, 'password': args.password})
     page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
-    worker = brozzler.BrozzlerWorker(frontier=None)
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
 
     def on_screenshot(screenshot_png):
         OK_CHARS = (string.ascii_letters + string.digits)

brozzler/robots.py

@@ -73,6 +73,25 @@ def _robots_cache(site, proxy=None):
     return _robots_caches[site.id]
 
 def is_permitted_by_robots(site, url, proxy=None):
+    '''
+    Checks if `url` is permitted by robots.txt.
+
+    In case of problems fetching robots.txt, different things can happen.
+    Reppy (the robots.txt parsing library) handles some exceptions internally
+    and applies an appropriate policy. It bubbles up other exceptions. Of
+    these, there are two kinds that this function raises for the caller to
+    handle, described below. Yet other types of exceptions are caught, and the
+    fetch is retried up to 10 times. In this case, after the 10th failure, the
+    function returns `False` (i.e. forbidden by robots).
+
+    Returns:
+        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
+        by robots.txt, `False` otherwise
+
+    Raises:
+        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
+        brozzler.ProxyError: if the proxy is down
+    '''
     if site.ignore_robots:
         return True
@@ -82,14 +101,23 @@ def is_permitted_by_robots(site, url, proxy=None):
             result = _robots_cache(site, proxy).allowed(
                     url, site.user_agent or "brozzler")
             return result
-        except BaseException as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
+        except Exception as e:
+            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                    e.args[0], brozzler.ReachedLimit):
                 raise e.args[0]
+            elif hasattr(e, 'args') and isinstance(
+                    e.args[0], requests.exceptions.ProxyError):
+                # reppy has wrapped an exception that we want to bubble up
+                raise brozzler.ProxyError(e)
             else:
                 if tries_left > 0:
-                    logging.warn("caught exception fetching robots.txt (%s tries left) for %s: %s", tries_left, url, repr(e))
+                    logging.warn(
+                            "caught exception fetching robots.txt (%s tries "
+                            "left) for %s: %s", tries_left, url, repr(e))
                     tries_left -= 1
                 else:
-                    logging.error("caught exception fetching robots.txt (0 tries left) for %s: %s", url, repr(e), exc_info=True)
+                    logging.error(
+                            "caught exception fetching robots.txt (0 tries "
+                            "left) for %s: %s", url, repr(e), exc_info=True)
                     return False
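
From the caller's side, the contract documented above plays out roughly like this (a sketch, not brozzler's actual call site; assumes site, page, and proxy are in scope):

    try:
        if not brozzler.is_permitted_by_robots(site, page.url, proxy):
            return  # disallowed by robots.txt, or all ten fetch attempts failed
    except brozzler.ReachedLimit:
        raise   # warcprox said 420 Reached Limit; handled at the site level
    except brozzler.ProxyError:
        raise   # proxy trouble, not a page problem; retry the page later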

brozzler/worker.py

@@ -251,10 +251,17 @@ class BrozzlerWorker:
         except BaseException as e:
             if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
                 pass
-            elif (hasattr(e, "exc_info") and e.exc_info[0] ==
-                    urllib.error.HTTPError and hasattr(e.exc_info[1], "code")
+            elif (hasattr(e, "exc_info")
+                    and e.exc_info[0] == urllib.error.HTTPError
+                    and hasattr(e.exc_info[1], "code")
                     and e.exc_info[1].code == 420):
                 raise brozzler.ReachedLimit(e.exc_info[1])
+            elif (hasattr(e, 'exc_info')
+                    and e.exc_info[0] == urllib.error.URLError
+                    and self._proxy_for(site)):
+                # connection problem when using a proxy == proxy error (XXX?)
+                raise brozzler.ProxyError(
+                        'youtube-dl hit apparent proxy error', e)
             else:
                 raise
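
Background for the hasattr(e, 'exc_info') tests above: when an extraction or download fails, youtube-dl raises youtube_dl.utils.DownloadError with the original sys.exc_info() triple attached, which is what lets the worker dispatch on the underlying exception class. A small sketch of the unwrapping (illustrative helper, not brozzler code):

    def underlying_exception(e):
        '''
        Sketch: recover the exception youtube-dl wrapped, or None.
        youtube_dl.utils.DownloadError stores the original sys.exc_info()
        triple on e.exc_info, so e.exc_info[0] is the original class (e.g.
        urllib.error.URLError) and e.exc_info[1] the original instance.
        '''
        exc_info = getattr(e, 'exc_info', None)
        if exc_info and len(exc_info) >= 2:
            return exc_info[1]  # has .code and .msg when it is an HTTPError
        return None
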
@@ -285,6 +292,8 @@ class BrozzlerWorker:
             raise
         except brozzler.ShutdownRequested:
             raise
+        except brozzler.ProxyError:
+            raise
         except Exception as e:
             if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
                     and hasattr(e.exc_info[1], 'code')
@@ -294,7 +303,7 @@ class BrozzlerWorker:
                         e.exc_info[1].code, e.exc_info[1].msg, page.url)
             else:
                 self.logger.error(
-                        "youtube_dl raised exception on %s", page,
+                        'youtube_dl raised exception on %s', page,
                         exc_info=True)
 
         if self._needs_browsing(page, ydl_spy):
if self._needs_browsing(page, ydl_spy):
@@ -379,10 +388,13 @@ class BrozzlerWorker:
         }
         self.logger.info('fetching %s', page)
-        # response is ignored
-        requests.get(
-                page.url, proxies=proxies, headers=site.extra_headers(),
-                verify=False)
+        try:
+            # response is ignored
+            requests.get(
+                    page.url, proxies=proxies, headers=site.extra_headers(),
+                    verify=False)
+        except requests.exceptions.ProxyError as e:
+            raise brozzler.ProxyError(e)
 
     def _needs_browsing(self, page, brozzler_spy):
         final_bounces = brozzler_spy.final_bounces(page.url)
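
The proxies mapping whose closing brace opens this hunk is the standard requests proxies dict; a hedged sketch of its likely shape (the construction itself falls outside the lines shown):

    proxy = self._proxy_for(site)   # e.g. '127.0.0.1:8000'
    proxies = None
    if proxy:
        proxies = {
            'http': 'http://%s' % proxy,
            'https': 'http://%s' % proxy,
        }
    # with a dead proxy, the requests.get(...) above raises
    # requests.exceptions.ProxyError, translated here to brozzler.ProxyError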

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.1b11.dev229',
+        version='1.1b11.dev233',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

tests/test_brozzling.py

@@ -26,6 +26,7 @@ import threading
 import argparse
 import urllib
 import json
+import threading
 
 args = argparse.Namespace()
 args.log_level = logging.INFO
@@ -186,3 +187,28 @@ def test_extract_outlinks(httpd):
         'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port
     }
 
+def test_proxy_down():
+    '''
+    Test that browsing raises `brozzler.ProxyError` when proxy is down.
+
+    See also `test_proxy_down` in test_units.py.
+    '''
+    site = brozzler.Site(None, {'seed':'http://example.com/'})
+    page = brozzler.Page(None, {'url': 'http://example.com/'})
+
+    # nobody listens on port 4 :)
+    not_listening_proxy = '127.0.0.1:4'
+    ### binding and not listening produces another type of connection
+    ### error, which we could test, but it takes a while
+    # sock = socket.socket()
+    # sock.bind(('127.0.0.1', 0))
+    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+
+    chrome_exe = brozzler.suggest_default_chrome_exe()
+    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
+        with pytest.raises(brozzler.ProxyError):
+            worker.brozzle_page(browser, site, page)
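
For reference, the slower alternative sketched in the comments above, written out (per those comments it produces a different type of connection error; the tests use port 4 to keep the failure immediate):

    import socket

    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))  # take an ephemeral port...
    # ...but never call sock.listen(), so the "proxy" never accepts
    not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]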

tests/test_units.py

@@ -23,10 +23,11 @@ import threading
 import os
 import brozzler
 import brozzler.chrome
 import socket
 import logging
 import yaml
 import datetime
 import requests
+import tempfile
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -107,6 +108,42 @@ blocks:
     assert site.is_in_scope(
             'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
 
+def test_proxy_down():
+    '''
+    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
+
+    This test needs to cover every possible fetch through the proxy other than
+    fetches from the browser. For that, see test_brozzling.py.
+    '''
+    # nobody listens on port 4 :)
+    not_listening_proxy = '127.0.0.1:4'
+    ### binding and not listening produces another type of connection
+    ### error, which we could test, but it takes a while
+    # sock = socket.socket()
+    # sock.bind(('127.0.0.1', 0))
+    # not_listening_proxy = '127.0.0.1:%s' % sock.getsockname()[1]
+    worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
+
+    site = brozzler.Site(None, {'seed':'http://example.com/'})
+    page = brozzler.Page(None, {'url': 'http://example.com/'})
+
+    # robots.txt fetch
+    with pytest.raises(brozzler.ProxyError):
+        brozzler.is_permitted_by_robots(
+                site, 'http://example.com/', proxy=not_listening_proxy)
+
+    # youtube-dl fetch
+    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
+        ydl = worker._youtube_dl(tempdir, site)
+        with pytest.raises(brozzler.ProxyError):
+            worker._try_youtube_dl(ydl, site, page)
+
+    # raw fetch
+    with pytest.raises(brozzler.ProxyError):
+        worker._fetch_url(site, page)
+
 def test_start_stop_backwards_compat():
     site = brozzler.Site(None, {'seed': 'http://example.com/'})
     assert len(site.starts_and_stops) == 1