Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-02-24 08:39:59 -05:00)
honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and, in the case of ignore_cert_errors, now always on rather than an option); use the "reppy" library for robots.txt handling; fix some bugs
parent e04247c3f7
commit 140a441eb5
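The gist of the change, as a rough usage sketch (not code from this commit): proxy, certificate, and warcprox settings move off the worker and onto each site. The `BrozzlerWorker` call below mirrors the updated worker script in the first hunk; the `Site` keyword arguments are an assumption inferred from the attributes the rest of the diff reads (`site.proxy`, `site.enable_warcprox_features`), not a documented constructor signature.

```python
import brozzler
import brozzler.worker

# the worker no longer takes --proxy-server / --ignore-certificate-errors /
# --enable-warcprox-features; it only needs AMQP, browser count, and chrome path
worker = brozzler.worker.BrozzlerWorker(
        amqp_url="amqp://guest:guest@localhost:5672/%2f",
        max_browsers=1, chrome_exe="chromium-browser")
worker.start()

# hypothetical per-site settings; in practice sites arrive as AMQP payloads
# and are instantiated with brozzler.Site(**msg.payload)
site = brozzler.Site(seed="http://example.com/",
        proxy="localhost:8000", enable_warcprox_features=True)
```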
@@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
         help='executable to use to invoke chrome')
 arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
         help='max number of chrome instances simultaneously browsing pages')
-arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
-        help='configure browser to use specified proxy server')
-arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
-        action='store_true', help='configure browser to ignore certificate errors')
-arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
-        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm)
 signal.signal(signal.SIGINT, sigint)
 
 worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url,
-        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe,
-        proxy_server=args.proxy_server,
-        ignore_cert_errors=args.ignore_cert_errors,
-        enable_warcprox_features=args.enable_warcprox_features)
+        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
 
 worker.start()
 
@@ -67,11 +67,11 @@ class Browser:
 
     HARD_TIMEOUT_SECONDS = 20 * 60
 
-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
         self._behavior = None
         self._websock = None
@@ -88,20 +88,24 @@ class Browser:
     def __exit__(self, *args):
         self.stop()
 
-    def start(self):
+    def start(self, proxy=None):
         # these can raise exceptions
         self._work_dir = tempfile.TemporaryDirectory()
         self._chrome_instance = Chrome(port=self.chrome_port,
                 executable=self.chrome_exe,
                 user_home_dir=self._work_dir.name,
                 user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
-                proxy_server=self.proxy_server,
-                ignore_cert_errors=self.ignore_cert_errors)
+                ignore_cert_errors=self.ignore_cert_errors,
+                proxy=proxy or self.proxy)
         self._websocket_url = self._chrome_instance.start()
 
     def stop(self):
-        self._chrome_instance.stop()
-        self._work_dir.cleanup()
+        if self._chrome_instance:
+            self._chrome_instance.stop()
+            self._chrome_instance = None
+        if self._work_dir:
+            self._work_dir.cleanup()
+            self._work_dir = None
 
     def abort_browse_page(self):
         self._abort_browse_page = True
@@ -250,12 +254,12 @@ class Browser:
 class Chrome:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
         self.port = port
         self.executable = executable
         self.user_home_dir = user_home_dir
         self.user_data_dir = user_data_dir
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
 
     # returns websocket url to chrome window with about:blank loaded
@@ -281,8 +285,8 @@ class Chrome:
                 "--disable-web-security"]
         if self.ignore_cert_errors:
             chrome_args.append("--ignore-certificate-errors")
-        if self.proxy_server:
-            chrome_args.append("--proxy-server={}".format(self.proxy_server))
+        if self.proxy:
+            chrome_args.append("--proxy-server={}".format(self.proxy))
         chrome_args.append("about:blank")
         self.logger.info("running: {}".format(" ".join(chrome_args)))
         self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
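A minimal sketch of the reworked Browser lifecycle above (my own usage example, not code from the repo): `start()` now takes an optional per-call proxy that overrides the one given to `__init__`, and `stop()` nulls out the chrome instance and work dir so a second call is a harmless no-op.

```python
import brozzler.browser

browser = brozzler.browser.Browser(chrome_port=9222,
        chrome_exe="chromium-browser", ignore_cert_errors=True)
try:
    # per-site proxy passed at start time wins over self.proxy
    browser.start(proxy="localhost:8000")
    outlinks = browser.browse_page("http://example.com/")
finally:
    # mirrors the worker's finally: block; guarded so repeated calls are safe
    browser.stop()
```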
@@ -4,46 +4,8 @@ import surt
 import json
 import logging
 import urllib.robotparser
-import urllib.request
-
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotFileParser(urllib.robotparser.RobotFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotFileParser."""
-
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False  # ignore cert errors
+        if proxy:
+            proxie = "http://{}".format(proxy)
+            req_sesh.proxies = {"http":proxie,"https":proxie}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
        return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
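For reference, a standalone sketch of what the Site class now delegates to reppy (the helper name is mine; the calls mirror the `__init__` and `is_permitted_by_robots` hunks above, using the old seomoz reppy API that requirements.txt pins below): one shared requests session with cert verification off and an optional proxy.

```python
import requests
import reppy.cache

def make_robots_cache(proxy=None):
    """Hypothetical helper mirroring Site.__init__ above."""
    session = requests.Session()
    session.verify = False  # ignore cert errors (the proxy may MITM https)
    if proxy:
        proxy_url = "http://{}".format(proxy)
        session.proxies = {"http": proxy_url, "https": proxy_url}
    return reppy.cache.RobotsCache(session=session)

robots = make_robots_cache(proxy="localhost:8000")
# same user-agent token the worker checks with
print(robots.allowed("http://example.com/some/page", "brozzler"))
```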
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
-
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
+    def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
             ## XXX (sometimes?) causes chrome debug websocket to go through
             ## proxy. Maybe not needed thanks to hls_prefer_native.
             ## # see https://github.com/rg3/youtube-dl/issues/6087
-            ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
@@ -77,14 +70,14 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
+    def _putmeta(self, warcprox_address, url, content_type, payload):
         request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
 
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
+        request.set_proxy(warcprox_address, "http")
 
         try:
             with urllib.request.urlopen(request) as response:
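The PUTMETA mechanics above, pulled out as a standalone sketch (my own framing; same calls as the method): warcprox accepts a custom PUTMETA method on the proxied URL and is expected to answer 204.

```python
import urllib.request

def putmeta(warcprox_address, url, content_type, payload):
    """Hypothetical standalone version of BrozzlerWorker._putmeta above."""
    request = urllib.request.Request(url, method="PUTMETA",
            headers={"Content-Type": content_type}, data=payload)
    # same hack as above: force type to "http" so urllib proxies https urls
    # instead of trying to CONNECT-tunnel them
    request.type = "http"
    request.set_proxy(warcprox_address, "http")
    with urllib.request.urlopen(request) as response:
        return response.getcode()  # warcprox should return 204
```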
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
             raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
+            browser.start(proxy=site.proxy)
             while not self._shutdown_requested.is_set() and time.time() - start < 60:
                 try:
                     crawl_url = self._next_url(site)
                     logging.info("crawling {}".format(crawl_url))
-                    self._try_youtube_dl(site, crawl_url)
+                    self._try_youtube_dl(ydl, site, crawl_url)
                     crawl_url.outlinks = browser.browse_page(crawl_url.url,
                             on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
                     self._completed_url(site, crawl_url)
                     crawl_url = None
                 except kombu.simple.Empty:
                     # if some timeout reached, re-raise?
                     pass
             # except kombu.simple.Empty:
             #     logging.info("finished {} (queue is empty)".format(site))
         except brozzler.browser.BrowsingAborted:
             logging.info("{} shut down".format(browser))
         finally:
+            browser.stop()
             self._disclaim_site(site, crawl_url)
             self._browser_pool.release(browser)
 
@@ -153,7 +148,8 @@ class BrozzlerWorker:
                 site = brozzler.Site(**msg.payload)
                 msg.ack() # XXX ack only after browsing finished? kinda complicated
                 logging.info("browsing site {}".format(site))
-                th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
+                ydl = self._youtube_dl(site)
+                th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
                         name="BrowsingThread-{}".format(site.scope_surt))
                 th.start()
             except kombu.simple.Empty:
@@ -4,3 +4,4 @@ argparse
 PyYAML
 git+https://github.com/ikreymer/surt.git@py3
 youtube_dl
+git+https://github.com/seomoz/reppy.git