mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and, in the case of ignore_cert_errors, now always on, no longer an option); use "reppy" library for robots.txt handling; fix some bugs
This commit is contained in:
parent e04247c3f7
commit 140a441eb5
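
To illustrate the configuration surface described in the commit message: under the new scheme a site might be queued with a payload along these lines. This is a sketch; the field names proxy, enable_warcprox_features, and ignore_robots appear in the diff below, while the overall payload shape is an assumption.

# hypothetical site message payload: the proxy and warcprox features are now
# configured per site rather than per worker
site_payload = {
    "seed": "http://example.com/",
    "proxy": "localhost:8000",           # e.g. a warcprox instance
    "enable_warcprox_features": True,    # screenshots + youtube-dl info via PUTMETA
    "ignore_robots": False,
}
# the worker builds the site from the queued message:
#   site = brozzler.Site(**msg.payload)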
@@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromi
         help='executable to use to invoke chrome')
 arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
         help='max number of chrome instances simultaneously browsing pages')
-arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
-        help='configure browser to use specified proxy server')
-arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
-        action='store_true', help='configure browser to ignore certificate errors')
-arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
-        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm)
 signal.signal(signal.SIGINT, sigint)
 
 worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url,
-        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe,
-        proxy_server=args.proxy_server,
-        ignore_cert_errors=args.ignore_cert_errors,
-        enable_warcprox_features=args.enable_warcprox_features)
+        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
 
 worker.start()
 
@@ -70,9 +61,9 @@ try:
         time.sleep(0.5)
 except brozzler.ShutdownRequested as e:
     worker.shutdown_now()
 
 for th in threading.enumerate():
     if th != threading.current_thread():
         th.join()
 
 logging.info("all done, exiting")
@@ -67,11 +67,11 @@ class Browser:
 
     HARD_TIMEOUT_SECONDS = 20 * 60
 
-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
         self._behavior = None
         self._websock = None
@@ -88,26 +88,30 @@ class Browser:
     def __exit__(self, *args):
         self.stop()
 
-    def start(self):
+    def start(self, proxy=None):
         # these can raise exceptions
         self._work_dir = tempfile.TemporaryDirectory()
         self._chrome_instance = Chrome(port=self.chrome_port,
                 executable=self.chrome_exe,
                 user_home_dir=self._work_dir.name,
                 user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
-                proxy_server=self.proxy_server,
-                ignore_cert_errors=self.ignore_cert_errors)
+                ignore_cert_errors=self.ignore_cert_errors,
+                proxy=proxy or self.proxy)
         self._websocket_url = self._chrome_instance.start()
 
     def stop(self):
-        self._chrome_instance.stop()
-        self._work_dir.cleanup()
+        if self._chrome_instance:
+            self._chrome_instance.stop()
+            self._chrome_instance = None
+        if self._work_dir:
+            self._work_dir.cleanup()
+            self._work_dir = None
 
     def abort_browse_page(self):
         self._abort_browse_page = True
 
     def browse_page(self, url, on_request=None, on_screenshot=None):
-        """Synchronously loads a page, takes a screenshot, and runs behaviors.
+        """Synchronously loads a page, takes a screenshot, and runs behaviors.
 
         Raises BrowsingException if browsing the page fails in a non-critical
         way.
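
Taken together, the Browser changes mean the proxy can be fixed at construction time or supplied per call to start(), and stop() is now safe to call even if startup failed partway. A minimal usage sketch, mirroring what the worker code later in this diff does; the proxy address is a placeholder:

import brozzler.browser

browser = brozzler.browser.Browser(chrome_port=9222, chrome_exe="chromium-browser")
try:
    browser.start(proxy="localhost:8000")   # per-call override of self.proxy
    outlinks = browser.browse_page("http://example.com/")
finally:
    browser.stop()   # guarded with None checks, so safe after a failed start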
@@ -165,7 +169,7 @@ class Browser:
             return True
         elif not self._waiting_on_outlinks_msg_id:
             self.logger.info("finished browsing page according to behavior, retrieving outlinks url={}".format(self.url))
-            self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate",
+            self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate",
                     params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
             return False
         elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
@@ -250,12 +254,12 @@ class Browser:
 class Chrome:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
         self.port = port
         self.executable = executable
         self.user_home_dir = user_home_dir
         self.user_data_dir = user_data_dir
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
 
     # returns websocket url to chrome window with about:blank loaded
@@ -281,8 +285,8 @@ class Chrome:
                 "--disable-web-security"]
         if self.ignore_cert_errors:
             chrome_args.append("--ignore-certificate-errors")
-        if self.proxy_server:
-            chrome_args.append("--proxy-server={}".format(self.proxy_server))
+        if self.proxy:
+            chrome_args.append("--proxy-server={}".format(self.proxy))
         chrome_args.append("about:blank")
         self.logger.info("running: {}".format(" ".join(chrome_args)))
         self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
@@ -4,46 +4,8 @@ import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
-
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotFileParser(urllib.robotparser.RobotFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotFileParser."""
-
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False  # ignore cert errors
+        if proxy:
+            proxie = "http://{}".format(proxy)
+            req_sesh.proxies = {"http":proxie,"https":proxie}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
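
For reference, a self-contained sketch of the new robots.txt path, using the reppy API exactly as it appears above (RobotsCache(session=...) and allowed(url, agent)); the proxy address is a placeholder:

import requests
import reppy.cache

session = requests.Session()
session.verify = False   # brozzler deliberately ignores certificate errors
session.proxies = {"http": "http://localhost:8000",
        "https": "http://localhost:8000"}   # e.g. warcprox

robots_cache = reppy.cache.RobotsCache(session=session)
# reppy fetches and caches http://example.com/robots.txt on first use
print(robots_cache.allowed("http://example.com/some/page", "brozzler"))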
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
     def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
         ## XXX (sometimes?) causes chrome debug websocket to go through
         ## proxy. Maybe not needed thanks to hls_prefer_native.
         ## # see https://github.com/rg3/youtube-dl/issues/6087
-        ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
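
Because _youtube_dl() now returns a fresh instance instead of storing self._ydl, each site gets a YoutubeDL configured with its own proxy. A reduced sketch with ydl_opts trimmed to the proxy-relevant parts; the proxy address is a placeholder:

import youtube_dl

ydl_opts = {
    "outtmpl": "/dev/null",
    "proxy": "http://localhost:8000",   # taken from site.proxy when set
}
ydl = youtube_dl.YoutubeDL(ydl_opts)
# info = ydl.extract_info("http://example.com/some/video")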
@@ -77,15 +70,15 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
+    def _putmeta(self, warcprox_address, url, content_type, payload):
         request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
 
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
+        request.set_proxy(warcprox_address, "http")
 
         try:
             with urllib.request.urlopen(request) as response:
                 if response.status != 204:
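
The request.type = "http" line is worth a note: for an https URL, urllib would normally open a CONNECT tunnel through the proxy, so warcprox would never see the PUTMETA verb itself; forcing the type to "http" makes urllib send a plain absolute-URI request that warcprox can handle directly. A standalone sketch of the same pattern, with the warcprox address and payload as placeholders:

import urllib.request

request = urllib.request.Request("https://example.com/page", method="PUTMETA",
        headers={"Content-Type": "image/png"}, data=b"...png bytes...")
request.type = "http"   # keep urllib from CONNECT-tunneling the https url
request.set_proxy("localhost:8000", "http")   # e.g. warcprox
with urllib.request.urlopen(request) as response:
    assert response.status == 204   # warcprox acknowledges PUTMETA with 204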
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
                 raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
-                while not self._shutdown_requested.is_set() and time.time() - start < 60:
-                    try:
-                        crawl_url = self._next_url(site)
-                        logging.info("crawling {}".format(crawl_url))
-                        self._try_youtube_dl(site, crawl_url)
-                        crawl_url.outlinks = browser.browse_page(crawl_url.url,
-                                on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
-                        self._completed_url(site, crawl_url)
-                        crawl_url = None
-                    except kombu.simple.Empty:
-                        # if some timeout reached, re-raise?
-                        pass
+            browser.start(proxy=site.proxy)
+            while not self._shutdown_requested.is_set() and time.time() - start < 60:
+                try:
+                    crawl_url = self._next_url(site)
+                    logging.info("crawling {}".format(crawl_url))
+                    self._try_youtube_dl(ydl, site, crawl_url)
+                    crawl_url.outlinks = browser.browse_page(crawl_url.url,
+                            on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
+                    self._completed_url(site, crawl_url)
+                    crawl_url = None
+                except kombu.simple.Empty:
+                    # if some timeout reached, re-raise?
+                    pass
         # except kombu.simple.Empty:
         #     logging.info("finished {} (queue is empty)".format(site))
         except brozzler.browser.BrowsingAborted:
             logging.info("{} shut down".format(browser))
         finally:
+            browser.stop()
             self._disclaim_site(site, crawl_url)
             self._browser_pool.release(browser)
 
@@ -153,7 +148,8 @@ class BrozzlerWorker:
                 site = brozzler.Site(**msg.payload)
                 msg.ack()  # XXX ack only after browsing finished? kinda complicated
                 logging.info("browsing site {}".format(site))
-                th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
+                ydl = self._youtube_dl(site)
+                th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
                         name="BrowsingThread-{}".format(site.scope_surt))
                 th.start()
             except kombu.simple.Empty:
@@ -164,7 +160,7 @@ class BrozzlerWorker:
                 latest_state = "browsers-busy"
             else:
                 q_empty = True
-
+
         if q_empty:
             if latest_state != "no-unclaimed-sites":
                 logging.info("no unclaimed sites to browse")
@@ -4,3 +4,4 @@ argparse
 PyYAML
 git+https://github.com/ikreymer/surt.git@py3
 youtube_dl
+git+https://github.com/seomoz/reppy.git