From 140a441eb515d77e50d4a4847b4a758394d27ca4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 Jul 2015 17:19:12 -0700 Subject: [PATCH] honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and in the case of ignore_cert_errors, always on, no longer an option); use "reppy" library for robots.txt handling; fix some bugs --- bin/brozzler-worker | 15 ++------- brozzler/browser.py | 30 ++++++++++------- brozzler/site.py | 60 +++++---------------------------- brozzler/worker.py | 82 +++++++++++++++++++++------------------------ requirements.txt | 1 + 5 files changed, 69 insertions(+), 119 deletions(-) diff --git a/bin/brozzler-worker b/bin/brozzler-worker index 95477e5..6eb8f77 100755 --- a/bin/brozzler-worker +++ b/bin/brozzler-worker @@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromi help='executable to use to invoke chrome') arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1', help='max number of chrome instances simultaneously browsing pages') -arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None, - help='configure browser to use specified proxy server') -arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors', - action='store_true', help='configure browser to ignore certificate errors') -arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', - action='store_true', help='enable special features that assume the configured proxy is warcprox') arg_parser.add_argument('-v', '--verbose', dest='log_level', action="store_const", default=logging.INFO, const=logging.DEBUG) arg_parser.add_argument('--version', action='version', @@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGINT, sigint) worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url, - max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, - proxy_server=args.proxy_server, - ignore_cert_errors=args.ignore_cert_errors, - enable_warcprox_features=args.enable_warcprox_features) + max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe) worker.start() @@ -70,9 +61,9 @@ try: time.sleep(0.5) except brozzler.ShutdownRequested as e: worker.shutdown_now() - + for th in threading.enumerate(): if th != threading.current_thread(): th.join() - + logging.info("all done, exiting") diff --git a/brozzler/browser.py b/brozzler/browser.py index 72973c6..dfe7573 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -67,11 +67,11 @@ class Browser: HARD_TIMEOUT_SECONDS = 20 * 60 - def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False): + def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False): self.command_id = itertools.count(1) self.chrome_port = chrome_port self.chrome_exe = chrome_exe - self.proxy_server = proxy_server + self.proxy = proxy self.ignore_cert_errors = ignore_cert_errors self._behavior = None self._websock = None @@ -88,26 +88,30 @@ class Browser: def __exit__(self, *args): self.stop() - def start(self): + def start(self, proxy=None): # these can raise exceptions self._work_dir = tempfile.TemporaryDirectory() self._chrome_instance = Chrome(port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]), - proxy_server=self.proxy_server, - 
ignore_cert_errors=self.ignore_cert_errors) + ignore_cert_errors=self.ignore_cert_errors, + proxy=proxy or self.proxy) self._websocket_url = self._chrome_instance.start() def stop(self): - self._chrome_instance.stop() - self._work_dir.cleanup() + if self._chrome_instance: + self._chrome_instance.stop() + self._chrome_instance = None + if self._work_dir: + self._work_dir.cleanup() + self._work_dir = None def abort_browse_page(self): self._abort_browse_page = True def browse_page(self, url, on_request=None, on_screenshot=None): - """Synchronously loads a page, takes a screenshot, and runs behaviors. + """Synchronously loads a page, takes a screenshot, and runs behaviors. Raises BrowsingException if browsing the page fails in a non-critical way. @@ -165,7 +169,7 @@ class Browser: return True elif not self._waiting_on_outlinks_msg_id: self.logger.info("finished browsing page according to behavior, retrieving outlinks url={}".format(self.url)) - self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate", + self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"}) return False elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS: @@ -250,12 +254,12 @@ class Browser: class Chrome: logger = logging.getLogger(__module__ + "." + __qualname__) - def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False): + def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False): self.port = port self.executable = executable self.user_home_dir = user_home_dir self.user_data_dir = user_data_dir - self.proxy_server = proxy_server + self.proxy = proxy self.ignore_cert_errors = ignore_cert_errors # returns websocket url to chrome window with about:blank loaded @@ -281,8 +285,8 @@ class Chrome: "--disable-web-security"] if self.ignore_cert_errors: chrome_args.append("--ignore-certificate-errors") - if self.proxy_server: - chrome_args.append("--proxy-server={}".format(self.proxy_server)) + if self.proxy: + chrome_args.append("--proxy-server={}".format(self.proxy)) chrome_args.append("about:blank") self.logger.info("running: {}".format(" ".join(chrome_args))) self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True) diff --git a/brozzler/site.py b/brozzler/site.py index 8566eb6..f9678a4 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -4,46 +4,8 @@ import surt import json import logging import urllib.robotparser -import urllib.request - -def robots_url(url): - hurl = surt.handyurl.parse(url) - hurl.path = "/robots.txt" - hurl.query = None - hurl.hash = None - return hurl.geturl() - -class RobotFileParser(urllib.robotparser.RobotFileParser): - """Adds support for fetching robots.txt through a proxy to - urllib.robotparser.RobotFileParser.""" - - logger = logging.getLogger(__module__ + "." 
+ __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {} # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False # ignore cert errors
+        if proxy:
+            proxy_url = "http://{}".format(proxy)
+            req_sesh.proxies = {"http": proxy_url, "https": proxy_url}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
diff --git a/brozzler/worker.py b/brozzler/worker.py
index e913731..540accd 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
-
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
+    def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
         ## XXX (sometimes?) causes chrome debug websocket to go through
         ## proxy. Maybe not needed thanks to hls_prefer_native.
         ## # see https://github.com/rg3/youtube-dl/issues/6087
-        ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
@@ -77,15 +70,15 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
-        request = urllib.request.Request(url, method="PUTMETA",
+    def _putmeta(self, warcprox_address, url, content_type, payload):
+        request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
-
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
-
+        request.set_proxy(warcprox_address, "http")
+
         try:
             with urllib.request.urlopen(request) as response:
                 if response.status != 204:
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
             raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
-                while not self._shutdown_requested.is_set() and time.time() - start < 60:
-                    try:
-                        crawl_url = self._next_url(site)
-                        logging.info("crawling {}".format(crawl_url))
-                        self._try_youtube_dl(site, crawl_url)
-                        crawl_url.outlinks = browser.browse_page(crawl_url.url,
-                                on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
-                        self._completed_url(site, crawl_url)
-                        crawl_url = None
-                    except kombu.simple.Empty:
-                        # if some timeout reached, re-raise?
- pass + browser.start(proxy=site.proxy) + while not self._shutdown_requested.is_set() and time.time() - start < 60: + try: + crawl_url = self._next_url(site) + logging.info("crawling {}".format(crawl_url)) + self._try_youtube_dl(ydl, site, crawl_url) + crawl_url.outlinks = browser.browse_page(crawl_url.url, + on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png)) + self._completed_url(site, crawl_url) + crawl_url = None + except kombu.simple.Empty: + # if some timeout reached, re-raise? + pass # except kombu.simple.Empty: # logging.info("finished {} (queue is empty)".format(site)) except brozzler.browser.BrowsingAborted: logging.info("{} shut down".format(browser)) finally: + browser.stop() self._disclaim_site(site, crawl_url) self._browser_pool.release(browser) @@ -153,7 +148,8 @@ class BrozzlerWorker: site = brozzler.Site(**msg.payload) msg.ack() # XXX ack only after browsing finished? kinda complicated logging.info("browsing site {}".format(site)) - th = threading.Thread(target=lambda: self._brozzle_site(browser, site), + ydl = self._youtube_dl(site) + th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), name="BrowsingThread-{}".format(site.scope_surt)) th.start() except kombu.simple.Empty: @@ -164,7 +160,7 @@ class BrozzlerWorker: latest_state = "browsers-busy" else: q_empty = True - + if q_empty: if latest_state != "no-unclaimed-sites": logging.info("no unclaimed sites to browse") diff --git a/requirements.txt b/requirements.txt index 065a8c6..db6f042 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ argparse PyYAML git+https://github.com/ikreymer/surt.git@py3 youtube_dl +git+https://github.com/seomoz/reppy.git
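
Usage sketches (editorial illustrations, not part of the patch):

Robots.txt checking now goes through the reppy library, using a requests
session that carries the site's proxy and skips certificate verification,
exactly as Site.__init__ does above. A minimal standalone version of that
flow; the proxy address is a made-up example:

    import requests
    import reppy.cache

    # session configured the way Site.__init__ does: certificate errors
    # ignored, all traffic routed through the site's proxy
    session = requests.Session()
    session.verify = False
    proxy_url = "http://localhost:8000"   # hypothetical warcprox address
    session.proxies = {"http": proxy_url, "https": proxy_url}

    # reppy fetches and caches each host's /robots.txt through the session,
    # then evaluates the rules for the given user agent
    robots_cache = reppy.cache.RobotsCache(session=session)
    if robots_cache.allowed("http://example.com/page.html", "brozzler"):
        print("permitted by robots.txt")

A side effect of this design is that robots.txt requests travel through the
same proxy as the rest of the crawl, so a warcprox proxy can capture them too.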
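The request.type="http" trick in _putmeta is worth spelling out: for an https
url sent through a proxy, urllib normally opens a CONNECT tunnel, and a
tunneled request would never expose the custom PUTMETA verb to warcprox.
Forcing the request type to "http" before set_proxy() makes urllib speak
plain proxy-style HTTP instead. A self-contained sketch, with a hypothetical
warcprox address and payload (the PUTMETA verb and the 204 success status are
warcprox conventions, as seen in the patch):

    import urllib.request

    request = urllib.request.Request("https://example.com/",
            method="PUTMETA",
            headers={"Content-Type": "text/plain"},
            data=b"example metadata")
    request.type = "http"                        # keep urllib from CONNECT-tunneling
    request.set_proxy("localhost:8000", "http")  # hypothetical warcprox address
    with urllib.request.urlopen(request) as response:
        print(response.status)                   # warcprox replies 204 on success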
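Finally, because the proxy now travels with the site, Browser.start() accepts
a proxy argument (falling back to the one given at construction time) and
stop() is safe to call on an already-stopped browser, so one pooled browser
can serve sites with different proxies. A sketch of the intended call
pattern; the Site constructor arguments shown are inferred from this patch,
not a documented signature, and the proxy address is hypothetical:

    import brozzler
    import brozzler.browser

    site = brozzler.Site("http://example.com/",
            proxy="localhost:8000",              # hypothetical warcprox address
            enable_warcprox_features=True)

    browser = brozzler.browser.Browser(chrome_port=9222,
            chrome_exe="chromium-browser")
    try:
        browser.start(proxy=site.proxy)   # per-site proxy overrides self.proxy
        outlinks = browser.browse_page("http://example.com/")
    finally:
        browser.stop()                    # a no-op if already stopped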