From 140a441eb515d77e50d4a4847b4a758394d27ca4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 Jul 2015 17:19:12 -0700 Subject: [PATCH] honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and in the case of ignore_cert_errors, always on, no longer an option); use "reppy" library for robots.txt handling; fix some bugs --- bin/brozzler-worker | 15 ++------- brozzler/browser.py | 30 ++++++++++------- brozzler/site.py | 60 +++++---------------------------- brozzler/worker.py | 82 +++++++++++++++++++++------------------------ requirements.txt | 1 + 5 files changed, 69 insertions(+), 119 deletions(-) diff --git a/bin/brozzler-worker b/bin/brozzler-worker index 95477e5..6eb8f77 100755 --- a/bin/brozzler-worker +++ b/bin/brozzler-worker @@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromi help='executable to use to invoke chrome') arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1', help='max number of chrome instances simultaneously browsing pages') -arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None, - help='configure browser to use specified proxy server') -arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors', - action='store_true', help='configure browser to ignore certificate errors') -arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', - action='store_true', help='enable special features that assume the configured proxy is warcprox') arg_parser.add_argument('-v', '--verbose', dest='log_level', action="store_const", default=logging.INFO, const=logging.DEBUG) arg_parser.add_argument('--version', action='version', @@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGINT, sigint) worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url, - max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, - proxy_server=args.proxy_server, - ignore_cert_errors=args.ignore_cert_errors, - enable_warcprox_features=args.enable_warcprox_features) + max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe) worker.start() @@ -70,9 +61,9 @@ try: time.sleep(0.5) except brozzler.ShutdownRequested as e: worker.shutdown_now() - + for th in threading.enumerate(): if th != threading.current_thread(): th.join() - + logging.info("all done, exiting") diff --git a/brozzler/browser.py b/brozzler/browser.py index 72973c6..dfe7573 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -67,11 +67,11 @@ class Browser: HARD_TIMEOUT_SECONDS = 20 * 60 - def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False): + def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False): self.command_id = itertools.count(1) self.chrome_port = chrome_port self.chrome_exe = chrome_exe - self.proxy_server = proxy_server + self.proxy = proxy self.ignore_cert_errors = ignore_cert_errors self._behavior = None self._websock = None @@ -88,26 +88,30 @@ class Browser: def __exit__(self, *args): self.stop() - def start(self): + def start(self, proxy=None): # these can raise exceptions self._work_dir = tempfile.TemporaryDirectory() self._chrome_instance = Chrome(port=self.chrome_port, executable=self.chrome_exe, user_home_dir=self._work_dir.name, user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]), - proxy_server=self.proxy_server, - 
ignore_cert_errors=self.ignore_cert_errors) + ignore_cert_errors=self.ignore_cert_errors, + proxy=proxy or self.proxy) self._websocket_url = self._chrome_instance.start() def stop(self): - self._chrome_instance.stop() - self._work_dir.cleanup() + if self._chrome_instance: + self._chrome_instance.stop() + self._chrome_instance = None + if self._work_dir: + self._work_dir.cleanup() + self._work_dir = None def abort_browse_page(self): self._abort_browse_page = True def browse_page(self, url, on_request=None, on_screenshot=None): - """Synchronously loads a page, takes a screenshot, and runs behaviors. + """Synchronously loads a page, takes a screenshot, and runs behaviors. Raises BrowsingException if browsing the page fails in a non-critical way. @@ -165,7 +169,7 @@ class Browser: return True elif not self._waiting_on_outlinks_msg_id: self.logger.info("finished browsing page according to behavior, retrieving outlinks url={}".format(self.url)) - self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate", + self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate", params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"}) return False elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS: @@ -250,12 +254,12 @@ class Browser: class Chrome: logger = logging.getLogger(__module__ + "." + __qualname__) - def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False): + def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False): self.port = port self.executable = executable self.user_home_dir = user_home_dir self.user_data_dir = user_data_dir - self.proxy_server = proxy_server + self.proxy = proxy self.ignore_cert_errors = ignore_cert_errors # returns websocket url to chrome window with about:blank loaded @@ -281,8 +285,8 @@ class Chrome: "--disable-web-security"] if self.ignore_cert_errors: chrome_args.append("--ignore-certificate-errors") - if self.proxy_server: - chrome_args.append("--proxy-server={}".format(self.proxy_server)) + if self.proxy: + chrome_args.append("--proxy-server={}".format(self.proxy)) chrome_args.append("about:blank") self.logger.info("running: {}".format(" ".join(chrome_args))) self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True) diff --git a/brozzler/site.py b/brozzler/site.py index 8566eb6..f9678a4 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -4,46 +4,8 @@ import surt import json import logging import urllib.robotparser -import urllib.request - -def robots_url(url): - hurl = surt.handyurl.parse(url) - hurl.path = "/robots.txt" - hurl.query = None - hurl.hash = None - return hurl.geturl() - -class RobotFileParser(urllib.robotparser.RobotFileParser): - """Adds support for fetching robots.txt through a proxy to - urllib.robotparser.RobotFileParser.""" - - logger = logging.getLogger(__module__ + "." 
+ __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {} # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False # ignore cert errors
+        if proxy:
+            proxy_url = "http://{}".format(proxy)
+            req_sesh.proxies = {"http": proxy_url, "https": proxy_url}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
diff --git a/brozzler/worker.py b/brozzler/worker.py
index e913731..540accd 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
-
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
+    def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
         ## XXX (sometimes?) causes chrome debug websocket to go through
         ## proxy. Maybe not needed thanks to hls_prefer_native.
         ## # see https://github.com/rg3/youtube-dl/issues/6087
-        ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
@@ -77,15 +70,15 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
-        request = urllib.request.Request(url, method="PUTMETA",
+    def _putmeta(self, warcprox_address, url, content_type, payload):
+        request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
-
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
-
+        request.set_proxy(warcprox_address, "http")
+
         try:
             with urllib.request.urlopen(request) as response:
                 if response.status != 204:
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
             raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
-                while not self._shutdown_requested.is_set() and time.time() - start < 60:
-                    try:
-                        crawl_url = self._next_url(site)
-                        logging.info("crawling {}".format(crawl_url))
-                        self._try_youtube_dl(site, crawl_url)
-                        crawl_url.outlinks = browser.browse_page(crawl_url.url,
-                                on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
-                        self._completed_url(site, crawl_url)
-                        crawl_url = None
-                    except kombu.simple.Empty:
-                        # if some timeout reached, re-raise?
- pass + browser.start(proxy=site.proxy) + while not self._shutdown_requested.is_set() and time.time() - start < 60: + try: + crawl_url = self._next_url(site) + logging.info("crawling {}".format(crawl_url)) + self._try_youtube_dl(ydl, site, crawl_url) + crawl_url.outlinks = browser.browse_page(crawl_url.url, + on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png)) + self._completed_url(site, crawl_url) + crawl_url = None + except kombu.simple.Empty: + # if some timeout reached, re-raise? + pass # except kombu.simple.Empty: # logging.info("finished {} (queue is empty)".format(site)) except brozzler.browser.BrowsingAborted: logging.info("{} shut down".format(browser)) finally: + browser.stop() self._disclaim_site(site, crawl_url) self._browser_pool.release(browser) @@ -153,7 +148,8 @@ class BrozzlerWorker: site = brozzler.Site(**msg.payload) msg.ack() # XXX ack only after browsing finished? kinda complicated logging.info("browsing site {}".format(site)) - th = threading.Thread(target=lambda: self._brozzle_site(browser, site), + ydl = self._youtube_dl(site) + th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), name="BrowsingThread-{}".format(site.scope_surt)) th.start() except kombu.simple.Empty: @@ -164,7 +160,7 @@ class BrozzlerWorker: latest_state = "browsers-busy" else: q_empty = True - + if q_empty: if latest_state != "no-unclaimed-sites": logging.info("no unclaimed sites to browse") diff --git a/requirements.txt b/requirements.txt index 065a8c6..db6f042 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ argparse PyYAML git+https://github.com/ikreymer/surt.git@py3 youtube_dl +git+https://github.com/seomoz/reppy.git
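
Usage sketches (editorial illustrations, not part of the patch):

Robots.txt checking now goes through the reppy library, using a requests
session that carries the site's proxy and skips certificate verification,
exactly as Site.__init__ does above. A minimal standalone version of that
flow; the proxy address is a made-up example:

    import requests
    import reppy.cache

    # session configured the way Site.__init__ does: certificate errors
    # ignored, all traffic routed through the site's proxy
    session = requests.Session()
    session.verify = False
    proxy_url = "http://localhost:8000"   # hypothetical warcprox address
    session.proxies = {"http": proxy_url, "https": proxy_url}

    # reppy fetches and caches each host's /robots.txt through the session,
    # then evaluates the rules for the given user agent
    robots_cache = reppy.cache.RobotsCache(session=session)
    if robots_cache.allowed("http://example.com/page.html", "brozzler"):
        print("permitted by robots.txt")

A side effect of this design is that robots.txt requests travel through the
same proxy as the rest of the crawl, so a warcprox proxy can capture them too.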
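The request.type="http" trick in _putmeta is worth spelling out: for an https
url sent through a proxy, urllib normally opens a CONNECT tunnel, and a
tunneled request would never expose the custom PUTMETA verb to warcprox.
Forcing the request type to "http" before set_proxy() makes urllib speak
plain proxy-style HTTP instead. A self-contained sketch, with a hypothetical
warcprox address and payload (the PUTMETA verb and the 204 success status are
warcprox conventions, as seen in the patch):

    import urllib.request

    request = urllib.request.Request("https://example.com/",
            method="PUTMETA",
            headers={"Content-Type": "text/plain"},
            data=b"example metadata")
    request.type = "http"                        # keep urllib from CONNECT-tunneling
    request.set_proxy("localhost:8000", "http")  # hypothetical warcprox address
    with urllib.request.urlopen(request) as response:
        print(response.status)                   # warcprox replies 204 on success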
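Finally, because the proxy now travels with the site, Browser.start() accepts
a proxy argument (falling back to the one given at construction time) and
stop() is safe to call on an already-stopped browser, so one pooled browser
can serve sites with different proxies. A sketch of the intended call
pattern; the Site constructor arguments shown are inferred from this patch,
not a documented signature, and the proxy address is hypothetical:

    import brozzler
    import brozzler.browser

    site = brozzler.Site("http://example.com/",
            proxy="localhost:8000",              # hypothetical warcprox address
            enable_warcprox_features=True)

    browser = brozzler.browser.Browser(chrome_port=9222,
            chrome_exe="chromium-browser")
    try:
        browser.start(proxy=site.proxy)   # per-site proxy overrides self.proxy
        outlinks = browser.browse_page("http://example.com/")
    finally:
        browser.stop()                    # a no-op if already stopped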