honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and in the case of ignore_cert_errors, always on, no longer an option); use "reppy" library for robots.txt handling; fix some bugs

Noah Levitt 2015-07-16 17:19:12 -07:00
parent e04247c3f7
commit 140a441eb5
5 changed files with 69 additions and 119 deletions
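The removed worker options reappear as per-site settings, so a site can now carry its own proxy configuration. A hedged sketch (brozzler.Site and the attribute names come from the diff below; the values, including the warcprox address, are illustrative):

    import brozzler

    site = brozzler.Site(seed="http://example.com/",
            proxy="localhost:8000",          # honored by chrome, youtube-dl, and robots.txt fetching
            enable_warcprox_features=True,   # screenshots etc. sent to the proxy via PUTMETA
            ignore_robots=False)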

View File

@@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
         help='executable to use to invoke chrome')
 arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
         help='max number of chrome instances simultaneously browsing pages')
-arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
-        help='configure browser to use specified proxy server')
-arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
-        action='store_true', help='configure browser to ignore certificate errors')
-arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
-        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm)
 signal.signal(signal.SIGINT, sigint)

 worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url,
-        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe,
-        proxy_server=args.proxy_server,
-        ignore_cert_errors=args.ignore_cert_errors,
-        enable_warcprox_features=args.enable_warcprox_features)
+        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)

 worker.start()
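With those options gone, a worker invocation just names the executable and browser count; proxy and warcprox settings arrive with each site over AMQP. An illustrative command line using the flags from the argument parser above:

    brozzler-worker -e chromium-browser -n 4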

View File

@@ -67,11 +67,11 @@ class Browser:
     HARD_TIMEOUT_SECONDS = 20 * 60

-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
         self._behavior = None
         self._websock = None
@@ -88,20 +88,24 @@ class Browser:
     def __exit__(self, *args):
         self.stop()

-    def start(self):
+    def start(self, proxy=None):
         # these can raise exceptions
         self._work_dir = tempfile.TemporaryDirectory()
         self._chrome_instance = Chrome(port=self.chrome_port,
                 executable=self.chrome_exe,
                 user_home_dir=self._work_dir.name,
                 user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
-                proxy_server=self.proxy_server,
-                ignore_cert_errors=self.ignore_cert_errors)
+                ignore_cert_errors=self.ignore_cert_errors,
+                proxy=proxy or self.proxy)
         self._websocket_url = self._chrome_instance.start()

     def stop(self):
-        self._chrome_instance.stop()
-        self._work_dir.cleanup()
+        if self._chrome_instance:
+            self._chrome_instance.stop()
+            self._chrome_instance = None
+        if self._work_dir:
+            self._work_dir.cleanup()
+            self._work_dir = None

     def abort_browse_page(self):
         self._abort_browse_page = True
@@ -250,12 +254,12 @@ class Browser:
 class Chrome:
     logger = logging.getLogger(__module__ + "." + __qualname__)

-    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
         self.port = port
         self.executable = executable
         self.user_home_dir = user_home_dir
         self.user_data_dir = user_data_dir
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors

     # returns websocket url to chrome window with about:blank loaded
@@ -281,8 +285,8 @@ class Chrome:
                 "--disable-web-security"]
         if self.ignore_cert_errors:
             chrome_args.append("--ignore-certificate-errors")
-        if self.proxy_server:
-            chrome_args.append("--proxy-server={}".format(self.proxy_server))
+        if self.proxy:
+            chrome_args.append("--proxy-server={}".format(self.proxy))
         chrome_args.append("about:blank")
         self.logger.info("running: {}".format(" ".join(chrome_args)))
         self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
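The browser-side effect of these changes is that the proxy is chosen per browsing session rather than fixed when the pool is created. A minimal sketch of the flow, under the assumption of a warcprox at an illustrative address:

    # Browser.start() accepts a per-session proxy, falling back to self.proxy;
    # Chrome turns it into a --proxy-server command-line argument
    browser = Browser(chrome_port=9222, chrome_exe='chromium-browser')
    browser.start(proxy="localhost:8000")  # chrome launched with --proxy-server=localhost:8000
    try:
        pass  # browse pages here
    finally:
        browser.stop()  # now safe to call repeatedly; instance and work dir are None-checked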

View File

@@ -4,46 +4,8 @@ import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
+import requests
+import reppy.cache

-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotFileParser(urllib.robotparser.RobotFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotFileParser."""
-
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
-
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)

-        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False  # ignore cert errors
+        if proxy:
+            proxie = "http://{}".format(proxy)
+            req_sesh.proxies = {"http":proxie,"https":proxie}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)

     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")

     def is_in_scope(self, url):
         try:
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))

-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
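reppy fetches, caches, and expires robots.txt per host on its own, which is what lets the hand-rolled RobotFileParser subclass and the _robots() cache above disappear. A minimal standalone sketch of the new flow, using the same calls as Site.__init__ (the URLs and proxy address are illustrative):

    import requests
    import reppy.cache

    session = requests.Session()
    session.verify = False  # as in the site code above: don't balk at bad certificates
    # optionally route robots.txt fetches through the site's archiving proxy
    session.proxies = {"http": "http://localhost:8000", "https": "http://localhost:8000"}

    robots = reppy.cache.RobotsCache(session=session)
    # first call fetches and caches http://example.com/robots.txt
    if robots.allowed("http://example.com/some/page", "brozzler"):
        print("permitted")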

View File

@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)

     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()

+    def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
             ## XXX (sometimes?) causes chrome debug websocket to go through
             ## proxy. Maybe not needed thanks to hls_prefer_native.
             ## # see https://github.com/rg3/youtube-dl/issues/6087
-            ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)

     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
@@ -77,14 +70,14 @@
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())

-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
+    def _putmeta(self, warcprox_address, url, content_type, payload):
         request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)

-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
+        request.set_proxy(warcprox_address, "http")

         try:
             with urllib.request.urlopen(request) as response:
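The rewritten comment is worth unpacking: for an https URL, urllib normally issues a CONNECT to the proxy and tunnels TLS straight to the origin, so a nonstandard verb like PUTMETA would never be seen by warcprox. Forcing request.type to "http" makes urllib address the request to the proxy itself. A condensed sketch of the same pattern (address and payload illustrative):

    import urllib.request

    request = urllib.request.Request("https://example.com/page", method="PUTMETA",
            headers={"Content-Type": "image/png"}, data=b"...payload...")
    request.type = "http"                        # defeat CONNECT tunneling for https urls
    request.set_proxy("localhost:8000", "http")  # talk to warcprox directly
    with urllib.request.urlopen(request) as response:
        assert response.getcode() == 204         # warcprox's expected success status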
@@ -93,14 +86,14 @@
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))

-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
except BaseException as e: except BaseException as e:
@@ -110,32 +103,34 @@
             raise

     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)

-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
-                while not self._shutdown_requested.is_set() and time.time() - start < 60:
-                    try:
-                        crawl_url = self._next_url(site)
-                        logging.info("crawling {}".format(crawl_url))
-                        self._try_youtube_dl(site, crawl_url)
-                        crawl_url.outlinks = browser.browse_page(crawl_url.url,
-                                on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
-                        self._completed_url(site, crawl_url)
-                        crawl_url = None
-                    except kombu.simple.Empty:
-                        # if some timeout reached, re-raise?
-                        pass
+            browser.start(proxy=site.proxy)
+            while not self._shutdown_requested.is_set() and time.time() - start < 60:
+                try:
+                    crawl_url = self._next_url(site)
+                    logging.info("crawling {}".format(crawl_url))
+                    self._try_youtube_dl(ydl, site, crawl_url)
+                    crawl_url.outlinks = browser.browse_page(crawl_url.url,
+                            on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
+                    self._completed_url(site, crawl_url)
+                    crawl_url = None
+                except kombu.simple.Empty:
+                    # if some timeout reached, re-raise?
+                    pass
         # except kombu.simple.Empty:
         #     logging.info("finished {} (queue is empty)".format(site))
         except brozzler.browser.BrowsingAborted:
             logging.info("{} shut down".format(browser))
         finally:
+            browser.stop()
             self._disclaim_site(site, crawl_url)
             self._browser_pool.release(browser)
@@ -153,7 +148,8 @@
                 site = brozzler.Site(**msg.payload)
                 msg.ack()  # XXX ack only after browsing finished? kinda complicated
                 logging.info("browsing site {}".format(site))
-                th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
+                ydl = self._youtube_dl(site)
+                th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
                         name="BrowsingThread-{}".format(site.scope_surt))
                 th.start()
             except kombu.simple.Empty:

View File

@@ -4,3 +4,4 @@ argparse
 PyYAML
 git+https://github.com/ikreymer/surt.git@py3
 youtube_dl
+git+https://github.com/seomoz/reppy.git