Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-02-24 08:39:59 -05:00)
honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and, in the case of ignore_cert_errors, now always on rather than an option); use the "reppy" library for robots.txt handling; fix some bugs
parent e04247c3f7
commit 140a441eb5
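The gist of the change, as a rough usage sketch (not code from this commit): proxy, certificate, and warcprox settings move off the worker and onto each site. The `BrozzlerWorker` call below mirrors the updated worker script in the first hunk; the `Site` keyword arguments are an assumption inferred from the attributes the rest of the diff reads (`site.proxy`, `site.enable_warcprox_features`), not a documented constructor signature.

```python
import brozzler
import brozzler.worker

# the worker no longer takes --proxy-server / --ignore-certificate-errors /
# --enable-warcprox-features; it only needs AMQP, browser count, and chrome path
worker = brozzler.worker.BrozzlerWorker(
        amqp_url="amqp://guest:guest@localhost:5672/%2f",
        max_browsers=1, chrome_exe="chromium-browser")
worker.start()

# hypothetical per-site settings; in practice sites arrive as AMQP payloads
# and are instantiated with brozzler.Site(**msg.payload)
site = brozzler.Site(seed="http://example.com/",
        proxy="localhost:8000", enable_warcprox_features=True)
```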
@@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
         help='executable to use to invoke chrome')
 arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
         help='max number of chrome instances simultaneously browsing pages')
-arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
-        help='configure browser to use specified proxy server')
-arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
-        action='store_true', help='configure browser to ignore certificate errors')
-arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
-        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm)
 signal.signal(signal.SIGINT, sigint)
 
 worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url,
-        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe,
-        proxy_server=args.proxy_server,
-        ignore_cert_errors=args.ignore_cert_errors,
-        enable_warcprox_features=args.enable_warcprox_features)
+        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
 
 worker.start()
 
@@ -67,11 +67,11 @@ class Browser:
 
     HARD_TIMEOUT_SECONDS = 20 * 60
 
-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
         self._behavior = None
         self._websock = None
@@ -88,20 +88,24 @@ class Browser:
     def __exit__(self, *args):
         self.stop()
 
-    def start(self):
+    def start(self, proxy=None):
         # these can raise exceptions
         self._work_dir = tempfile.TemporaryDirectory()
         self._chrome_instance = Chrome(port=self.chrome_port,
                 executable=self.chrome_exe,
                 user_home_dir=self._work_dir.name,
                 user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
-                proxy_server=self.proxy_server,
-                ignore_cert_errors=self.ignore_cert_errors)
+                ignore_cert_errors=self.ignore_cert_errors,
+                proxy=proxy or self.proxy)
         self._websocket_url = self._chrome_instance.start()
 
     def stop(self):
-        self._chrome_instance.stop()
-        self._work_dir.cleanup()
+        if self._chrome_instance:
+            self._chrome_instance.stop()
+            self._chrome_instance = None
+        if self._work_dir:
+            self._work_dir.cleanup()
+            self._work_dir = None
 
     def abort_browse_page(self):
         self._abort_browse_page = True
@@ -250,12 +254,12 @@ class Browser:
 class Chrome:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
         self.port = port
         self.executable = executable
         self.user_home_dir = user_home_dir
         self.user_data_dir = user_data_dir
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
 
     # returns websocket url to chrome window with about:blank loaded
@@ -281,8 +285,8 @@ class Chrome:
                 "--disable-web-security"]
         if self.ignore_cert_errors:
             chrome_args.append("--ignore-certificate-errors")
-        if self.proxy_server:
-            chrome_args.append("--proxy-server={}".format(self.proxy_server))
+        if self.proxy:
+            chrome_args.append("--proxy-server={}".format(self.proxy))
         chrome_args.append("about:blank")
         self.logger.info("running: {}".format(" ".join(chrome_args)))
         self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
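A minimal sketch of the reworked Browser lifecycle above (my own usage example, not code from the repo): `start()` now takes an optional per-call proxy that overrides the one given to `__init__`, and `stop()` nulls out the chrome instance and work dir so a second call is a harmless no-op.

```python
import brozzler.browser

browser = brozzler.browser.Browser(chrome_port=9222,
        chrome_exe="chromium-browser", ignore_cert_errors=True)
try:
    # per-site proxy passed at start time wins over self.proxy
    browser.start(proxy="localhost:8000")
    outlinks = browser.browse_page("http://example.com/")
finally:
    # mirrors the worker's finally: block; guarded so repeated calls are safe
    browser.stop()
```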
@@ -4,46 +4,8 @@ import surt
 import json
 import logging
 import urllib.robotparser
-import urllib.request
-
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotFileParser(urllib.robotparser.RobotFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotFileParser."""
-
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False  # ignore cert errors
+        if proxy:
+            proxie = "http://{}".format(proxy)
+            req_sesh.proxies = {"http":proxie,"https":proxie}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
        return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
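For reference, a standalone sketch of what the Site class now delegates to reppy (the helper name is mine; the calls mirror the `__init__` and `is_permitted_by_robots` hunks above, using the old seomoz reppy API that requirements.txt pins below): one shared requests session with cert verification off and an optional proxy.

```python
import requests
import reppy.cache

def make_robots_cache(proxy=None):
    """Hypothetical helper mirroring Site.__init__ above."""
    session = requests.Session()
    session.verify = False  # ignore cert errors (the proxy may MITM https)
    if proxy:
        proxy_url = "http://{}".format(proxy)
        session.proxies = {"http": proxy_url, "https": proxy_url}
    return reppy.cache.RobotsCache(session=session)

robots = make_robots_cache(proxy="localhost:8000")
# same user-agent token the worker checks with
print(robots.allowed("http://example.com/some/page", "brozzler"))
```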
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
-
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
+    def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
             ## XXX (sometimes?) causes chrome debug websocket to go through
             ## proxy. Maybe not needed thanks to hls_prefer_native.
             ## # see https://github.com/rg3/youtube-dl/issues/6087
-            ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+            ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
@@ -77,14 +70,14 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
+    def _putmeta(self, warcprox_address, url, content_type, payload):
         request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
 
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
+        request.set_proxy(warcprox_address, "http")
 
         try:
             with urllib.request.urlopen(request) as response:
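The PUTMETA mechanics above, pulled out as a standalone sketch (my own framing; same calls as the method): warcprox accepts a custom PUTMETA method on the proxied URL and is expected to answer 204.

```python
import urllib.request

def putmeta(warcprox_address, url, content_type, payload):
    """Hypothetical standalone version of BrozzlerWorker._putmeta above."""
    request = urllib.request.Request(url, method="PUTMETA",
            headers={"Content-Type": content_type}, data=payload)
    # same hack as above: force type to "http" so urllib proxies https urls
    # instead of trying to CONNECT-tunnel them
    request.type = "http"
    request.set_proxy(warcprox_address, "http")
    with urllib.request.urlopen(request) as response:
        return response.getcode()  # warcprox should return 204
```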
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
             raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
+            browser.start(proxy=site.proxy)
             while not self._shutdown_requested.is_set() and time.time() - start < 60:
                 try:
                     crawl_url = self._next_url(site)
                     logging.info("crawling {}".format(crawl_url))
-                    self._try_youtube_dl(site, crawl_url)
+                    self._try_youtube_dl(ydl, site, crawl_url)
                     crawl_url.outlinks = browser.browse_page(crawl_url.url,
                             on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
                     self._completed_url(site, crawl_url)
                     crawl_url = None
                 except kombu.simple.Empty:
                     # if some timeout reached, re-raise?
                     pass
             # except kombu.simple.Empty:
             #     logging.info("finished {} (queue is empty)".format(site))
         except brozzler.browser.BrowsingAborted:
             logging.info("{} shut down".format(browser))
         finally:
+            browser.stop()
             self._disclaim_site(site, crawl_url)
             self._browser_pool.release(browser)
 
@@ -153,7 +148,8 @@ class BrozzlerWorker:
                 site = brozzler.Site(**msg.payload)
                 msg.ack() # XXX ack only after browsing finished? kinda complicated
                 logging.info("browsing site {}".format(site))
-                th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
+                ydl = self._youtube_dl(site)
+                th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
                         name="BrowsingThread-{}".format(site.scope_surt))
                 th.start()
             except kombu.simple.Empty:
@@ -4,3 +4,4 @@ argparse
 PyYAML
 git+https://github.com/ikreymer/surt.git@py3
 youtube_dl
+git+https://github.com/seomoz/reppy.git