mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
honor site proxy setting; remove brozzler-worker options that are now configured at the site level (and, in the case of ignore_cert_errors, now always on, no longer an option); use "reppy" library for robots.txt handling; fix some bugs
This commit is contained in:
parent e04247c3f7
commit 140a441eb5
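
To illustrate the configuration surface described in the commit message: under the new scheme a site might be queued with a payload along these lines. This is a sketch; the field names proxy, enable_warcprox_features, and ignore_robots appear in the diff below, while the overall payload shape is an assumption.

# hypothetical site message payload: the proxy and warcprox features are now
# configured per site rather than per worker
site_payload = {
    "seed": "http://example.com/",
    "proxy": "localhost:8000",           # e.g. a warcprox instance
    "enable_warcprox_features": True,    # screenshots + youtube-dl info via PUTMETA
    "ignore_robots": False,
}
# the worker builds the site from the queued message:
#   site = brozzler.Site(**msg.payload)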
@@ -22,12 +22,6 @@ arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromi
         help='executable to use to invoke chrome')
 arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
         help='max number of chrome instances simultaneously browsing pages')
-arg_parser.add_argument('--proxy-server', dest='proxy_server', default=None,
-        help='configure browser to use specified proxy server')
-arg_parser.add_argument('--ignore-certificate-errors', dest='ignore_cert_errors',
-        action='store_true', help='configure browser to ignore certificate errors')
-arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
-        action='store_true', help='enable special features that assume the configured proxy is warcprox')
 arg_parser.add_argument('-v', '--verbose', dest='log_level',
         action="store_const", default=logging.INFO, const=logging.DEBUG)
 arg_parser.add_argument('--version', action='version',
@@ -58,10 +52,7 @@ signal.signal(signal.SIGTERM, sigterm)
 signal.signal(signal.SIGINT, sigint)
 
 worker = brozzler.worker.BrozzlerWorker(amqp_url=args.amqp_url,
-        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe,
-        proxy_server=args.proxy_server,
-        ignore_cert_errors=args.ignore_cert_errors,
-        enable_warcprox_features=args.enable_warcprox_features)
+        max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe)
 
 worker.start()
 
@@ -70,9 +61,9 @@ try:
         time.sleep(0.5)
 except brozzler.ShutdownRequested as e:
     worker.shutdown_now()
 
 for th in threading.enumerate():
     if th != threading.current_thread():
         th.join()
 
 logging.info("all done, exiting")
@@ -67,11 +67,11 @@ class Browser:
 
     HARD_TIMEOUT_SECONDS = 20 * 60
 
-    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, chrome_port=9222, chrome_exe='chromium-browser', proxy=None, ignore_cert_errors=False):
         self.command_id = itertools.count(1)
         self.chrome_port = chrome_port
         self.chrome_exe = chrome_exe
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
         self._behavior = None
         self._websock = None
@@ -88,26 +88,30 @@ class Browser:
     def __exit__(self, *args):
         self.stop()
 
-    def start(self):
+    def start(self, proxy=None):
         # these can raise exceptions
         self._work_dir = tempfile.TemporaryDirectory()
         self._chrome_instance = Chrome(port=self.chrome_port,
                 executable=self.chrome_exe,
                 user_home_dir=self._work_dir.name,
                 user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
-                proxy_server=self.proxy_server,
-                ignore_cert_errors=self.ignore_cert_errors)
+                ignore_cert_errors=self.ignore_cert_errors,
+                proxy=proxy or self.proxy)
         self._websocket_url = self._chrome_instance.start()
 
     def stop(self):
-        self._chrome_instance.stop()
-        self._work_dir.cleanup()
+        if self._chrome_instance:
+            self._chrome_instance.stop()
+            self._chrome_instance = None
+        if self._work_dir:
+            self._work_dir.cleanup()
+            self._work_dir = None
 
     def abort_browse_page(self):
         self._abort_browse_page = True
 
     def browse_page(self, url, on_request=None, on_screenshot=None):
-        """Synchronously loads a page, takes a screenshot, and runs behaviors.
+        """Synchronously loads a page, takes a screenshot, and runs behaviors.
 
         Raises BrowsingException if browsing the page fails in a non-critical
         way.
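
Taken together, the Browser changes mean the proxy can be fixed at construction time or supplied per call to start(), and stop() is now safe to call even if startup failed partway. A minimal usage sketch, mirroring what the worker code later in this diff does; the proxy address is a placeholder:

import brozzler.browser

browser = brozzler.browser.Browser(chrome_port=9222, chrome_exe="chromium-browser")
try:
    browser.start(proxy="localhost:8000")   # per-call override of self.proxy
    outlinks = browser.browse_page("http://example.com/")
finally:
    browser.stop()   # guarded with None checks, so safe after a failed start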
@@ -165,7 +169,7 @@ class Browser:
             return True
         elif not self._waiting_on_outlinks_msg_id:
             self.logger.info("finished browsing page according to behavior, retrieving outlinks url={}".format(self.url))
-            self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate",
+            self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate",
                     params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})
             return False
         elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
@@ -250,12 +254,12 @@ class Browser:
 class Chrome:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy_server=None, ignore_cert_errors=False):
+    def __init__(self, port, executable, user_home_dir, user_data_dir, proxy=None, ignore_cert_errors=False):
         self.port = port
         self.executable = executable
         self.user_home_dir = user_home_dir
         self.user_data_dir = user_data_dir
-        self.proxy_server = proxy_server
+        self.proxy = proxy
         self.ignore_cert_errors = ignore_cert_errors
 
     # returns websocket url to chrome window with about:blank loaded
@@ -281,8 +285,8 @@ class Chrome:
                 "--disable-web-security"]
         if self.ignore_cert_errors:
             chrome_args.append("--ignore-certificate-errors")
-        if self.proxy_server:
-            chrome_args.append("--proxy-server={}".format(self.proxy_server))
+        if self.proxy:
+            chrome_args.append("--proxy-server={}".format(self.proxy))
         chrome_args.append("about:blank")
         self.logger.info("running: {}".format(" ".join(chrome_args)))
         self.chrome_process = subprocess.Popen(chrome_args, env=new_env, start_new_session=True)
@@ -4,46 +4,8 @@ import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
-
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotFileParser(urllib.robotparser.RobotFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotFileParser."""
-
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, url="", proxy=None):
-        super(RobotFileParser, self).__init__(url)
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL, perhaps through the configured proxy, and
-        feeds it to the parser."""
-        try:
-            request = urllib.request.Request(self.url)
-            if self.proxy:
-                request.set_proxy(self.proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
-                self.allow_all = True
-        except BaseException as err:
-            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
-            self.disallow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+import requests
+import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -62,10 +24,15 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+        req_sesh = requests.Session()
+        req_sesh.verify = False  # ignore cert errors
+        if proxy:
+            proxie = "http://{}".format(proxy)
+            req_sesh.proxies = {"http":proxie,"https":proxie}
+        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
 
     def is_permitted_by_robots(self, url):
-        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+        return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
 
     def is_in_scope(self, url):
         try:
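
For reference, a self-contained sketch of the new robots.txt path, using the reppy API exactly as it appears above (RobotsCache(session=...) and allowed(url, agent)); the proxy address is a placeholder:

import requests
import reppy.cache

session = requests.Session()
session.verify = False   # brozzler deliberately ignores certificate errors
session.proxies = {"http": "http://localhost:8000",
        "https": "http://localhost:8000"}   # e.g. warcprox

robots_cache = reppy.cache.RobotsCache(session=session)
# reppy fetches and caches http://example.com/robots.txt on first use
print(robots_cache.allowed("http://example.com/some/page", "brozzler"))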
@@ -85,15 +52,6 @@ class Site:
     def to_json(self):
         return json.dumps(self.to_dict(), separators=(',', ':'))
 
-    def _robots(self, robots_url):
-        if not robots_url in self._robots_cache:
-            robots_txt = RobotFileParser(robots_url, self.proxy)
-            self.logger.info("fetching {}".format(robots_url))
-            robots_txt.read()
-            self._robots_cache[robots_url] = robots_txt
-
-        return self._robots_cache[robots_url]
-
 class CrawlUrl:
     def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
         self.id = id
@@ -16,21 +16,14 @@ class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
     def __init__(self, amqp_url="amqp://guest:guest@localhost:5672/%2f",
-            max_browsers=1, chrome_exe="chromium-browser",
-            proxy_server=None, ignore_cert_errors=False,
-            enable_warcprox_features=False):
+            max_browsers=1, chrome_exe="chromium-browser"):
         self._amqp_url = amqp_url
         self._max_browsers = max_browsers
-        self._proxy_server = proxy_server
-        self._enable_warcprox_features = enable_warcprox_features
-
         self._browser_pool = brozzler.browser.BrowserPool(max_browsers,
-                chrome_exe=chrome_exe, proxy_server=proxy_server,
-                ignore_cert_errors=ignore_cert_errors)
-
+                chrome_exe=chrome_exe, ignore_cert_errors=True)
         self._shutdown_requested = threading.Event()
 
     def _youtube_dl(self, site):
         ydl_opts = {
             "outtmpl": "/dev/null",
             "verbose": False,
@@ -42,13 +35,13 @@ class BrozzlerWorker:
             "nopart": True,
             "no_color": True,
         }
-        if self._proxy_server:
-            ydl_opts["proxy"] = "http://{}".format(self._proxy_server)
+        if site.proxy:
+            ydl_opts["proxy"] = "http://{}".format(site.proxy)
         ## XXX (sometimes?) causes chrome debug websocket to go through
         ## proxy. Maybe not needed thanks to hls_prefer_native.
         ## # see https://github.com/rg3/youtube-dl/issues/6087
-        ## os.environ["http_proxy"] = "http://{}".format(self._proxy_server)
-        self._ydl = youtube_dl.YoutubeDL(ydl_opts)
+        ## os.environ["http_proxy"] = "http://{}".format(site.proxy)
+        return youtube_dl.YoutubeDL(ydl_opts)
 
     def _next_url(self, site):
         """Raises kombu.simple.Empty if queue is empty"""
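
Because _youtube_dl() now returns a fresh instance instead of storing self._ydl, each site gets a YoutubeDL configured with its own proxy. A reduced sketch with ydl_opts trimmed to the proxy-relevant parts; the proxy address is a placeholder:

import youtube_dl

ydl_opts = {
    "outtmpl": "/dev/null",
    "proxy": "http://localhost:8000",   # taken from site.proxy when set
}
ydl = youtube_dl.YoutubeDL(ydl_opts)
# info = ydl.extract_info("http://example.com/some/video")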
@@ -77,15 +70,15 @@ class BrozzlerWorker:
             logging.info("putting unfinished url {} on queue {}".format(crawl_url, q.queue.name))
             q.put(crawl_url.to_dict())
 
-    def _putmeta(self, url, content_type, payload):
-        assert self._enable_warcprox_features
+    def _putmeta(self, warcprox_address, url, content_type, payload):
         request = urllib.request.Request(url, method="PUTMETA",
                 headers={"Content-Type":content_type}, data=payload)
 
-        # XXX evil hack to keep urllib from trying to tunnel https urls here
+        # XXX setting request.type="http" is a hack to stop urllib from trying
+        # to tunnel if url is https
         request.type = "http"
-        request.set_proxy("localhost:8000", "http")
+        request.set_proxy(warcprox_address, "http")
 
         try:
             with urllib.request.urlopen(request) as response:
                 if response.status != 204:
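
The request.type = "http" line is worth a note: for an https URL, urllib would normally open a CONNECT tunnel through the proxy, so warcprox would never see the PUTMETA verb itself; forcing the type to "http" makes urllib send a plain absolute-URI request that warcprox can handle directly. A standalone sketch of the same pattern, with the warcprox address and payload as placeholders:

import urllib.request

request = urllib.request.Request("https://example.com/page", method="PUTMETA",
        headers={"Content-Type": "image/png"}, data=b"...png bytes...")
request.type = "http"   # keep urllib from CONNECT-tunneling the https url
request.set_proxy("localhost:8000", "http")   # e.g. warcprox
with urllib.request.urlopen(request) as response:
    assert response.status == 204   # warcprox acknowledges PUTMETA with 204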
@@ -93,14 +86,14 @@ class BrozzlerWorker:
         except urllib.error.HTTPError as e:
             logging.warn("""got "{} {}" response on warcprox PUTMETA request (expected 204)""".format(e.getcode(), e.info()))
 
-    def _try_youtube_dl(self, site, crawl_url):
+    def _try_youtube_dl(self, ydl, site, crawl_url):
         try:
             logging.info("trying youtube-dl on {}".format(crawl_url))
-            info = self._ydl.extract_info(crawl_url.url)
-            if self._proxy_server and self._enable_warcprox_features:
+            info = ydl.extract_info(crawl_url.url)
+            if site.proxy and site.enable_warcprox_features:
                 info_json = json.dumps(info, sort_keys=True, indent=4)
                 logging.info("sending PUTMETA request to warcprox with youtube-dl json for {}".format(crawl_url))
-                self._putmeta(url=crawl_url.url,
+                self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
                         content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                         payload=info_json.encode("utf-8"))
         except BaseException as e:
@@ -110,32 +103,34 @@ class BrozzlerWorker:
                 raise
 
     def _on_screenshot(self, site, crawl_url, screenshot_png):
-        if self._proxy_server and self._enable_warcprox_features:
+        if site.proxy and site.enable_warcprox_features:
             logging.info("sending PUTMETA request to warcprox with screenshot for {}".format(crawl_url))
-            self._putmeta(url=crawl_url.url, content_type="image/png", payload=screenshot_png)
+            self._putmeta(warcprox_address=site.proxy, url=crawl_url.url,
+                    content_type="image/png", payload=screenshot_png)
 
-    def _brozzle_site(self, browser, site):
+    def _brozzle_site(self, browser, ydl, site):
         start = time.time()
         crawl_url = None
         try:
-            with browser:
-                while not self._shutdown_requested.is_set() and time.time() - start < 60:
-                    try:
-                        crawl_url = self._next_url(site)
-                        logging.info("crawling {}".format(crawl_url))
-                        self._try_youtube_dl(site, crawl_url)
-                        crawl_url.outlinks = browser.browse_page(crawl_url.url,
-                                on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
-                        self._completed_url(site, crawl_url)
-                        crawl_url = None
-                    except kombu.simple.Empty:
-                        # if some timeout reached, re-raise?
-                        pass
+            browser.start(proxy=site.proxy)
+            while not self._shutdown_requested.is_set() and time.time() - start < 60:
+                try:
+                    crawl_url = self._next_url(site)
+                    logging.info("crawling {}".format(crawl_url))
+                    self._try_youtube_dl(ydl, site, crawl_url)
+                    crawl_url.outlinks = browser.browse_page(crawl_url.url,
+                            on_screenshot=lambda screenshot_png: self._on_screenshot(site, crawl_url, screenshot_png))
+                    self._completed_url(site, crawl_url)
+                    crawl_url = None
+                except kombu.simple.Empty:
+                    # if some timeout reached, re-raise?
+                    pass
         # except kombu.simple.Empty:
         #     logging.info("finished {} (queue is empty)".format(site))
         except brozzler.browser.BrowsingAborted:
             logging.info("{} shut down".format(browser))
         finally:
+            browser.stop()
             self._disclaim_site(site, crawl_url)
             self._browser_pool.release(browser)
 
@@ -153,7 +148,8 @@ class BrozzlerWorker:
                 site = brozzler.Site(**msg.payload)
                 msg.ack()  # XXX ack only after browsing finished? kinda complicated
                 logging.info("browsing site {}".format(site))
-                th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
+                ydl = self._youtube_dl(site)
+                th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site),
                         name="BrowsingThread-{}".format(site.scope_surt))
                 th.start()
             except kombu.simple.Empty:
@@ -164,7 +160,7 @@ class BrozzlerWorker:
                 latest_state = "browsers-busy"
             else:
                 q_empty = True
-
+
         if q_empty:
             if latest_state != "no-unclaimed-sites":
                 logging.info("no unclaimed sites to browse")
@@ -4,3 +4,4 @@ argparse
 PyYAML
 git+https://github.com/ikreymer/surt.git@py3
 youtube_dl
+git+https://github.com/seomoz/reppy.git