Mirror of https://github.com/internetarchive/brozzler.git
commit 05ec6a68b0
Merge pull request #110 from nlevitt/robots-errors

    treat any error fetching robots.txt as "allow all"
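In practice this means a caller asking about a site whose robots.txt cannot be fetched now gets "permitted", instead of a retry loop that eventually answers "forbidden". A minimal sketch of the new behavior, modeled on the test_robots_connection_failure test added below (assumes brozzler is installed; localhost port 4 is used because nothing listens there):

import brozzler

# robots.txt can't be fetched (connection refused); after this
# commit that failure is treated as "allow all"
url = 'http://localhost:4/'
site = brozzler.Site(None, {'seed': url})
assert brozzler.is_permitted_by_robots(site, url)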
brozzler/robots.py
@@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
         return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 
+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                    http_payload=res.text)
+        else:
+            return res
+
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 
 def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                        http_payload=res.text)
-            else:
-                return res
-
     if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
        req_sesh.verify = False  # ignore cert errors
         if proxy:
             proxie = "http://%s" % proxy
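Besides hoisting the session class to module level, the new version adds a timeout: the `timeout` class attribute plus the `get()` override gives every robots.txt fetch a default timeout while letting tests patch it globally (see test_robots_socket_timeout below). The same pattern in isolation, with a hypothetical class name:

import requests

class TimeoutSession(requests.Session):
    # class attribute, so a test can set TimeoutSession.timeout = 2
    # and every instance picks it up
    timeout = 60

    def get(self, url, *args, **kwargs):
        # fall back to the class-level timeout unless the caller passed
        # one (the committed code passes timeout=self.timeout always)
        kwargs.setdefault('timeout', self.timeout)
        return super().get(url, *args, **kwargs)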
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
             req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
             req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)
 
     return _robots_caches[site.id]
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
     '''
     Checks if `url` is permitted by robots.txt.
 
-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.
 
     Returns:
         bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
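Together with `disallow_forbidden=False` above (by default reppy treats a 401/403 response for robots.txt as "disallow all"), the error contract for callers now looks roughly like this sketch (`site` and `url` as in the earlier example; attribute names per brozzler's exception classes, so treat them as an assumption):

import brozzler

try:
    permitted = brozzler.is_permitted_by_robots(site, url)
except brozzler.ReachedLimit as e:
    # warcprox responded 420 while fetching robots.txt;
    # e.warcprox_meta carries the parsed warcprox-meta header
    ...
except brozzler.ProxyError:
    # the proxy itself failed; not the same thing as "allowed"
    ...
else:
    # every other robots.txt problem falls through to permitted=True
    ...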
@@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True
 
-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
-            return result
-        except Exception as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    e.args[0], brozzler.ReachedLimit):
-                raise e.args[0]
-            elif hasattr(e, 'args') and isinstance(
-                    e.args[0], requests.exceptions.ProxyError):
-                # reppy has wrapped an exception that we want to bubble up
-                raise brozzler.ProxyError(e)
-            else:
-                if tries_left > 0:
-                    logging.warn(
-                            "caught exception fetching robots.txt (%r tries "
-                            "left) for %r: %r", tries_left, url, e)
-                    tries_left -= 1
-                else:
-                    logging.error(
-                            "caught exception fetching robots.txt (0 tries "
-                            "left) for %r: %r", url, e, exc_info=True)
-                    return False
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
+        return result
+    except Exception as e:
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                e.args[0], brozzler.ReachedLimit):
+            raise e.args[0]
+        elif hasattr(e, 'args') and isinstance(
+                e.args[0], requests.exceptions.ProxyError):
+            # reppy has wrapped an exception that we want to bubble up
+            raise brozzler.ProxyError(e)
+        else:
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True
 
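The `e.args[0]` checks above rely on reppy wrapping the original exception in `reppy.exceptions.ServerError` with the cause as its first argument. That unwrap-and-rethrow pattern in isolation, using a stand-in wrapper class rather than reppy itself:

class WrapperError(Exception):
    '''Stand-in for reppy.exceptions.ServerError.'''

def unwrap(e):
    # the wrapped original exception, if any, rides along as args[0]
    if e.args and isinstance(e.args[0], BaseException):
        return e.args[0]
    return e

try:
    raise WrapperError(ValueError('original cause'))
except WrapperError as e:
    assert isinstance(unwrap(e), ValueError)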
tests/test_cluster.py
@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'
 
+    # all sites finished so job should be finished too
+    start = time.time()
+    job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
+    assert job.status == 'FINISHED'
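The added wait on job status is the usual poll-with-deadline idiom for eventually-consistent state (here, the job record in RethinkDB). Generically, as a hypothetical helper:

import time

def wait_until(predicate, timeout=10, interval=0.5):
    '''Poll predicate() until it returns truthy or the deadline passes.'''
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return bool(predicate())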
tests/test_units.py
@@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
@@ -67,6 +68,87 @@ def test_robots(httpd):
     site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                        + 'Content-length: 0\r\n'
+                        + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    url = 'http://localhost:4/' # nobody listens on port 4
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
     test_scope = yaml.load('''
 max_hops: 100
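Each of the new tests stands up a disposable HTTP server on an OS-assigned port and tears it down afterwards. That setup/teardown, extracted into a context manager purely for illustration (a hypothetical helper, not part of the commit):

import contextlib
import http.server
import threading

@contextlib.contextmanager
def ephemeral_httpd(handler_class):
    # port 0 asks the OS for any free port
    httpd = http.server.HTTPServer(('localhost', 0), handler_class)
    thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    thread.start()
    try:
        yield 'http://localhost:%s/' % httpd.server_port
    finally:
        httpd.shutdown()
        httpd.server_close()
        thread.join()

# usage:
#   with ephemeral_httpd(Handler) as url:
#       site = brozzler.Site(None, {'seed': url})
#       assert brozzler.is_permitted_by_robots(site, url)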