Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-08-01 19:16:15 -04:00)
Merge pull request #110 from nlevitt/robots-errors

    treat any error fetching robots.txt as "allow all"

Commit 05ec6a68b0 · 3 changed files with 121 additions and 45 deletions
brozzler/robots.py

```diff
@@ -46,11 +46,10 @@ def _reppy_rules_getitem(self, agent):
     return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 
-_robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
-def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
```
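Hoisting the session class to module level and giving it a `timeout` class attribute is what lets the new tests tune the fetch timeout. A minimal standalone sketch of the same pattern — `RaiseOn420Session` and the stand-in `ReachedLimit` are illustrative names, not brozzler's; only `_SessionRaiseOn420` above is real:

```python
import json
import requests

class ReachedLimit(Exception):
    '''Stand-in for brozzler.ReachedLimit, just for this sketch.'''
    def __init__(self, warcprox_meta=None, http_payload=None):
        self.warcprox_meta = warcprox_meta
        self.http_payload = http_payload

class RaiseOn420Session(requests.Session):
    # class attribute, so a test can monkey-patch the timeout globally,
    # the way test_robots_socket_timeout patches _SessionRaiseOn420.timeout
    timeout = 60

    def get(self, url, **kwargs):
        # inject a default timeout without clobbering an explicit per-call one
        kwargs.setdefault('timeout', self.timeout)
        res = super().get(url, **kwargs)
        # 420 with a warcprox-meta header means warcprox enforced a limit;
        # surface that as an exception instead of an ordinary response
        if res.status_code == 420 and 'warcprox-meta' in res.headers:
            raise ReachedLimit(
                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
                    http_payload=res.text)
        return res
```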
```diff
@@ -58,8 +57,10 @@ def _robots_cache(site, proxy=None):
         else:
             return res
 
+_robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
+def _robots_cache(site, proxy=None):
     if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
         req_sesh.verify = False  # ignore cert errors
         if proxy:
             proxie = "http://%s" % proxy
```
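`_robots_cache()` builds one configured session per site and memoizes it. For reference, that idiom reduced to its essentials — `_sessions` and `_session_for` are hypothetical names for this sketch:

```python
import requests

_sessions = {}  # {site_id: requests.Session}, mirrors _robots_caches above

def _session_for(site_id, proxy=None, user_agent=None):
    # build the configured session once per site, then reuse it
    if site_id not in _sessions:
        sesh = requests.Session()
        sesh.verify = False  # the code above likewise ignores cert errors
        if proxy:
            proxie = 'http://%s' % proxy
            sesh.proxies = {'http': proxie, 'https': proxie}
        if user_agent:
            sesh.headers['User-Agent'] = user_agent
        _sessions[site_id] = sesh
    return _sessions[site_id]
```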
```diff
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
             req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
             req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)
 
     return _robots_caches[site.id]
 
```
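Passing `disallow_forbidden=False` asks reppy not to treat a forbidden (401/403) robots.txt response as "disallow all", which is consistent with this commit's allow-on-error policy. A usage sketch based only on the calls visible in this diff (exact reppy behavior may vary by version):

```python
import reppy.cache
import requests

# construction mirrors the diff: a custom session plus disallow_forbidden=False
sesh = requests.Session()
cache = reppy.cache.RobotsCache(session=sesh, disallow_forbidden=False)

# same call shape that is_permitted_by_robots() uses below
if cache.allowed('https://example.com/some/page', 'brozzler'):
    print('permitted by robots.txt')
```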
```diff
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
     '''
     Checks if `url` is permitted by robots.txt.
 
-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.
 
     Returns:
         bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
```
```diff
@@ -95,8 +93,6 @@ def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True
 
-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
```
```diff
@@ -110,14 +106,8 @@ def is_permitted_by_robots(site, url, proxy=None):
             # reppy has wrapped an exception that we want to bubble up
             raise brozzler.ProxyError(e)
         else:
-            if tries_left > 0:
-                logging.warn(
-                        "caught exception fetching robots.txt (%r tries "
-                        "left) for %r: %r", tries_left, url, e)
-                tries_left -= 1
-            else:
-                logging.error(
-                        "caught exception fetching robots.txt (0 tries "
-                        "left) for %r: %r", url, e, exc_info=True)
-                return False
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True
 
```
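Put together, the retry-then-forbid loop is gone and the new error policy is small enough to restate in a few lines. A condensed sketch — `is_permitted` and the stand-in exception classes are hypothetical, and it elides the reppy exception unwrapping the real code does via `e.args[0]`:

```python
import logging

class ReachedLimit(Exception):  # stand-in for brozzler.ReachedLimit
    pass

class ProxyError(Exception):  # stand-in for brozzler.ProxyError
    pass

def is_permitted(cache, url, agent='brozzler'):
    try:
        # cache plays the role of the reppy RobotsCache from _robots_cache()
        return cache.allowed(url, agent)
    except (ReachedLimit, ProxyError):
        raise  # limits and proxy failures still bubble up to the caller
    except Exception as e:
        # anything else that goes wrong fetching robots.txt means "allow all"
        logging.warning(
                'returning true (permitted) after problem fetching '
                'robots.txt for %r: %r', url, e)
        return True
```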
tests/test_cluster.py

```diff
@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
```
```diff
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'
 
     # all sites finished so job should be finished too
+    start = time.time()
+    job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
         job.refresh()
     assert job.status == 'FINISHED'
 
```
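The added wait loop replaces a single `job.refresh()` with the usual poll-until-timeout idiom, so the assertion no longer races the job's status update. A generic version for comparison — `wait_until` is a hypothetical helper, not part of the patch:

```python
import time

def wait_until(predicate, timeout=10, poll=0.5):
    '''Re-evaluate predicate() every `poll` seconds until it returns a
    truthy value or `timeout` seconds elapse; return the final result.'''
    start = time.time()
    while not predicate() and time.time() - start < timeout:
        time.sleep(poll)
    return predicate()
```

With such a helper the test body would reduce to a single `assert wait_until(...)` whose predicate refreshes the job and checks its status.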
tests/test_units.py

```diff
@@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
```
```diff
@@ -67,6 +68,87 @@ def test_robots(httpd):
     site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                        + 'Content-length: 0\r\n'
+                        + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    url = 'http://localhost:4/'  # nobody listens on port 4
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
     test_scope = yaml.load('''
 max_hops: 100
```
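All three server-backed tests repeat the same start/serve/shutdown choreography around an ephemeral `http.server.HTTPServer`. A context-manager refactoring sketch — `ephemeral_httpd` is not in the patch, just a distillation of the pattern:

```python
import http.server
import threading
from contextlib import contextmanager

@contextmanager
def ephemeral_httpd(handler_class):
    '''Serve handler_class on an OS-assigned port in a background thread,
    yielding the base URL; shutdown/server_close/join always run.'''
    httpd = http.server.HTTPServer(('localhost', 0), handler_class)
    thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    thread.start()
    try:
        yield 'http://localhost:%s/' % httpd.server_port
    finally:
        httpd.shutdown()
        httpd.server_close()
        thread.join()
```

Each test body would then shrink to a `with ephemeral_httpd(Handler) as url:` block around its assertion.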