'''
brozzler/robots.py - robots.txt support

Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
user-agent matching. We're sticking with 0.3.4 because later versions don't
support supplying a custom requests.Session.

See also https://github.com/seomoz/reppy/issues/37

Copyright (C) 2014-2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import json
import logging

import brozzler
import reppy
import reppy.cache
import reppy.exceptions
import reppy.parser
import requests

__all__ = ["is_permitted_by_robots"]

# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng

def _reppy_rules_getitem(self, agent):
    '''
    Find the user-agent token matching the supplied full user-agent, using
    a case-insensitive substring search.
    '''
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
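
# Illustrative note (a sketch, not from the original module): with the patch
# above, a robots.txt group such as "User-agent: brozzler" matches any full
# user-agent string containing "brozzler" as a case-insensitive substring,
# e.g. "Mozilla/5.0 (compatible; brozzler)"; an agent with no matching token
# falls back to the '*' group. The example UA and robots.txt group here are
# hypothetical.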

_robots_caches = {}  # {site_id: reppy.cache.RobotsCache}

def _robots_cache(site, proxy=None):
    class SessionRaiseOn420(requests.Session):
        def get(self, url, *args, **kwargs):
            res = super().get(url, *args, **kwargs)
            # warcprox uses http status 420 plus a warcprox-meta header to
            # signal that a crawl limit has been reached
            if res.status_code == 420 and 'warcprox-meta' in res.headers:
                raise brozzler.ReachedLimit(
                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
                        http_payload=res.text)
            else:
                return res

    if site.id not in _robots_caches:
        req_sesh = SessionRaiseOn420()
        req_sesh.verify = False  # ignore cert errors
        if proxy:
            proxie = "http://%s" % proxy
            req_sesh.proxies = {"http": proxie, "https": proxie}
        if site.extra_headers():
            req_sesh.headers.update(site.extra_headers())
        if site.user_agent:
            req_sesh.headers['User-Agent'] = site.user_agent
        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)

    return _robots_caches[site.id]

def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks whether `url` is permitted by the robots.txt rules for `site`.
    Fetch errors are retried; if robots.txt still cannot be fetched after the
    retries are exhausted, the url is treated as not permitted.
    '''
    if site.ignore_robots:
        return True

    tries_left = 10
    while True:
        try:
            result = _robots_cache(site, proxy).allowed(
                    url, site.user_agent or "brozzler")
            return result
        except BaseException as e:
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
                    e.args[0], brozzler.ReachedLimit):
                # reppy wraps the brozzler.ReachedLimit raised by
                # SessionRaiseOn420; unwrap it so the caller sees the limit
                raise e.args[0]
            else:
                if tries_left > 0:
                    logging.warning(
                            "caught exception fetching robots.txt (%s tries "
                            "left) for %s: %s", tries_left, url, repr(e))
                    tries_left -= 1
                else:
                    logging.error(
                            "caught exception fetching robots.txt (0 tries "
                            "left) for %s: %s", url, repr(e), exc_info=True)
                    return False
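
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It passes a stand-in
# object exposing only the attributes this module actually touches (id,
# ignore_robots, user_agent, extra_headers()); a real crawl would pass a
# brozzler.Site. The seed url is hypothetical, and running this fetches
# robots.txt over the network via reppy.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    class _FakeSite:
        id = 'example-site'
        ignore_robots = False
        user_agent = 'Mozilla/5.0 (compatible; brozzler)'

        def extra_headers(self):
            return {}

    print(is_permitted_by_robots(_FakeSite(), 'https://example.com/'))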