'''
brozzler/robots.py - robots.txt support

Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
user-agent matching. We're sticking with 0.3.4 because later versions don't
support supplying a custom requests.Session.

See also https://github.com/seomoz/reppy/issues/37

Copyright (C) 2014-2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

import json
import logging

import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests

__all__ = ["is_permitted_by_robots"]

# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
    '''
    Find the user-agent token matching the supplied full user-agent, using
    a case-insensitive substring search.
    '''
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
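
# Illustrative sketch (not part of brozzler; `_FakeRules` is a made-up
# stand-in, assuming lowercased user-agent tokens as the keys of `agents`):
# the patched __getitem__ only relies on the parser's `agents` dict, so the
# substring matching can be exercised directly:
#
#     class _FakeRules:
#         agents = {'brozzler': 'brozzler-rules', '*': 'default-rules'}
#
#     _reppy_rules_getitem(_FakeRules(), 'Mozilla/5.0 (compatible; Brozzler/1.1)')
#     # => 'brozzler-rules', because 'brozzler' is a substring of the lowercased UA
#     _reppy_rules_getitem(_FakeRules(), 'SomeOtherBot/2.0')
#     # => 'default-rules', falling back to the wildcard group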

_robots_caches = {}  # {site_id: reppy.cache.RobotsCache}

def _robots_cache(site, proxy=None):
    class SessionRaiseOn420(requests.Session):
        def get(self, url, *args, **kwargs):
            res = super().get(url, *args, **kwargs)
            if res.status_code == 420 and 'warcprox-meta' in res.headers:
                raise brozzler.ReachedLimit(
                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
                        http_payload=res.text)
            else:
                return res

    if site.id not in _robots_caches:
        req_sesh = SessionRaiseOn420()
        req_sesh.verify = False  # ignore cert errors
        if proxy:
            proxie = "http://%s" % proxy
            req_sesh.proxies = {"http": proxie, "https": proxie}
        if site.extra_headers():
            req_sesh.headers.update(site.extra_headers())
        if site.user_agent:
            req_sesh.headers['User-Agent'] = site.user_agent
        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)

    return _robots_caches[site.id]
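
# Usage sketch (assumes a hypothetical site object; brozzler's real Site model
# lives elsewhere): the cache is keyed on site.id, so repeated calls for the
# same site reuse one requests.Session configured with that site's proxy,
# extra headers, and user agent:
#
#     class _FakeSite:
#         id = 'site-00001'
#         user_agent = 'brozzler/example'
#         def extra_headers(self):
#             return {'Warcprox-Meta': '{"warc-prefix": "example"}'}
#
#     cache = _robots_cache(_FakeSite(), proxy='localhost:8000')
#     cache is _robots_cache(_FakeSite(), proxy='localhost:8000')  # => True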

def is_permitted_by_robots(site, url, proxy=None):
    '''
    Checks if `url` is permitted by robots.txt.

    In case of problems fetching robots.txt, different things can happen.
    Reppy (the robots.txt parsing library) handles some exceptions internally
    and applies an appropriate policy. It bubbles up other exceptions. Of
    these, there are two kinds that this function raises for the caller to
    handle, described below. Other types of exceptions are caught, and the
    fetch is retried up to 10 times; after the 10th failure, the function
    gives up and returns `False` (i.e. forbidden by robots).

    Returns:
        bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
            by robots.txt, `False` otherwise

    Raises:
        brozzler.ReachedLimit: if warcprox responded with 420 Reached Limit
        brozzler.ProxyError: if the proxy is down or refusing connections
            (wraps the underlying requests.exceptions.ProxyError)
    '''
    if site.ignore_robots:
        return True

    tries_left = 10
    while True:
        try:
            result = _robots_cache(site, proxy).allowed(
                    url, site.user_agent or "brozzler")
            return result
        except Exception as e:
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
                    e.args[0], brozzler.ReachedLimit):
                raise e.args[0]
            elif hasattr(e, 'args') and isinstance(
                    e.args[0], requests.exceptions.ProxyError):
                # reppy has wrapped an exception that we want to bubble up
                raise brozzler.ProxyError(e)
            else:
                if tries_left > 0:
                    logging.warning(
                            "caught exception fetching robots.txt (%r tries "
                            "left) for %r: %r", tries_left, url, e)
                    tries_left -= 1
                else:
                    logging.error(
                            "caught exception fetching robots.txt (0 tries "
                            "left) for %r: %r", url, e, exc_info=True)
                    return False
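
# Example caller (sketch; `page`, `brozzle_page`, and `handle_limit` are
# illustrative names, not brozzler's actual API):
#
#     try:
#         if is_permitted_by_robots(site, page.url, proxy='localhost:8000'):
#             brozzle_page(page)
#         else:
#             logging.info('robots.txt forbids %s', page.url)
#     except brozzler.ReachedLimit as e:
#         # warcprox enforced a limit (e.g. a per-site data cap); stop this site
#         handle_limit(e.warcprox_meta)
#     except brozzler.ProxyError:
#         # proxy is unreachable or misbehaving; worth retrying later
#         raise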