monkey-patch reppy to support substring user-agent matching

Noah Levitt 2016-11-16 11:41:34 -08:00
parent 398871d46b
commit 3aead6de93
3 changed files with 79 additions and 2 deletions

brozzler/robots.py

@@ -1,6 +1,12 @@
'''
brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
user-agent matching. We're sticking with 0.3.4 because later versions don't
support supplying a custom requests.Session.
See also https://github.com/seomoz/reppy/issues/37
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
import json
import logging
import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests
__all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
    '''
    Find the user-agent token matching the supplied full user-agent, using
    a case-insensitive substring search.
    '''
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
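
As an illustration (not part of this commit), here is a minimal sketch of how the patched lookup behaves, using types.SimpleNamespace as a hypothetical stand-in for a reppy.parser.Rules object whose agents dict maps lowercased robots.txt user-agent tokens to rule objects:

import types

# hypothetical stand-in for a reppy.parser.Rules instance (illustration only)
fake_rules = types.SimpleNamespace(
        agents={'badbot': 'BADBOT-RULES', '*': 'DEFAULT-RULES'})

# a full browser-style user-agent matches any token that occurs inside it...
assert _reppy_rules_getitem(
        fake_rules, 'Mozilla/5.0 (compatible; BadBot/1.7)') == 'BADBOT-RULES'
# ...and anything without a matching token falls back to the '*' rules
assert _reppy_rules_getitem(fake_rules, 'some/other agent') == 'DEFAULT-RULES'
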
_robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site):
    class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
    tries_left = 10
    while True:
        try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
            return result
        except BaseException as e:
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
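
With this change, is_permitted_by_robots checks robots.txt against the site's configured user_agent when one is set, falling back to the literal token "brozzler" otherwise. A hedged usage sketch (the seed URL and user-agent string here are made up):

import brozzler

site = brozzler.Site(
        seed='http://example.com/',
        user_agent='Mozilla/5.0 (compatible; examplebot/0.1)')
# robots.txt rules for a token like 'examplebot' (or '*') now apply to this site
if not brozzler.is_permitted_by_robots(site, 'http://example.com/private/'):
    print('blocked by robots.txt')
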

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
        name='brozzler',
-        version='1.1b8.dev125',
+        version='1.1b8.dev126',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',

tests/test_units.py (new file, 54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python
'''
test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import pytest
import http.server
import threading
import os
import brozzler
@pytest.fixture(scope='module')
def httpd(request):
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()
    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
    request.addfinalizer(fin)
    return httpd
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
    assert brozzler.is_permitted_by_robots(site, url)
    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
    assert not brozzler.is_permitted_by_robots(site, url)
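
The test depends on a robots.txt served from tests/htdocs, which already exists in the repository and is not part of this diff. For the assertions above to hold, it presumably disallows a 'badbot' token and allows everyone else, something along these lines (a guess, not the actual file):

User-agent: badbot
Disallow: /

User-agent: *
Disallow:
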