monkey-patch reppy to support substring user-agent matching

Noah Levitt 2016-11-16 11:41:34 -08:00
parent 398871d46b
commit 3aead6de93
3 changed files with 79 additions and 2 deletions

brozzler/robots.py

@@ -1,6 +1,12 @@
'''
brozzler/robots.py - robots.txt support
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
user-agent matching. We're sticking with 0.3.4 because later versions don't
support supplying a custom requests.Session.
See also https://github.com/seomoz/reppy/issues/37
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
import json
import logging
import brozzler
import reppy
import reppy.cache
import reppy.parser
import requests
__all__ = ["is_permitted_by_robots"]
# monkey-patch reppy to do substring user-agent matching, see top of file
reppy.Utility.short_user_agent = lambda strng: strng
def _reppy_rules_getitem(self, agent):
    '''
    Find the user-agent token matching the supplied full user-agent, using
    a case-insensitive substring search.
    '''
    lc_agent = agent.lower()
    for s in self.agents:
        if s in lc_agent:
            return self.agents[s]
    return self.agents.get('*')
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
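
As an illustration (not part of this commit), here is a minimal sketch of how the patched lookup behaves, using types.SimpleNamespace as a hypothetical stand-in for a reppy.parser.Rules object whose agents dict maps lowercased robots.txt user-agent tokens to rule objects:

import types

# hypothetical stand-in for a reppy.parser.Rules instance (illustration only)
fake_rules = types.SimpleNamespace(
        agents={'badbot': 'BADBOT-RULES', '*': 'DEFAULT-RULES'})

# a full browser-style user-agent matches any token that occurs inside it...
assert _reppy_rules_getitem(
        fake_rules, 'Mozilla/5.0 (compatible; BadBot/1.7)') == 'BADBOT-RULES'
# ...and anything without a matching token falls back to the '*' rules
assert _reppy_rules_getitem(fake_rules, 'some/other agent') == 'DEFAULT-RULES'
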
_robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
def _robots_cache(site):
    class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
    tries_left = 10
    while True:
        try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
            return result
        except BaseException as e:
            if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
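
With this change, is_permitted_by_robots checks robots.txt against the site's configured user_agent when one is set, falling back to the literal token "brozzler" otherwise. A hedged usage sketch (the seed URL and user-agent string here are made up):

import brozzler

site = brozzler.Site(
        seed='http://example.com/',
        user_agent='Mozilla/5.0 (compatible; examplebot/0.1)')
# robots.txt rules for a token like 'examplebot' (or '*') now apply to this site
if not brozzler.is_permitted_by_robots(site, 'http://example.com/private/'):
    print('blocked by robots.txt')
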

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
        name='brozzler',
-        version='1.1b8.dev125',
+        version='1.1b8.dev126',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',

tests/test_units.py (new file, 54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env python
'''
test_units.py - some unit tests for parts of brozzler amenable to that
Copyright (C) 2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import pytest
import http.server
import threading
import os
import brozzler
@pytest.fixture(scope='module')
def httpd(request):
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()
    def fin():
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
    request.addfinalizer(fin)
    return httpd
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching.
    '''
    url = 'http://localhost:%s/' % httpd.server_port
    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
    assert brozzler.is_permitted_by_robots(site, url)
    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
    assert not brozzler.is_permitted_by_robots(site, url)
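
The test depends on a robots.txt served from tests/htdocs, which already exists in the repository and is not part of this diff. For the assertions above to hold, it presumably disallows a 'badbot' token and allows everyone else, something along these lines (a guess, not the actual file):

User-agent: badbot
Disallow: /

User-agent: *
Disallow:
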