mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
monkey-patch reppy to support substring user-agent matching
This commit is contained in:
parent
398871d46b
commit
3aead6de93
@ -1,6 +1,12 @@
|
||||
'''
|
||||
brozzler/robots.py - robots.txt support
|
||||
|
||||
Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
|
||||
user-agent matching. We're sticking with 0.3.4 because later versions don't
|
||||
support supplying a custom requests.Session.
|
||||
|
||||
See also https://github.com/seomoz/reppy/issues/37
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -19,11 +25,27 @@ limitations under the License.
|
||||
import json
|
||||
import logging
|
||||
import brozzler
|
||||
import reppy
|
||||
import reppy.cache
|
||||
import reppy.parser
|
||||
import requests
|
||||
|
||||
__all__ = ["is_permitted_by_robots"]
|
||||
|
||||
# monkey-patch reppy to do substring user-agent matching, see top of file
|
||||
# Replace reppy's short_user_agent with an identity function so the full
# user-agent string is handed on unchanged. NOTE(review): presumably the stock
# implementation truncates the string to a short token -- confirm against
# reppy 0.3.4.
reppy.Utility.short_user_agent = lambda strng: strng
|
||||
def _reppy_rules_getitem(self, agent):
    '''
    Find the user-agent token matching the supplied full user-agent, using
    a case-insensitive substring search.

    Every key of `self.agents` that occurs somewhere inside `agent` is a
    candidate. When more than one candidate matches (e.g. both "bot" and
    "goodbot" occur in "im/a/goodbot/yep") the longest, i.e. most specific,
    token wins, so the result does not depend on dict iteration order.

    The wildcard key '*' and an empty key never take part in the substring
    search: '*' is only used as the fallback, and '' would otherwise match
    every user-agent.

    Args:
        agent: the full user-agent string of the crawler

    Returns:
        the entry of `self.agents` for the best matching token; if no token
        matches, `self.agents.get('*')` (None when there is no '*' entry)
    '''
    lc_agent = agent.lower()
    candidates = [
        token for token in self.agents
        # lower() the token as well so the match stays case-insensitive
        # even if the keys were not normalized when they were stored
        if token and token != '*' and token.lower() in lc_agent]
    if candidates:
        return self.agents[max(candidates, key=len)]
    return self.agents.get('*')
|
||||
# Install the substring-matching lookup defined above in place of reppy's own
# Rules.__getitem__.
reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
|
||||
|
||||
_robots_caches = {} # {site_id:reppy.cache.RobotsCache}
|
||||
def _robots_cache(site):
|
||||
class SessionRaiseOn420(requests.Session):
|
||||
@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
|
||||
tries_left = 10
|
||||
while True:
|
||||
try:
|
||||
result = _robots_cache(site).allowed(url, "brozzler")
|
||||
result = _robots_cache(site).allowed(
|
||||
url, site.user_agent or "brozzler")
|
||||
return result
|
||||
except BaseException as e:
|
||||
if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b8.dev125',
|
||||
version='1.1b8.dev126',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
54
tests/test_units.py
Normal file
54
tests/test_units.py
Normal file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
test_units.py - some unit tests for parts of brozzler amenable to that
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import pytest
|
||||
import http.server
|
||||
import threading
|
||||
import os
|
||||
import brozzler
|
||||
|
||||
@pytest.fixture(scope='module')
def httpd(request):
    '''
    Module-scoped fixture: serve the `htdocs` directory next to this file
    over HTTP on localhost, on an OS-assigned port, from a background thread.

    Returns the running `http.server.HTTPServer`; the port is available as
    `httpd.server_port`. At teardown the server is shut down, its thread is
    joined, and the working directory is restored.
    '''
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir; remember
    # where we were so the change does not leak into other test modules
    orig_cwd = os.getcwd()
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))

    httpd = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    def fin():
        try:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
        finally:
            # restore the working directory even if shutdown fails
            os.chdir(orig_cwd)
    request.addfinalizer(fin)

    return httpd
|
||||
|
||||
def test_robots(httpd):
    '''
    Check robots.txt user-agent substring matching against the local test
    server: a site whose user-agent contains "goodbot" (in any letter case)
    must be permitted, one whose user-agent contains "badbot" must not.
    Presumably the served robots.txt has rules for those two tokens; it is
    not visible from this test.
    '''
    url = 'http://localhost:%s/' % httpd.server_port

    permitted_site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
    assert brozzler.is_permitted_by_robots(permitted_site, url)

    blocked_site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
    assert not brozzler.is_permitted_by_robots(blocked_site, url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user