mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
monkey-patch reppy to support substring user-agent matching
parent 398871d46b
commit 3aead6de93
brozzler/robots.py

@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
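For reference, here is what the patch changes in practice: stock reppy reduces the supplied user-agent to a short token and matches it against robots.txt entries, whereas the patched __getitem__ above matches any robots.txt token that appears, case-insensitively, as a substring of the full user-agent string. A minimal sketch of the behavior, assuming reppy 0.3.4's reppy.parser.Rules(url, status, content, expires) constructor and its allowed(url, agent) method; the robots.txt content and URLs here are hypothetical, not part of this commit:

import time
import reppy.parser
import brozzler.robots  # importing the module applies the monkey-patch

robots_txt = '''\
User-agent: badbot
Disallow: /

User-agent: *
Disallow:
'''

# hypothetical fetch result: status 200 and an expiry timestamp
rules = reppy.parser.Rules(
        'http://example.com/robots.txt', 200, robots_txt,
        time.time() + 3600)

# the 'badbot' token matches as a substring of the full user-agent
assert not rules.allowed(
        'http://example.com/', 'Mozilla/5.0 (compatible; im/a bAdBOt/uh huh)')
# no token matches this one, so lookup falls back to the '*' rules
assert rules.allowed(
        'http://example.com/', 'Mozilla/5.0 (compatible; im/a/GoOdbot/yep)')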
setup.py (2 changed lines)
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev125',
+        version='1.1b8.dev126',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
tests/test_units.py (new file, 54 lines)

@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
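Note that the httpd fixture serves files out of tests/htdocs, so test_robots presumes a robots.txt in that directory whose rules block a "badbot" token while permitting other agents. That file is not part of this diff; a minimal version consistent with the assertions above would be:

User-agent: badbot
Disallow: /

User-agent: *
Disallow: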