From 3aead6de93f2b381fad70e61b824d392efe0aec8 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 16 Nov 2016 11:41:34 -0800
Subject: [PATCH] monkey-patch reppy to support substring user-agent matching

---
 brozzler/robots.py  | 25 ++++++++++++++++++++-
 setup.py            |  2 +-
 tests/test_units.py | 54 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_units.py

diff --git a/brozzler/robots.py b/brozzler/robots.py
index cb5ffe1..26329d1 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
diff --git a/setup.py b/setup.py
index 4a92599..823fb3f 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev125',
+        version='1.1b8.dev126',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..2fee049
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
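
For illustration, here is a minimal standalone sketch of the substring user-agent
matching behavior that the monkey-patched reppy.parser.Rules.__getitem__ above
provides. It uses neither reppy nor brozzler; the RULES dict and the
rules_for_agent() helper are hypothetical names invented for this sketch, and the
robots.txt quoted in the comments is only an assumption about what tests/htdocs
would serve (that fixture file is not part of this patch).

# Standalone sketch (hypothetical names, not brozzler or reppy API) of the
# substring user-agent matching behavior added by this patch.
#
# Suppose the server's robots.txt looks something like:
#
#   User-agent: badbot
#   Disallow: /
#
# reppy parses that into a mapping of lowercased user-agent tokens to rules;
# the patched Rules.__getitem__ returns the rules for the first token that is
# a substring of the full (lowercased) crawler user-agent, falling back to '*'.

RULES = {
    'badbot': {'disallow': ['/']},   # hypothetical parsed rules
    '*': {'disallow': []},
}

def rules_for_agent(agent):
    lc_agent = agent.lower()
    for token in RULES:
        if token in lc_agent:
            return RULES[token]
    return RULES.get('*')

# Mirrors what test_robots() above asserts via is_permitted_by_robots():
assert rules_for_agent('im/a/GoOdbot/yep') == RULES['*']          # allowed
assert rules_for_agent('im/a bAdBOt/uh huh') == RULES['badbot']   # disallowed

As in the patched __getitem__, an agent matching no token falls back to the
wildcard ('*') rules, so sites whose robots.txt names no matching user-agent
behave exactly as they did before this change.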