From 3aead6de93f2b381fad70e61b824d392efe0aec8 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 16 Nov 2016 11:41:34 -0800
Subject: [PATCH] monkey-patch reppy to support substring user-agent matching

---
 brozzler/robots.py  | 25 ++++++++++++++++++++-
 setup.py            |  2 +-
 tests/test_units.py | 54 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_units.py

diff --git a/brozzler/robots.py b/brozzler/robots.py
index cb5ffe1..26329d1 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
diff --git a/setup.py b/setup.py
index 4a92599..823fb3f 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev125',
+        version='1.1b8.dev126',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..2fee049
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
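
For illustration, here is a minimal standalone sketch of the substring user-agent
matching behavior that the monkey-patched reppy.parser.Rules.__getitem__ above
provides. It uses neither reppy nor brozzler; the RULES dict and the
rules_for_agent() helper are hypothetical names invented for this sketch, and the
robots.txt quoted in the comments is only an assumption about what tests/htdocs
would serve (that fixture file is not part of this patch).

# Standalone sketch (hypothetical names, not brozzler or reppy API) of the
# substring user-agent matching behavior added by this patch.
#
# Suppose the server's robots.txt looks something like:
#
#   User-agent: badbot
#   Disallow: /
#
# reppy parses that into a mapping of lowercased user-agent tokens to rules;
# the patched Rules.__getitem__ returns the rules for the first token that is
# a substring of the full (lowercased) crawler user-agent, falling back to '*'.

RULES = {
    'badbot': {'disallow': ['/']},   # hypothetical parsed rules
    '*': {'disallow': []},
}

def rules_for_agent(agent):
    lc_agent = agent.lower()
    for token in RULES:
        if token in lc_agent:
            return RULES[token]
    return RULES.get('*')

# Mirrors what test_robots() above asserts via is_permitted_by_robots():
assert rules_for_agent('im/a/GoOdbot/yep') == RULES['*']          # allowed
assert rules_for_agent('im/a bAdBOt/uh huh') == RULES['badbot']   # disallowed

As in the patched __getitem__, an agent matching no token falls back to the
wildcard ('*') rules, so sites whose robots.txt names no matching user-agent
behave exactly as they did before this change.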