diff --git a/.travis.yml b/.travis.yml
index 67e80b9..552bf6b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,9 +4,7 @@ python:
 sudo: required
 dist: trusty
 before_install:
-- sudo apt-add-repository -y ppa:ansible/ansible
-- sudo apt-get -qq update
-- sudo apt-get install -y ansible
+- sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
diff --git a/ansible/roles/warcprox/templates/warcprox.conf.j2 b/ansible/roles/warcprox/templates/warcprox.conf.j2
index 34101d6..917d072 100644
--- a/ansible/roles/warcprox/templates/warcprox.conf.j2
+++ b/ansible/roles/warcprox/templates/warcprox.conf.j2
@@ -14,6 +14,7 @@ console log
 
 # --profile
 exec nice warcprox \
+    --address=0.0.0.0 \
     --dir={{warcs_dir}} \
     --base32 \
     --gzip \
diff --git a/brozzler/cli.py b/brozzler/cli.py
old mode 100755
new mode 100644
diff --git a/brozzler/job.py b/brozzler/job.py
index a213eae..a178884 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -106,13 +106,11 @@ def new_site(frontier, site):
     # where a brozzler worker immediately claims the site, finds no pages
     # to crawl, and decides the site is finished
     try:
-        if brozzler.is_permitted_by_robots(site, site.seed):
-            page = brozzler.Page(site.seed, site_id=site.id,
-                    job_id=site.job_id, hops_from_seed=0, priority=1000)
-            frontier.new_page(page)
-            logging.info("queued page %s", page)
-        else:
-            logging.warn("seed url %s is blocked by robots.txt", site.seed)
+        page = brozzler.Page(
+                site.seed, site_id=site.id, job_id=site.job_id,
+                hops_from_seed=0, priority=1000, needs_robots_check=True)
+        frontier.new_page(page)
+        logging.info("queued page %s", page)
     finally:
         # finally block because we want to insert the Site no matter what
         frontier.new_site(site)
diff --git a/brozzler/robots.py b/brozzler/robots.py
index cb5ffe1..26329d1 100644
--- a/brozzler/robots.py
+++ b/brozzler/robots.py
@@ -1,6 +1,12 @@
 '''
 brozzler/robots.py - robots.txt support
 
+Uses the reppy library version 0.3.4. Monkey-patches reppy to support substring
+user-agent matching. We're sticking with 0.3.4 because later versions don't
+support supplying a custom requests.Session.
+
+See also https://github.com/seomoz/reppy/issues/37
+
 Copyright (C) 2014-2016 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,11 +25,27 @@ limitations under the License.
 import json
 import logging
 import brozzler
+import reppy
 import reppy.cache
+import reppy.parser
 import requests
 
 __all__ = ["is_permitted_by_robots"]
 
+# monkey-patch reppy to do substring user-agent matching, see top of file
+reppy.Utility.short_user_agent = lambda strng: strng
+def _reppy_rules_getitem(self, agent):
+    '''
+    Find the user-agent token matching the supplied full user-agent, using
+    a case-insensitive substring search.
+    '''
+    lc_agent = agent.lower()
+    for s in self.agents:
+        if s in lc_agent:
+            return self.agents[s]
+    return self.agents.get('*')
+reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
+
 _robots_caches = {} # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site):
     class SessionRaiseOn420(requests.Session):
@@ -55,7 +77,8 @@ def is_permitted_by_robots(site, url):
     tries_left = 10
     while True:
         try:
-            result = _robots_cache(site).allowed(url, "brozzler")
+            result = _robots_cache(site).allowed(
+                    url, site.user_agent or "brozzler")
             return result
         except BaseException as e:
             if isinstance(e, reppy.exceptions.ServerError) and isinstance(e.args[0], brozzler.ReachedLimit):
diff --git a/brozzler/site.py b/brozzler/site.py
index 5cccac8..8ff692a 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -80,9 +80,8 @@ class Url:
             pass
 
         # if we get here, we're looking at two hostnames
-        # XXX do we need to handle case of one punycoded idn, other not?
-        domain_parts = ip_or_domain.split(".")
-        host_parts = self.host.split(".")
+        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
+        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
 
         return host_parts[-len(domain_parts):] == domain_parts
 
@@ -228,7 +227,7 @@ class Page(brozzler.BaseDictable):
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
             via_page_id=None, last_claimed_by=None, hops_off_surt=0,
-            outlinks=None):
+            outlinks=None, needs_robots_check=False):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -240,6 +239,7 @@ class Page(brozzler.BaseDictable):
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
         self.outlinks = outlinks
+        self.needs_robots_check = needs_robots_check
         self._canon_hurl = None
 
         if priority is not None:
diff --git a/brozzler/worker.py b/brozzler/worker.py
index c306d95..21aa22e 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -327,12 +327,18 @@ class BrozzlerWorker:
                 self._frontier.honor_stop_request(site.job_id)
                 page = self._frontier.claim_page(site, "%s:%s" % (
                     socket.gethostname(), browser.chrome_port))
-                outlinks = self.brozzle_page(browser, site, page)
-                if browser.is_running():
-                    site.cookie_db = browser.persist_and_read_cookie_db()
+
+                if (page.needs_robots_check and
+                        not brozzler.is_permitted_by_robots(site, page.url)):
+                    logging.warn("page %s is blocked by robots.txt", page.url)
+                else:
+                    outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.scope_and_schedule_outlinks(
+                            site, page, outlinks)
+                    if browser.is_running():
+                        site.cookie_db = browser.persist_and_read_cookie_db()
+
                 self._frontier.completed_page(site, page)
-                self._frontier.scope_and_schedule_outlinks(
-                    site, page, outlinks)
                 page = None
             except brozzler.NothingToClaim:
                 self.logger.info("no pages left for site %s", site)
diff --git a/setup.py b/setup.py
index d70d02b..dc70a9c 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b8.dev123',
+        version='1.1b8.dev127',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/htdocs/robots.txt b/tests/htdocs/robots.txt
new file mode 100644
index 0000000..05ce8f2
--- /dev/null
+++ b/tests/htdocs/robots.txt
@@ -0,0 +1,2 @@
+User-agent: badbot
+Disallow: /
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 1815cc9..ef4c51a 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -29,6 +29,13 @@ import time
 import brozzler
 import datetime
 import requests
+import subprocess
+
+def start_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'start'])
+
+def stop_service(service):
+    subprocess.check_call(['sudo', 'service', service, 'stop'])
 
 @pytest.fixture(scope='module')
 def httpd(request):
@@ -102,12 +109,18 @@
     page1 = 'http://localhost:%s/' % httpd.server_port
     page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # the site should be brozzled fairly quickly
     start = time.time()
@@ -118,14 +131,17 @@
 
     # check that we got the two pages we expected
     pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -140,7 +156,6 @@
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
     assert requests.get(wb_url).content == expected_payload
 
-
 def test_warcprox_selection(httpd):
     ''' When enable_warcprox_features is true, brozzler is expected to
     choose and instance of warcprox '''
@@ -156,12 +171,17 @@
             enable_warcprox_features=True,
             warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
 
-    assert site.id is None
-    r = rethinkstuff.Rethinker('localhost', db='brozzler')
-    frontier = brozzler.RethinkDbFrontier(r)
-    brozzler.new_site(frontier, site)
-    assert site.id is not None
-    assert len(list(frontier.site_pages(site.id))) == 1
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        assert len(list(frontier.site_pages(site.id))) == 1
+    finally:
+        start_service('brozzler-worker')
 
     # check proxy is set in rethink
     start = time.time()
@@ -179,14 +199,17 @@
 
     # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
-    assert len(pages) == 2
+    assert len(pages) == 3
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/robots.txt' % httpd.server_port,
             'http://localhost:%s/file1.txt' % httpd.server_port}
 
+    time.sleep(2) # in case warcprox hasn't finished processing urls
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
-    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    captures_by_url = {
+            c['url']:c for c in captures if c['http_method'] != 'HEAD'}
     assert page1 in captures_by_url
     assert '%srobots.txt' % page1 in captures_by_url
     assert page2 in captures_by_url
@@ -199,4 +222,57 @@
     wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
     expected_payload = open(os.path.join(
             os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
-    assert requests.get(wb_url).content == expected_payload
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
+
+def test_obey_robots(httpd):
+    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            proxy='localhost:8000', enable_warcprox_features=True,
+            user_agent='im a badbot', # robots.txt blocks badbot
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    # so we can examine rethinkdb before it does anything
+    try:
+        stop_service('brozzler-worker')
+
+        assert site.id is None
+        r = rethinkstuff.Rethinker('localhost', db='brozzler')
+        frontier = brozzler.RethinkDbFrontier(r)
+        brozzler.new_site(frontier, site)
+        assert site.id is not None
+        site_pages = list(frontier.site_pages(site.id))
+        assert len(site_pages) == 1
+        assert site_pages[0].url == site.seed
+        assert site_pages[0].needs_robots_check
+    finally:
+        start_service('brozzler-worker')
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port}
+
+    # take a look at the captures table
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
+    captures = list(r.table('captures').filter({'test_id':test_id}).run())
+    assert len(captures) == 1
+    assert captures[0]['url'] == robots_url
+
+    # check pywb
+    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
+    expected_payload = open(os.path.join(
+            os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
+    assert requests.get(
+            wb_url, allow_redirects=False).content == expected_payload
diff --git a/tests/test_units.py b/tests/test_units.py
new file mode 100644
index 0000000..2fee049
--- /dev/null
+++ b/tests/test_units.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+'''
+test_units.py - some unit tests for parts of brozzler amenable to that
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import os
+import brozzler
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_robots(httpd):
+    '''
+    Basic test of robots.txt user-agent substring matching.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+    site = brozzler.Site(seed=url, user_agent='im/a/GoOdbot/yep')
+    assert brozzler.is_permitted_by_robots(site, url)
+
+    site = brozzler.Site(seed=url, user_agent='im/a bAdBOt/uh huh')
+    assert not brozzler.is_permitted_by_robots(site, url)
+
diff --git a/vagrant/Vagrantfile b/vagrant/Vagrantfile
index 5a1c8ae..f983e7c 100644
--- a/vagrant/Vagrantfile
+++ b/vagrant/Vagrantfile
@@ -6,6 +6,11 @@ Vagrant.configure(2) do |config|
 
   config.vm.synced_folder "..", "/brozzler"
 
+  # bump up memory to avoid "can't start new thread" errors
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 1024
+  end
+
   config.vm.provision "ansible" do |ansible|
     ansible.inventory_path = "../ansible/hosts-vagrant"
     ansible.playbook = "../ansible/playbook.yml"
diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh
index 2a8a0b5..710438b 100755
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+#
+# any arguments are passed on to py.test
+# so for example to run only "test_obey_robots" you could run
+# ./run-tests.sh -k test_obey_robots
+#
 
 cd $(dirname "${BASH_SOURCE[0]}")
 
@@ -11,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
 echo
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
 
-vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"
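
Note (not part of the patch): the substring user-agent matching that the robots.py monkey-patch gives reppy 0.3.4 can be illustrated with a small standalone sketch. The function name match_robots_agent and the example rules dict below are hypothetical, for illustration only; they assume, as the patched __getitem__ does, that the robots.txt user-agent tokens used as dict keys are already lowercase.

    # Hypothetical standalone sketch (no reppy required) of the lookup behavior
    # _reppy_rules_getitem patches into reppy: a robots.txt user-agent token
    # matches any full user-agent string that contains it, case-insensitively,
    # falling back to the '*' rules when no token matches.
    def match_robots_agent(agents, full_user_agent):
        # `agents` maps lowercase robots.txt user-agent tokens to their rules
        lc_agent = full_user_agent.lower()
        for token in agents:
            if token in lc_agent:
                return agents[token]
        return agents.get('*')

    if __name__ == '__main__':
        rules = {'badbot': 'blocked', '*': 'allowed'}
        assert match_robots_agent(rules, 'im a badbot') == 'blocked'       # substring hit
        assert match_robots_agent(rules, 'im/a/GoOdbot/yep') == 'allowed'  # falls back to '*'
        print('ok')

The two assertions mirror the cases exercised by test_robots in tests/test_units.py above: a good bot falls through to the wildcard rules, while any user-agent containing "badbot" gets the blocked rules from tests/htdocs/robots.txt.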