diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2
index ccb6716..3fd73d6 100644
--- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2
+++ b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2
@@ -21,4 +21,5 @@ kill timeout 60
exec nice brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
+ --verbose \
--warcprox-auto
diff --git a/brozzler/browser.py b/brozzler/browser.py
index 09131f1..1a6d5e9 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -454,7 +454,7 @@ class Browser:
else:
outlinks = self.extract_outlinks()
if not skip_visit_hashtags:
- self.visit_hashtags(page_url, hashtags, outlinks)
+ self.visit_hashtags(self.url(), hashtags, outlinks)
final_page_url = self.url()
return final_page_url, outlinks
except brozzler.ReachedLimit:
@@ -599,8 +599,7 @@ class Browser:
def try_login(self, username, password, timeout=300):
try_login_js = brozzler.jinja2_environment().get_template(
- 'try-login.js.j2').render(
- username=username, password=password)
+ 'try-login.js.j2').render(username=username, password=password)
self.websock_thread.got_page_load_event = None
self.send_to_chrome(
diff --git a/setup.py b/setup.py
index f9ab4e2..651eb66 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
- version='1.1b12.dev266',
+ version='1.1b12.dev268',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
diff --git a/tests/htdocs/site9/destination.html b/tests/htdocs/site9/destination.html
new file mode 100644
index 0000000..e68c344
--- /dev/null
+++ b/tests/htdocs/site9/destination.html
@@ -0,0 +1,8 @@
+<html>
+<head>
+<title>redirected page with hashtags test</title>
+</head>
+<body>
+nothin
+</body>
+</html>
diff --git a/tests/htdocs/site9/index.html b/tests/htdocs/site9/index.html
new file mode 100644
index 0000000..6f9808d
--- /dev/null
+++ b/tests/htdocs/site9/index.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+<title>redirected page with hashtags test</title>
+</head>
+<body>
+<a href="redirect.html#hash1">redirect.html#hash1</a>
+<a href="redirect.html#hash2">redirect.html#hash2</a>
+</body>
+</html>
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 59b289c..48a9384 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -51,6 +51,13 @@ def httpd(request):
self.send_header('Location', '/site5/destination/')
self.end_headers()
self.wfile.write(b'')
+ elif self.path == '/site9/redirect.html':
+ self.send_response(303, 'See other')
+ self.send_header('Connection', 'close')
+ self.send_header('Content-Length', 0)
+ self.send_header('Location', '/site9/destination.html')
+ self.end_headers()
+ self.wfile.write(b'')
elif self.path.startswith('/infinite/'):
payload = b'''
@@ -519,6 +526,60 @@ def test_hashtags(httpd):
assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+def test_redirect_hashtags(httpd):
+    test_id = 'test_redirect_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+ rr = doublethink.Rethinker('localhost', db='brozzler')
+ seed_url = 'http://localhost:%s/site9/' % httpd.server_port
+ site = brozzler.Site(rr, {
+ 'seed': seed_url,
+ 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+ frontier = brozzler.RethinkDbFrontier(rr)
+ brozzler.new_site(frontier, site)
+ assert site.id
+
+ # the site should be brozzled fairly quickly
+ start = time.time()
+ while site.status != 'FINISHED' and time.time() - start < 300:
+ time.sleep(0.5)
+ site.refresh()
+ assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+ pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+ assert len(pages) == 2
+ assert pages[0].url == seed_url
+ assert pages[0].hops_from_seed == 0
+ assert pages[0].brozzle_count == 1
+ assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port]
+ assert not pages[0].hashtags
+ assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
+ assert pages[1].hops_from_seed == 1
+ assert pages[1].brozzle_count == 1
+ assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
+
+ time.sleep(2) # in case warcprox hasn't finished processing urls
+ # take a look at the captures table
+ captures = rr.table('captures').filter({'test_id':test_id}).run()
+ redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET']
+ assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
+
+ # === expected captures ===
+ # 1. GET http://localhost:41243/favicon.ico
+ # 2. GET http://localhost:41243/robots.txt
+ # 3. GET http://localhost:41243/site9/
+ # 4. GET http://localhost:41243/site9/
+ # 5. GET http://localhost:41243/site9/destination.html
+ # 6. GET http://localhost:41243/site9/destination.html
+ # 7. GET http://localhost:41243/site9/redirect.html
+ # 8. GET http://localhost:41243/site9/redirect.html
+ # 9. HEAD http://localhost:41243/site9/
+ # 10. HEAD http://localhost:41243/site9/redirect.html
+ # 11. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/
+ # 12. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/redirect.html
+ # 13. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/
+ # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
+
def test_stop_crawl(httpd):
test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')