diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 index ccb6716..3fd73d6 100644 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 @@ -21,4 +21,5 @@ kill timeout 60 exec nice brozzler-worker \ --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ --max-browsers=4 \ + --verbose \ --warcprox-auto diff --git a/brozzler/browser.py b/brozzler/browser.py index 09131f1..1a6d5e9 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -454,7 +454,7 @@ class Browser: else: outlinks = self.extract_outlinks() if not skip_visit_hashtags: - self.visit_hashtags(page_url, hashtags, outlinks) + self.visit_hashtags(self.url(), hashtags, outlinks) final_page_url = self.url() return final_page_url, outlinks except brozzler.ReachedLimit: @@ -599,8 +599,7 @@ class Browser: def try_login(self, username, password, timeout=300): try_login_js = brozzler.jinja2_environment().get_template( - 'try-login.js.j2').render( - username=username, password=password) + 'try-login.js.j2').render(username=username, password=password) self.websock_thread.got_page_load_event = None self.send_to_chrome( diff --git a/setup.py b/setup.py index f9ab4e2..651eb66 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev266', + version='1.1b12.dev268', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/htdocs/site9/destination.html b/tests/htdocs/site9/destination.html new file mode 100644 index 0000000..e68c344 --- /dev/null +++ b/tests/htdocs/site9/destination.html @@ -0,0 +1,8 @@ + + + redirected page with hashtags test + + + nothin + + diff --git a/tests/htdocs/site9/index.html b/tests/htdocs/site9/index.html new file mode 100644 index 
0000000..6f9808d --- /dev/null +++ b/tests/htdocs/site9/index.html @@ -0,0 +1,9 @@ + + + redirected page with hashtags test + + + redirect.html#hash1 + redirect.html#hash2 + + diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 59b289c..48a9384 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -51,6 +51,13 @@ def httpd(request): self.send_header('Location', '/site5/destination/') self.end_headers() self.wfile.write(b'') + elif self.path == '/site9/redirect.html': + self.send_response(303, 'See other') + self.send_header('Connection', 'close') + self.send_header('Content-Length', 0) + self.send_header('Location', '/site9/destination.html') + self.end_headers() + self.wfile.write(b'') elif self.path.startswith('/infinite/'): payload = b''' @@ -519,6 +526,60 @@ def test_hashtags(httpd): assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url +def test_redirect_hashtags(httpd): + test_id = 'test_redirect_hashtags-%s' % datetime.datetime.utcnow().isoformat() + rr = doublethink.Rethinker('localhost', db='brozzler') + seed_url = 'http://localhost:%s/site9/' % httpd.server_port + site = brozzler.Site(rr, { + 'seed': seed_url, + 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) + + frontier = brozzler.RethinkDbFrontier(rr) + brozzler.new_site(frontier, site) + assert site.id + + # the site should be brozzled fairly quickly + start = time.time() + while site.status != 'FINISHED' and time.time() - start < 300: + time.sleep(0.5) + site.refresh() + assert site.status == 'FINISHED' + + # check that we got the pages we expected + pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) + assert len(pages) == 2 + assert pages[0].url == seed_url + assert pages[0].hops_from_seed == 0 + assert pages[0].brozzle_count == 1 + assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html'
% httpd.server_port] + assert not pages[0].hashtags + assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port + assert pages[1].hops_from_seed == 1 + assert pages[1].brozzle_count == 1 + assert sorted(pages[1].hashtags) == ['#hash1','#hash2',] + + time.sleep(2) # in case warcprox hasn't finished processing urls + # take a look at the captures table + captures = rr.table('captures').filter({'test_id':test_id}).run() + redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET'] + assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags + + # === expected captures === + # 1. GET http://localhost:41243/favicon.ico + # 2. GET http://localhost:41243/robots.txt + # 3. GET http://localhost:41243/site9/ + # 4. GET http://localhost:41243/site9/ + # 5. GET http://localhost:41243/site9/destination.html + # 6. GET http://localhost:41243/site9/destination.html + # 7. GET http://localhost:41243/site9/redirect.html + # 8. GET http://localhost:41243/site9/redirect.html + # 9. HEAD http://localhost:41243/site9/ + # 10. HEAD http://localhost:41243/site9/redirect.html + # 11. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/ + # 12. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/redirect.html + # 13. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/ + # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html + def test_stop_crawl(httpd): test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler')