Merge branch 'master' into qa

* master:
  fix problem where each hashtag visited causes a page load if page url redirects
  new test exposing problem where each hashtag visited causes a page load, if page redirects
2025-12-15 08:38:52 -05:00 · 2017-09-27 14:11:30 -07:00 · 2017-09-27 14:11:30 -07:00 · 9422fb6a26
commit 9422fb6a26
parent 407735efc4 ec847e48bc
6 changed files with 82 additions and 4 deletions
--- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2
+++ b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2
@ -21,4 +21,5 @@ kill timeout 60
 exec nice brozzler-worker \
    --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
    --max-browsers=4 \
+    --verbose \
    --warcprox-auto
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@ -454,7 +454,7 @@ class Browser:
                else:
                    outlinks = self.extract_outlinks()
                if not skip_visit_hashtags:
-                    self.visit_hashtags(page_url, hashtags, outlinks)
+                    self.visit_hashtags(self.url(), hashtags, outlinks)
                final_page_url = self.url()
                return final_page_url, outlinks
        except brozzler.ReachedLimit:
@ -599,8 +599,7 @@ class Browser:

    def try_login(self, username, password, timeout=300):
        try_login_js = brozzler.jinja2_environment().get_template(
-                'try-login.js.j2').render(
-                        username=username, password=password)
+                'try-login.js.j2').render(username=username, password=password)

        self.websock_thread.got_page_load_event = None
        self.send_to_chrome(
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b12.dev266',
+        version='1.1b12.dev268',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/htdocs/site9/destination.html
+++ b/tests/htdocs/site9/destination.html
@ -0,0 +1,8 @@
+<html>
+    <head>
+        <title>redirected page with hashtags test</title>
+    </head>
+    <body>
+        nothin
+    </body>
+</html>
--- a/tests/htdocs/site9/index.html
+++ b/tests/htdocs/site9/index.html
@ -0,0 +1,9 @@
+<html>
+    <head>
+        <title>redirected page with hashtags test</title>
+    </head>
+    <body>
+        <a href="redirect.html#hash1">redirect.html#hash1</a>
+        <a href="redirect.html#hash2">redirect.html#hash2</a>
+    </body>
+</html>
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -51,6 +51,13 @@ def httpd(request):
                self.send_header('Location', '/site5/destination/')
                self.end_headers()
                self.wfile.write(b'')
+            elif self.path == '/site9/redirect.html':
+                self.send_response(303, 'See other')
+                self.send_header('Connection', 'close')
+                self.send_header('Content-Length', 0)
+                self.send_header('Location', '/site9/destination.html')
+                self.end_headers()
+                self.wfile.write(b'')
            elif self.path.startswith('/infinite/'):
                payload = b'''
 <html>
@ -519,6 +526,60 @@ def test_hashtags(httpd):
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url

+def test_redirect_hashtags(httpd):
+    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site9/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 2
+    assert pages[0].url == seed_url
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].brozzle_count == 1
+    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port]
+    assert not pages[0].hashtags
+    assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].brozzle_count == 1
+    assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
+
+    time.sleep(2)   # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = rr.table('captures').filter({'test_id':test_id}).run()
+    redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET']
+    assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
+
+    # === expected captures ===
+    #  1. GET http://localhost:41243/favicon.ico
+    #  2. GET http://localhost:41243/robots.txt
+    #  3. GET http://localhost:41243/site9/
+    #  4. GET http://localhost:41243/site9/
+    #  5. GET http://localhost:41243/site9/destination.html
+    #  6. GET http://localhost:41243/site9/destination.html
+    #  7. GET http://localhost:41243/site9/redirect.html
+    #  8. GET http://localhost:41243/site9/redirect.html
+    #  9. HEAD http://localhost:41243/site9/
+    # 10. HEAD http://localhost:41243/site9/redirect.html
+    # 11. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/
+    # 12. WARCPROX_WRITE_RECORD screenshot:http://localhost:41243/site9/redirect.html
+    # 13. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/
+    # 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
+
 def test_stop_crawl(httpd):
    test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')