Mirror of https://github.com/internetarchive/brozzler.git
new model for crawling hashtags, each one is no longer a top-level page
parent a836269e95
commit 3d47805ec1
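The gist of the change: a URL fragment such as the "#baz" in http://example.org/bar#baz is no longer queued as its own top-level page. Instead the fragment is split off with urlcanon, the fragment-free URL becomes (or updates) the page record, the fragment is accumulated in that page's hashtags list, and the browser re-navigates to each recorded hashtag after brozzling the page. A minimal sketch of the fragment-splitting step, using the same urlcanon calls that appear in the diff below (the helper name split_hashtag is illustrative, not part of brozzler):

import urlcanon

def split_hashtag(url):
    # canonicalize, pull off the '#fragment' part, and return the
    # fragment-free url plus the hashtag ('' if there is none)
    parsed = urlcanon.whatwg(url)
    hashtag = (parsed.hash_sign + parsed.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(parsed)
    return str(parsed), hashtag

# expected: ('http://example.org/bar', '#baz')
print(split_hashtag('http://example.org/bar#baz'))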
@@ -30,6 +30,7 @@ import datetime
import base64
from brozzler.chrome import Chrome
import socket
import urlcanon

class BrowsingException(Exception):
    pass

@@ -374,7 +375,7 @@ class Browser:
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
            username=None, password=None):
            username=None, password=None, hashtags=None):
        '''
        Browses page in browser.

@@ -434,12 +435,7 @@ class Browser:
                    page_url, behavior_parameters)
            self.run_behavior(behavior_script, timeout=900)
            outlinks = self.extract_outlinks()
            ## for each hashtag not already visited:
            ## navigate_to_hashtag (nothing to wait for so no timeout?)
            ## if on_screenshot;
            ## take screenshot (30 sec)
            ## run behavior (3 min)
            ## outlinks += retrieve_outlinks (60 sec)
            self.visit_hashtags(page_url, hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
        except brozzler.ReachedLimit:

@@ -454,6 +450,29 @@ class Browser:
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None

    def visit_hashtags(self, page_url, hashtags, outlinks):
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                    method='Page.navigate', params={'url': str(url)})
            time.sleep(5) # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def navigate_to_page(
            self, page_url, extra_headers=None, user_agent=None, timeout=300):
        headers = extra_headers or {}
@@ -266,8 +266,11 @@ class RethinkDbFrontier:
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign
                    + url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1

@@ -283,9 +286,17 @@ class RethinkDbFrontier:
                            self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [hashtag,]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
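In other words, when an outlink's fragment-free URL matches a child page that is already queued, the new fragment is merged into that page's existing hashtags rather than producing another top-level page. A standalone sketch of that merge step (merge_hashtags is a hypothetical name, not code from this commit; the frontier keeps an unsorted list, sorting here just gives a stable result):

def merge_hashtags(existing_hashtags, hashtag):
    # an already-known child page accumulates any new fragment seen on
    # an outlink that points at the same fragment-free url
    hashtags = set(existing_hashtags or [])
    if hashtag:
        hashtags.add(hashtag)
    return sorted(hashtags)

assert merge_hashtags(['#baz'], '#quux') == ['#baz', '#quux']
assert merge_hashtags(None, '#foo') == ['#foo']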
@@ -27,6 +27,7 @@ import doublethink
import os
import cerberus
import urllib
import urlcanon

def load_schema():
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')

@@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        # insert the Page into the database before the Site, to avoid situation
        # where a brozzler worker immediately claims the site, finds no pages
        # to crawl, and decides the site is finished
        try:
            page = brozzler.Page(frontier.rr, {
                "url": site.seed, "site_id": site.get("id"),
                "job_id": site.get("job_id"), "hops_from_seed": 0,
                "priority": 1000, "needs_robots_check": True})
            page.save()
            logging.info("queued page %s", page)
        finally:
            # finally block because we want to insert the Site no matter what
            site.save()
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
            url = urlcanon.parse_url(site.seed)
            hashtag = (url.hash_sign + url.fragment).decode("utf-8")
            urlcanon.canon.remove_fragment(url)
            page = brozzler.Page(frontier.rr, {
                "url": str(url), "site_id": site.get("id"),
                "job_id": site.get("job_id"), "hops_from_seed": 0,
                "priority": 1000, "needs_robots_check": True})
            if hashtag:
                page.hashtags = [hashtag,]
            page.save()
            logging.info("queued page %s", page)
        finally:
            # finally block because we want to insert the Site no matter what
            site.save()

class Job(doublethink.Document):
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -364,7 +364,8 @@ class BrozzlerWorker:
                    behavior_parameters=site.get('behavior_parameters'),
                    username=site.get('username'), password=site.get('password'),
                    user_agent=site.get('user_agent'),
                    on_screenshot=_on_screenshot, on_response=_on_response)
                    on_screenshot=_on_screenshot, on_response=_on_response,
                    hashtags=page.hashtags)
            if final_page_url != page.url:
                page.note_redirect(final_page_url)
            return outlinks
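Taken together with the browser changes above, the worker side of the new model is just one extra keyword argument: the page's recorded hashtags are forwarded into browse_page, roughly as follows (argument shapes taken from the hunks above, with the other site-level arguments omitted):

final_page_url, outlinks = browser.browse_page(
        page.url,
        on_screenshot=_on_screenshot, on_response=_on_response,
        hashtags=page.hashtags)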
setup.py (2 lines changed)

@@ -32,7 +32,7 @@ def find_package_data(package):

setuptools.setup(
        name='brozzler',
        version='1.1b10.dev223',
        version='1.1b10.dev224',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
tests/htdocs/site7/boosh.txt (new file, 1 line)

@@ -0,0 +1 @@
I AM A POINTED LITTLE FILE
tests/htdocs/site7/foo.html (new file, 36 lines)

@@ -0,0 +1,36 @@
<html>
<head>
<title>hashtag url test</title>
<script>
(function() {
    let lastHash = null;
    setInterval(function() {
        const hash = new URL(document.URL).hash;
        if (hash != lastHash && (hash == '#whee' || hash == '#boosh')) {
            lastHash = hash;

            const httpRequest = new XMLHttpRequest();
            httpRequest.onreadystatechange = function() {
                if (httpRequest.readyState === XMLHttpRequest.DONE) {
                    const e = document.createElement('p');
                    e.textContent = 'loaded from ' + hash.substring(1) + ': ' + httpRequest.responseText;
                    document.body.appendChild(e);
                }
            };

            httpRequest.open('GET', hash.substring(1) + '.txt', true);
            httpRequest.send(null);
        }
    }, 1000);
})();
</script>
</head>
<body>
<h1>hashtag url test</h1>
<div><a href='#boosh'>#boosh</a></div>
<div><a href='#ignored'>#ignored</a></div>
<p>this page will ajax load ./whee.txt if it notices the url in the
location bar has fragment "#whee", and ./boosh.txt if it notices
"#boosh"</p>
</body>
</html>
tests/htdocs/site7/index.html (new file, 10 lines)

@@ -0,0 +1,10 @@
<html>
<head>
<title>link to hashtag url test</title>
<script>
</script>
</head>
<body>
<a href="foo.html#whee">foo.html#whee</a>
</body>
</html>
tests/htdocs/site7/whee.txt (new file, 1 line)

@@ -0,0 +1 @@
I AM A POINTLESS LITTLE FILE
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port

def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2) # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
@@ -591,3 +591,67 @@ def test_seed_page():
    page0.save()

    assert frontier.seed_page(site.id) == page0

def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]

def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz','#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
@@ -7,6 +7,8 @@

cd $(dirname "${BASH_SOURCE[0]}")

vagrant up

echo service status:
vagrant ssh -- 'status warcprox ;
        status Xvnc ;