From 3d47805ec1f7411efbabf98de3efef594730db79 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 27 Mar 2017 12:15:49 -0700
Subject: [PATCH] new model for crawling hashtags, each one is no longer a
 top-level page

---
 brozzler/browser.py           | 33 ++++++++++++++++++++++++++++-------
 brozzler/frontier.py          | 13 ++++++++++++-
 brozzler/job.py               | 33 ++++++++++++++++---------------
 brozzler/worker.py            |  3 ++-
 setup.py                      |  2 +-
 tests/htdocs/site7/boosh.txt  |  1 +
 tests/htdocs/site7/foo.html   | 36 ++++++++++++++++++++++++++++++++++++
 tests/htdocs/site7/index.html | 10 ++++++++++
 tests/htdocs/site7/whee.txt   |  1 +
 tests/test_cluster.py         | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_frontier.py        | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 vagrant/run-tests.sh          |  2 ++
 12 files changed, 220 insertions(+), 25 deletions(-)
 create mode 100644 tests/htdocs/site7/boosh.txt
 create mode 100644 tests/htdocs/site7/foo.html
 create mode 100644 tests/htdocs/site7/index.html
 create mode 100644 tests/htdocs/site7/whee.txt

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 5b9924b..e5b236f 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -30,6 +30,7 @@ import datetime
 import base64
 from brozzler.chrome import Chrome
 import socket
+import urlcanon
 
 class BrowsingException(Exception):
     pass
@@ -374,7 +375,7 @@ class Browser:
             self, page_url, ignore_cert_errors=False, extra_headers=None,
             user_agent=None, behavior_parameters=None, on_request=None,
             on_response=None, on_screenshot=None,
-            username=None, password=None):
+            username=None, password=None, hashtags=None):
         '''
         Browses page in browser.
 
@@ -434,12 +435,7 @@ class Browser:
                         page_url, behavior_parameters)
                 self.run_behavior(behavior_script, timeout=900)
             outlinks = self.extract_outlinks()
-            ## for each hashtag not already visited:
-            ##     navigate_to_hashtag (nothing to wait for so no timeout?)
-            ##     if on_screenshot;
-            ##         take screenshot (30 sec)
-            ##     run behavior (3 min)
-            ##     outlinks += retrieve_outlinks (60 sec)
+            self.visit_hashtags(page_url, hashtags, outlinks)
             final_page_url = self.url()
             return final_page_url, outlinks
         except brozzler.ReachedLimit:
@@ -454,6 +450,29 @@ class Browser:
             self.websock_thread.on_request = None
             self.websock_thread.on_response = None
 
+    def visit_hashtags(self, page_url, hashtags, outlinks):
+        _hashtags = set(hashtags or [])
+        for outlink in outlinks:
+            url = urlcanon.whatwg(outlink)
+            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url)
+            if hashtag and str(url) == page_url:
+                _hashtags.add(hashtag)
+        # could inject a script that listens for HashChangeEvent to figure
+        # out which hashtags were visited already and skip those
+        for hashtag in _hashtags:
+            # navigate_to_hashtag (nothing to wait for so no timeout?)
+            self.logger.debug('navigating to hashtag %s', hashtag)
+            url = urlcanon.whatwg(page_url)
+            url.hash_sign = b'#'
+            url.fragment = hashtag[1:].encode('utf-8')
+            self.send_to_chrome(
+                    method='Page.navigate', params={'url': str(url)})
+            time.sleep(5) # um.. wait for idleness or something?
+            # take another screenshot?
+            # run behavior again with short timeout?
+            # retrieve outlinks again and append to list?
+
     def navigate_to_page(
             self, page_url, extra_headers=None, user_agent=None, timeout=300):
         headers = extra_headers or {}
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 215ee6c..700ec0d 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -266,8 +266,11 @@ class RethinkDbFrontier:
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
+            hashtag = (url_for_crawling.hash_sign
+                    + url_for_crawling.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url_for_crawling)
             if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-                if brozzler.is_permitted_by_robots(site, url):
+                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                     if not url_for_scoping.surt().startswith(
                             site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
@@ -283,9 +286,17 @@ class RethinkDbFrontier:
                                 self.rr, new_child_page.id)
                         if existing_child_page:
                             existing_child_page.priority += new_child_page.priority
+                            if hashtag and existing_child_page.hashtags:
+                                hashtags = set(existing_child_page.hashtags)
+                                hashtags.add(hashtag)
+                                existing_child_page.hashtags = list(hashtags)
+                            elif hashtag:
+                                existing_child_page.hashtags = [hashtag]
                             existing_child_page.save()
                             counts["updated"] += 1
                         else:
+                            if hashtag:
+                                new_child_page.hashtags = [hashtag,]
                             new_child_page.save()
                             counts["added"] += 1
                 decisions["accepted"].add(str(url_for_crawling))
diff --git a/brozzler/job.py b/brozzler/job.py
index 0120dcb..ac001f1 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -27,6 +27,7 @@ import doublethink
 import os
 import cerberus
 import urllib
+import urlcanon
 
 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
 def new_site(frontier, site):
     site.id = str(uuid.uuid4())
     logging.info("new site {}".format(site))
+    # insert the Page into the database before the Site, to avoid situation
+    # where a brozzler worker immediately claims the site, finds no pages
+    # to crawl, and decides the site is finished
     try:
-        # insert the Page into the database before the Site, to avoid situation
-        # where a brozzler worker immediately claims the site, finds no pages
-        # to crawl, and decides the site is finished
-        try:
-            page = brozzler.Page(frontier.rr, {
-                "url": site.seed, "site_id": site.get("id"),
-                "job_id": site.get("job_id"), "hops_from_seed": 0,
-                "priority": 1000, "needs_robots_check": True})
-            page.save()
-            logging.info("queued page %s", page)
-        finally:
-            # finally block because we want to insert the Site no matter what
-            site.save()
-    except brozzler.ReachedLimit as e:
-        frontier.reached_limit(site, e)
+        url = urlcanon.parse_url(site.seed)
+        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
+        urlcanon.canon.remove_fragment(url)
+        page = brozzler.Page(frontier.rr, {
+            "url": str(url), "site_id": site.get("id"),
+            "job_id": site.get("job_id"), "hops_from_seed": 0,
+            "priority": 1000, "needs_robots_check": True})
+        if hashtag:
+            page.hashtags = [hashtag,]
+        page.save()
+        logging.info("queued page %s", page)
+    finally:
+        # finally block because we want to insert the Site no matter what
+        site.save()
 
 class Job(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
diff --git a/brozzler/worker.py b/brozzler/worker.py
index b9ad609..e604684 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -364,7 +364,8 @@ class BrozzlerWorker:
                     behavior_parameters=site.get('behavior_parameters'),
                     username=site.get('username'), password=site.get('password'),
                     user_agent=site.get('user_agent'),
-                    on_screenshot=_on_screenshot, on_response=_on_response)
+                    on_screenshot=_on_screenshot, on_response=_on_response,
+                    hashtags=page.hashtags)
                 if final_page_url != page.url:
                     page.note_redirect(final_page_url)
                 return outlinks
diff --git a/setup.py b/setup.py
index ad9eea4..d4d70b6 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b10.dev223',
+        version='1.1b10.dev224',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/htdocs/site7/boosh.txt b/tests/htdocs/site7/boosh.txt
new file mode 100644
index 0000000..8a95b88
--- /dev/null
+++ b/tests/htdocs/site7/boosh.txt
@@ -0,0 +1 @@
+I AM A POINTED LITTLE FILE
diff --git a/tests/htdocs/site7/foo.html b/tests/htdocs/site7/foo.html
new file mode 100644
index 0000000..7d5837f
--- /dev/null
+++ b/tests/htdocs/site7/foo.html
@@ -0,0 +1,36 @@
+<html>
+<head>
+<title>hashtag url test</title>
+<script>
+function ajax_load(url) {
+    var xhr = new XMLHttpRequest();
+    xhr.open('GET', url);
+    xhr.send();
+}
+function check_hash() {
+    if (window.location.hash == '#whee') {
+        ajax_load('./whee.txt');
+    } else if (window.location.hash == '#boosh') {
+        ajax_load('./boosh.txt');
+    }
+}
+window.addEventListener('hashchange', check_hash);
+window.addEventListener('load', check_hash);
+</script>
+</head>
+
+<body>
+<h1>hashtag url test</h1>
+<p>
+<a href="#boosh">#boosh</a>
+</p>
+<p>
+<a href="#ignored">#ignored</a>
+</p>
+<p>
+this page will ajax load ./whee.txt if it notices the url in the
+location bar has fragment "#whee", and ./boosh.txt if it notices
+"#boosh"
+</p>
+</body>
+</html>
diff --git a/tests/htdocs/site7/index.html b/tests/htdocs/site7/index.html
new file mode 100644
index 0000000..4cd9491
--- /dev/null
+++ b/tests/htdocs/site7/index.html
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>link to hashtag url test</title>
+</head>
+<body>
+<p>
+<a href="foo.html#whee">foo.html#whee</a>
+</p>
+</body>
+</html>
diff --git a/tests/htdocs/site7/whee.txt b/tests/htdocs/site7/whee.txt
new file mode 100644
index 0000000..c979c72
--- /dev/null
+++ b/tests/htdocs/site7/whee.txt
@@ -0,0 +1 @@
+I AM A POINTLESS LITTLE FILE
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 0f1e9e6..750315a 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):
 
     # check that scope has been updated properly
     assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
+
+def test_hashtags(httpd):
+    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 2
+    assert pages[0].url == seed_url
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].brozzle_count == 1
+    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
+    assert not pages[0].hashtags
+    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].brozzle_count == 1
+    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
+
+    time.sleep(2)   # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = rr.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
+    assert seed_url in captures_by_url
+    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
+    assert 'screenshot:%s' % seed_url in captures_by_url
+    assert 'thumbnail:%s' % seed_url in captures_by_url
+    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index af4b9a6..c4166bc 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -591,3 +591,67 @@ def test_seed_page():
     page0.save()
 
     assert frontier.seed_page(site.id) == page0
+
+def test_hashtag_seed():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # no hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert not pages[0].hashtags
+
+    # yes hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert pages[0].hashtags == ['#hash',]
+
+def test_hashtag_links():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+    parent_page = frontier.seed_page(site.id)
+    assert not parent_page.hashtags
+    outlinks = [
+        'http://example.org/#foo',
+        'http://example.org/bar',
+        'http://example.org/bar#baz',
+        'http://example.org/bar#quux',
+        'http://example.org/zuh#buh',
+    ]
+    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 3
+    assert pages[0].url == 'http://example.org/'
+    assert sorted(pages[0].outlinks['accepted']) == [
+            'http://example.org/', 'http://example.org/bar',
+            'http://example.org/zuh']
+    assert not pages[0].outlinks['blocked']
+    assert not pages[0].outlinks['rejected']
+    assert pages[0].hashtags == ['#foo',]
+    assert pages[0].hops_from_seed == 0
+
+    assert pages[1].url == 'http://example.org/bar'
+    assert sorted(pages[1].hashtags) == ['#baz','#quux']
+    assert pages[1].priority == 36
+    assert pages[1].hops_from_seed == 1
+
+    assert pages[2].url == 'http://example.org/zuh'
+    assert pages[2].hashtags == ['#buh']
+    assert pages[2].priority == 12
+
diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh
index 5058e62..122286d 100755
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@@ -7,6 +7,8 @@
 
 cd $(dirname "${BASH_SOURCE[0]}")
 
+vagrant up
+
 echo service status:
 vagrant ssh -- 'status warcprox ; status Xvnc ;
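
Note on the fragment handling that recurs in this patch (browser.py, frontier.py,
job.py): it follows one pattern throughout -- read the fragment off the parsed
url as bytes, strip it in place, and key the page on the fragmentless url while
recording the hashtag on the Page. A minimal sketch of that pattern using the
same urlcanon calls the patch uses; the example url is illustrative, not taken
from the tests:

    import urlcanon

    url = urlcanon.whatwg('http://example.org/foo.html#boosh')

    # hash_sign and fragment are bytes fields on the parsed url
    hashtag = (url.hash_sign + url.fragment).decode('utf-8')
    assert hashtag == '#boosh'

    # remove_fragment() strips the fragment in place, so the page is keyed
    # on the bare url and '#boosh' is tracked in page.hashtags instead
    urlcanon.canon.remove_fragment(url)
    assert str(url) == 'http://example.org/foo.html'

This is why a seed of http://example.org/#hash yields a single page whose url
is http://example.org/ with hashtags ['#hash'], as test_hashtag_seed asserts,
and why visit_hashtags() re-navigates to page_url plus each recorded fragment
rather than treating each hashtag variant as its own top-level page.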