new model for crawling hashtags, each one is no longer a top-level page

Noah Levitt 2017-03-27 12:15:49 -07:00
parent a836269e95
commit 3d47805ec1
12 changed files with 220 additions and 25 deletions
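In the new model, an outlink that differs from its page only by URL fragment no longer becomes a separate top-level page: the fragment is stripped before the page is scoped and deduplicated, the hashtag is recorded in the page's hashtags list, and the browser navigates to each recorded hashtag after brozzling the base page. A minimal sketch of the fragment handling with urlcanon, using a made-up URL for illustration (the printed values are the expected results, not output from this commit):

import urlcanon

# hypothetical outlink, for illustration only
outlink = 'http://example.org/foo.html#whee'

url = urlcanon.whatwg(outlink)
hashtag = (url.hash_sign + url.fragment).decode('utf-8')
urlcanon.canon.remove_fragment(url)

print(str(url))   # expected: http://example.org/foo.html  (becomes, or merges into, the Page)
print(hashtag)    # expected: #whee                        (recorded in page.hashtags)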

View File

@ -30,6 +30,7 @@ import datetime
import base64
from brozzler.chrome import Chrome
import socket
+import urlcanon

class BrowsingException(Exception):
    pass
@ -374,7 +375,7 @@ class Browser:
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
-           username=None, password=None):
+           username=None, password=None, hashtags=None):
        '''
        Browses page in browser.
@ -434,12 +435,7 @@ class Browser:
                    page_url, behavior_parameters)
            self.run_behavior(behavior_script, timeout=900)
            outlinks = self.extract_outlinks()
-           ## for each hashtag not already visited:
-           ## navigate_to_hashtag (nothing to wait for so no timeout?)
-           ## if on_screenshot;
-           ## take screenshot (30 sec)
-           ## run behavior (3 min)
-           ## outlinks += retrieve_outlinks (60 sec)
+           self.visit_hashtags(page_url, hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
        except brozzler.ReachedLimit:
@ -454,6 +450,29 @@ class Browser:
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

+   def visit_hashtags(self, page_url, hashtags, outlinks):
+       _hashtags = set(hashtags or [])
+       for outlink in outlinks:
+           url = urlcanon.whatwg(outlink)
+           hashtag = (url.hash_sign + url.fragment).decode('utf-8')
+           urlcanon.canon.remove_fragment(url)
+           if hashtag and str(url) == page_url:
+               _hashtags.add(hashtag)
+       # could inject a script that listens for HashChangeEvent to figure
+       # out which hashtags were visited already and skip those
+       for hashtag in _hashtags:
+           # navigate_to_hashtag (nothing to wait for so no timeout?)
+           self.logger.debug('navigating to hashtag %s', hashtag)
+           url = urlcanon.whatwg(page_url)
+           url.hash_sign = b'#'
+           url.fragment = hashtag[1:].encode('utf-8')
+           self.send_to_chrome(
+                   method='Page.navigate', params={'url': str(url)})
+           time.sleep(5) # um.. wait for idleness or something?
+           # take another screenshot?
+           # run behavior again with short timeout?
+           # retrieve outlinks again and append to list?
+
    def navigate_to_page(
            self, page_url, extra_headers=None, user_agent=None, timeout=300):
        headers = extra_headers or {}

View File

@ -266,8 +266,11 @@ class RethinkDbFrontier:
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
+           hashtag = (url_for_crawling.hash_sign
+                   + url_for_crawling.fragment).decode('utf-8')
+           urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-               if brozzler.is_permitted_by_robots(site, url):
+               if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
@ -283,9 +286,17 @@ class RethinkDbFrontier:
                                self.rr, new_child_page.id)
                        if existing_child_page:
                            existing_child_page.priority += new_child_page.priority
+                           if hashtag and existing_child_page.hashtags:
+                               hashtags = set(existing_child_page.hashtags)
+                               hashtags.add(hashtag)
+                               existing_child_page.hashtags = list(hashtags)
+                           elif hashtag:
+                               existing_child_page.hashtags = [hashtag]
                            existing_child_page.save()
                            counts["updated"] += 1
                        else:
+                           if hashtag:
+                               new_child_page.hashtags = [hashtag,]
                            new_child_page.save()
                            counts["added"] += 1
                        decisions["accepted"].add(str(url_for_crawling))

View File

@ -27,6 +27,7 @@ import doublethink
import os
import cerberus
import urllib
+import urlcanon

def load_schema():
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
+   # insert the Page into the database before the Site, to avoid situation
+   # where a brozzler worker immediately claims the site, finds no pages
+   # to crawl, and decides the site is finished
    try:
-       # insert the Page into the database before the Site, to avoid situation
-       # where a brozzler worker immediately claims the site, finds no pages
-       # to crawl, and decides the site is finished
-       try:
-           page = brozzler.Page(frontier.rr, {
-               "url": site.seed, "site_id": site.get("id"),
-               "job_id": site.get("job_id"), "hops_from_seed": 0,
-               "priority": 1000, "needs_robots_check": True})
-           page.save()
-           logging.info("queued page %s", page)
-       finally:
-           # finally block because we want to insert the Site no matter what
-           site.save()
-   except brozzler.ReachedLimit as e:
-       frontier.reached_limit(site, e)
+       url = urlcanon.parse_url(site.seed)
+       hashtag = (url.hash_sign + url.fragment).decode("utf-8")
+       urlcanon.canon.remove_fragment(url)
+       page = brozzler.Page(frontier.rr, {
+           "url": str(url), "site_id": site.get("id"),
+           "job_id": site.get("job_id"), "hops_from_seed": 0,
+           "priority": 1000, "needs_robots_check": True})
+       if hashtag:
+           page.hashtags = [hashtag,]
+       page.save()
+       logging.info("queued page %s", page)
+   finally:
+       # finally block because we want to insert the Site no matter what
+       site.save()

class Job(doublethink.Document):
    logger = logging.getLogger(__module__ + "." + __qualname__)
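Seeds get the same treatment: a fragment on the seed URL is stripped before the seed Page is created and is stored as that page's first hashtag (the page is still saved before the site, so a worker can never claim a site that has no pages). A small sketch of what the seed handling does, with a hypothetical seed URL and expected results noted in comments:

import urlcanon

seed = 'http://example.org/#hash'          # hypothetical seed
url = urlcanon.parse_url(seed)
hashtag = (url.hash_sign + url.fragment).decode('utf-8')
urlcanon.canon.remove_fragment(url)

print(str(url))   # expected: http://example.org/  (stored as the seed page's url)
print(hashtag)    # expected: #hash                (page.hashtags == ['#hash'])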

View File

@ -364,7 +364,8 @@ class BrozzlerWorker:
                behavior_parameters=site.get('behavior_parameters'),
                username=site.get('username'), password=site.get('password'),
                user_agent=site.get('user_agent'),
-               on_screenshot=_on_screenshot, on_response=_on_response)
+               on_screenshot=_on_screenshot, on_response=_on_response,
+               hashtags=page.hashtags)
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
        name='brozzler',
-       version='1.1b10.dev223',
+       version='1.1b10.dev224',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',

View File

@ -0,0 +1 @@
I AM A POINTED LITTLE FILE

View File

@ -0,0 +1,36 @@
<html>
<head>
<title>hashtag url test</title>
<script>
(function() {
let lastHash = null;
setInterval(function() {
const hash = new URL(document.URL).hash;
if (hash != lastHash && (hash == '#whee' || hash == '#boosh')) {
lastHash = hash;
const httpRequest = new XMLHttpRequest();
httpRequest.onreadystatechange = function() {
if (httpRequest.readyState === XMLHttpRequest.DONE) {
const e = document.createElement('p');
e.textContent = 'loaded from ' + hash.substring(1) + ': ' + httpRequest.responseText;
document.body.appendChild(e);
}
};
httpRequest.open('GET', hash.substring(1) + '.txt', true);
httpRequest.send(null);
}
}, 1000);
})();
</script>
</head>
<body>
<h1>hashtag url test</h1>
<div><a href='#boosh'>#boosh</a></div>
<div><a href='#ignored'>#ignored</a></div>
<p>this page will ajax load ./whee.txt if it notices the url in the
location bar has fragment "#whee", and ./boosh.txt if it notices
"#boosh"</p>
</body>
</html>

View File

@ -0,0 +1,10 @@
<html>
<head>
<title>link to hashtag url test</title>
<script>
</script>
</head>
<body>
<a href="foo.html#whee">foo.html#whee</a>
</body>
</html>

View File

@ -0,0 +1 @@
I AM A POINTLESS LITTLE FILE

View File

@ -453,3 +453,50 @@ def test_seed_redirect(httpd):
    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port

def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls

    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url

View File

@ -591,3 +591,67 @@ def test_seed_page():
    page0.save()
    assert frontier.seed_page(site.id) == page0

def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]

def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3

    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
            'http://example.org/', 'http://example.org/bar',
            'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz','#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12

View File

@ -7,6 +7,8 @@
cd $(dirname "${BASH_SOURCE[0]}")

+vagrant up

echo service status:
vagrant ssh -- 'status warcprox ;
        status Xvnc ;