new model for crawling hashtags, each one is no longer a top-level page

2025-12-14 16:19:00 -05:00 · 2017-03-27 12:15:49 -07:00 · 2017-03-27 12:15:49 -07:00 · 3d47805ec1
commit 3d47805ec1
parent a836269e95
12 changed files with 220 additions and 25 deletions
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
+
+def test_hashtags(httpd):
+    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 2
+    assert pages[0].url == seed_url
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].brozzle_count == 1
+    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
+    assert not pages[0].hashtags
+    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].brozzle_count == 1
+    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
+
+    time.sleep(2)   # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = rr.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
+    assert seed_url in captures_by_url
+    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
+    assert 'screenshot:%s' % seed_url in captures_by_url
+    assert 'thumbnail:%s' % seed_url in captures_by_url
+    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+