From 3d47805ec1f7411efbabf98de3efef594730db79 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 27 Mar 2017 12:15:49 -0700
Subject: [PATCH] new model for crawling hashtags, each one is no longer a
 top-level page

---
 brozzler/browser.py           | 33 ++++++++++++++++++++++++++++-------
 brozzler/frontier.py          | 13 ++++++++++++-
 brozzler/job.py               | 33 ++++++++++++++++---------------
 brozzler/worker.py            |  3 ++-
 setup.py                      |  2 +-
 tests/htdocs/site7/boosh.txt  |  1 +
 tests/htdocs/site7/foo.html   | 36 ++++++++++++++++++++++++++++++++++++
 tests/htdocs/site7/index.html | 10 ++++++++++
 tests/htdocs/site7/whee.txt   |  1 +
 tests/test_cluster.py         | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/test_frontier.py        | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 vagrant/run-tests.sh          |  2 ++
 12 files changed, 220 insertions(+), 25 deletions(-)
 create mode 100644 tests/htdocs/site7/boosh.txt
 create mode 100644 tests/htdocs/site7/foo.html
 create mode 100644 tests/htdocs/site7/index.html
 create mode 100644 tests/htdocs/site7/whee.txt

diff --git a/brozzler/browser.py b/brozzler/browser.py
index 5b9924b..e5b236f 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -30,6 +30,7 @@ import datetime
 import base64
 from brozzler.chrome import Chrome
 import socket
+import urlcanon
 
 class BrowsingException(Exception):
     pass
@@ -374,7 +375,7 @@ class Browser:
             self, page_url, ignore_cert_errors=False, extra_headers=None,
             user_agent=None, behavior_parameters=None, on_request=None,
             on_response=None, on_screenshot=None,
-            username=None, password=None):
+            username=None, password=None, hashtags=None):
         '''
         Browses page in browser.
 
@@ -434,12 +435,7 @@ class Browser:
                         page_url, behavior_parameters)
                 self.run_behavior(behavior_script, timeout=900)
             outlinks = self.extract_outlinks()
-            ## for each hashtag not already visited:
-            ##     navigate_to_hashtag (nothing to wait for so no timeout?)
-            ##     if on_screenshot;
-            ##         take screenshot (30 sec)
-            ##     run behavior (3 min)
-            ##     outlinks += retrieve_outlinks (60 sec)
+            self.visit_hashtags(page_url, hashtags, outlinks)
             final_page_url = self.url()
             return final_page_url, outlinks
         except brozzler.ReachedLimit:
@@ -454,6 +450,29 @@ class Browser:
             self.websock_thread.on_request = None
             self.websock_thread.on_response = None
 
+    def visit_hashtags(self, page_url, hashtags, outlinks):
+        _hashtags = set(hashtags or [])
+        for outlink in outlinks:
+            url = urlcanon.whatwg(outlink)
+            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url)
+            if hashtag and str(url) == page_url:
+                _hashtags.add(hashtag)
+        # could inject a script that listens for HashChangeEvent to figure
+        # out which hashtags were visited already and skip those
+        for hashtag in _hashtags:
+            # navigate_to_hashtag (nothing to wait for so no timeout?)
+            self.logger.debug('navigating to hashtag %s', hashtag)
+            url = urlcanon.whatwg(page_url)
+            url.hash_sign = b'#'
+            url.fragment = hashtag[1:].encode('utf-8')
+            self.send_to_chrome(
+                    method='Page.navigate', params={'url': str(url)})
+            time.sleep(5) # um.. wait for idleness or something?
+            # take another screenshot?
+            # run behavior again with short timeout?
+            # retrieve outlinks again and append to list?
+
     def navigate_to_page(
             self, page_url, extra_headers=None, user_agent=None, timeout=300):
         headers = extra_headers or {}
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 215ee6c..700ec0d 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -266,8 +266,11 @@ class RethinkDbFrontier:
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
+            hashtag = (url_for_crawling.hash_sign
+                    + url_for_crawling.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url_for_crawling)
             if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-                if brozzler.is_permitted_by_robots(site, url):
+                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                     if not url_for_scoping.surt().startswith(
                             site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
@@ -283,9 +286,17 @@ class RethinkDbFrontier:
                                 self.rr, new_child_page.id)
                         if existing_child_page:
                             existing_child_page.priority += new_child_page.priority
+                            if hashtag and existing_child_page.hashtags:
+                                hashtags = set(existing_child_page.hashtags)
+                                hashtags.add(hashtag)
+                                existing_child_page.hashtags = list(hashtags)
+                            elif hashtag:
+                                existing_child_page.hashtags = [hashtag]
                             existing_child_page.save()
                             counts["updated"] += 1
                         else:
+                            if hashtag:
+                                new_child_page.hashtags = [hashtag,]
                             new_child_page.save()
                             counts["added"] += 1
                 decisions["accepted"].add(str(url_for_crawling))
diff --git a/brozzler/job.py b/brozzler/job.py
index 0120dcb..ac001f1 100644
--- a/brozzler/job.py
+++ b/brozzler/job.py
@@ -27,6 +27,7 @@ import doublethink
 import os
 import cerberus
 import urllib
+import urlcanon
 
 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
 def new_site(frontier, site):
     site.id = str(uuid.uuid4())
     logging.info("new site {}".format(site))
+    # insert the Page into the database before the Site, to avoid situation
+    # where a brozzler worker immediately claims the site, finds no pages
+    # to crawl, and decides the site is finished
     try:
-        # insert the Page into the database before the Site, to avoid situation
-        # where a brozzler worker immediately claims the site, finds no pages
-        # to crawl, and decides the site is finished
-        try:
-            page = brozzler.Page(frontier.rr, {
-                "url": site.seed, "site_id": site.get("id"),
-                "job_id": site.get("job_id"), "hops_from_seed": 0,
-                "priority": 1000, "needs_robots_check": True})
-            page.save()
-            logging.info("queued page %s", page)
-        finally:
-            # finally block because we want to insert the Site no matter what
-            site.save()
-    except brozzler.ReachedLimit as e:
-        frontier.reached_limit(site, e)
+        url = urlcanon.parse_url(site.seed)
+        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
+        urlcanon.canon.remove_fragment(url)
+        page = brozzler.Page(frontier.rr, {
+            "url": str(url), "site_id": site.get("id"),
+            "job_id": site.get("job_id"), "hops_from_seed": 0,
+            "priority": 1000, "needs_robots_check": True})
+        if hashtag:
+            page.hashtags = [hashtag,]
+        page.save()
+        logging.info("queued page %s", page)
+    finally:
+        # finally block because we want to insert the Site no matter what
+        site.save()
 
 class Job(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
diff --git a/brozzler/worker.py b/brozzler/worker.py
index b9ad609..e604684 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -364,7 +364,8 @@ class BrozzlerWorker:
                     behavior_parameters=site.get('behavior_parameters'),
                     username=site.get('username'), password=site.get('password'),
                     user_agent=site.get('user_agent'),
-                    on_screenshot=_on_screenshot, on_response=_on_response)
+                    on_screenshot=_on_screenshot, on_response=_on_response,
+                    hashtags=page.hashtags)
                 if final_page_url != page.url:
                     page.note_redirect(final_page_url)
                 return outlinks
diff --git a/setup.py b/setup.py
index ad9eea4..d4d70b6 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b10.dev223',
+        version='1.1b10.dev224',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/htdocs/site7/boosh.txt b/tests/htdocs/site7/boosh.txt
new file mode 100644
index 0000000..8a95b88
--- /dev/null
+++ b/tests/htdocs/site7/boosh.txt
@@ -0,0 +1 @@
+I AM A POINTED LITTLE FILE
diff --git a/tests/htdocs/site7/foo.html b/tests/htdocs/site7/foo.html
new file mode 100644
index 0000000..7d5837f
--- /dev/null
+++ b/tests/htdocs/site7/foo.html
@@ -0,0 +1,36 @@
+<html>
+<head>
+<title>hashtag url test</title>
+<script>
+function ajax_load(url) {
+    var xhr = new XMLHttpRequest();
+    xhr.open('GET', url);
+    xhr.send();
+}
+function check_hash() {
+    if (window.location.hash == '#whee') {
+        ajax_load('./whee.txt');
+    } else if (window.location.hash == '#boosh') {
+        ajax_load('./boosh.txt');
+    }
+}
+window.addEventListener('hashchange', check_hash);
+window.addEventListener('load', check_hash);
+</script>
+</head>
+
+<body>
+<h1>hashtag url test</h1>
+<p>
+<a href="#boosh">#boosh</a>
+</p>
+<p>
+<a href="#ignored">#ignored</a>
+</p>
+<p>
+this page will ajax load ./whee.txt if it notices the url in the
+location bar has fragment "#whee", and ./boosh.txt if it notices
+"#boosh"
+</p>
+</body>
+</html>
diff --git a/tests/htdocs/site7/index.html b/tests/htdocs/site7/index.html
new file mode 100644
index 0000000..4cd9491
--- /dev/null
+++ b/tests/htdocs/site7/index.html
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>link to hashtag url test</title>
+</head>
+<body>
+<p>
+<a href="foo.html#whee">foo.html#whee</a>
+</p>
+</body>
+</html>
diff --git a/tests/htdocs/site7/whee.txt b/tests/htdocs/site7/whee.txt
new file mode 100644
index 0000000..c979c72
--- /dev/null
+++ b/tests/htdocs/site7/whee.txt
@@ -0,0 +1 @@
+I AM A POINTLESS LITTLE FILE
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 0f1e9e6..750315a 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):
 
     # check that scope has been updated properly
     assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
+
+def test_hashtags(httpd):
+    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 2
+    assert pages[0].url == seed_url
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].brozzle_count == 1
+    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
+    assert not pages[0].hashtags
+    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].brozzle_count == 1
+    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
+
+    time.sleep(2)   # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = rr.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
+    assert seed_url in captures_by_url
+    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
+    assert 'screenshot:%s' % seed_url in captures_by_url
+    assert 'thumbnail:%s' % seed_url in captures_by_url
+    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index af4b9a6..c4166bc 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -591,3 +591,67 @@ def test_seed_page():
     page0.save()
 
     assert frontier.seed_page(site.id) == page0
+
+def test_hashtag_seed():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # no hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert not pages[0].hashtags
+
+    # yes hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert pages[0].hashtags == ['#hash',]
+
+def test_hashtag_links():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+    parent_page = frontier.seed_page(site.id)
+    assert not parent_page.hashtags
+    outlinks = [
+        'http://example.org/#foo',
+        'http://example.org/bar',
+        'http://example.org/bar#baz',
+        'http://example.org/bar#quux',
+        'http://example.org/zuh#buh',
+    ]
+    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 3
+    assert pages[0].url == 'http://example.org/'
+    assert sorted(pages[0].outlinks['accepted']) == [
+            'http://example.org/', 'http://example.org/bar',
+            'http://example.org/zuh']
+    assert not pages[0].outlinks['blocked']
+    assert not pages[0].outlinks['rejected']
+    assert pages[0].hashtags == ['#foo',]
+    assert pages[0].hops_from_seed == 0
+
+    assert pages[1].url == 'http://example.org/bar'
+    assert sorted(pages[1].hashtags) == ['#baz','#quux']
+    assert pages[1].priority == 36
+    assert pages[1].hops_from_seed == 1
+
+    assert pages[2].url == 'http://example.org/zuh'
+    assert pages[2].hashtags == ['#buh']
+    assert pages[2].priority == 12
+
diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh
index 5058e62..122286d 100755
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@@ -7,6 +7,8 @@
 
 cd $(dirname "${BASH_SOURCE[0]}")
 
+vagrant up
+
 echo service status:
 vagrant ssh -- 'status warcprox ; status Xvnc ;
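
Note on the fragment handling that recurs in this patch (browser.py, frontier.py,
job.py): it follows one pattern throughout -- read the fragment off the parsed
url as bytes, strip it in place, and key the page on the fragmentless url while
recording the hashtag on the Page. A minimal sketch of that pattern using the
same urlcanon calls the patch uses; the example url is illustrative, not taken
from the tests:

    import urlcanon

    url = urlcanon.whatwg('http://example.org/foo.html#boosh')

    # hash_sign and fragment are bytes fields on the parsed url
    hashtag = (url.hash_sign + url.fragment).decode('utf-8')
    assert hashtag == '#boosh'

    # remove_fragment() strips the fragment in place, so the page is keyed
    # on the bare url and '#boosh' is tracked in page.hashtags instead
    urlcanon.canon.remove_fragment(url)
    assert str(url) == 'http://example.org/foo.html'

This is why a seed of http://example.org/#hash yields a single page whose url
is http://example.org/ with hashtags ['#hash'], as test_hashtag_seed asserts,
and why visit_hashtags() re-navigates to page_url plus each recorded fragment
rather than treating each hashtag variant as its own top-level page.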