new model for crawling hashtags, each one is no longer a top-level page

Noah Levitt 2017-03-27 12:15:49 -07:00
parent a836269e95
commit 3d47805ec1
12 changed files with 220 additions and 25 deletions
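In the new model, an outlink that differs from its page only by URL fragment no longer becomes a separate top-level page: the fragment is stripped before the page is scoped and deduplicated, the hashtag is recorded in the page's hashtags list, and the browser navigates to each recorded hashtag after brozzling the base page. A minimal sketch of the fragment handling with urlcanon, using a made-up URL for illustration (the printed values are the expected results, not output from this commit):

import urlcanon

# hypothetical outlink, for illustration only
outlink = 'http://example.org/foo.html#whee'

url = urlcanon.whatwg(outlink)
hashtag = (url.hash_sign + url.fragment).decode('utf-8')
urlcanon.canon.remove_fragment(url)

print(str(url))   # expected: http://example.org/foo.html  (becomes, or merges into, the Page)
print(hashtag)    # expected: #whee                        (recorded in page.hashtags)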

View File

@ -30,6 +30,7 @@ import datetime
import base64
from brozzler.chrome import Chrome
import socket
+import urlcanon

class BrowsingException(Exception):
    pass
@ -374,7 +375,7 @@ class Browser:
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
-           username=None, password=None):
+           username=None, password=None, hashtags=None):
        '''
        Browses page in browser.
@ -434,12 +435,7 @@ class Browser:
                    page_url, behavior_parameters)
            self.run_behavior(behavior_script, timeout=900)
            outlinks = self.extract_outlinks()
-           ## for each hashtag not already visited:
-           ## navigate_to_hashtag (nothing to wait for so no timeout?)
-           ## if on_screenshot;
-           ## take screenshot (30 sec)
-           ## run behavior (3 min)
-           ## outlinks += retrieve_outlinks (60 sec)
+           self.visit_hashtags(page_url, hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
        except brozzler.ReachedLimit:
@ -454,6 +450,29 @@ class Browser:
            self.websock_thread.on_request = None
            self.websock_thread.on_response = None

+   def visit_hashtags(self, page_url, hashtags, outlinks):
+       _hashtags = set(hashtags or [])
+       for outlink in outlinks:
+           url = urlcanon.whatwg(outlink)
+           hashtag = (url.hash_sign + url.fragment).decode('utf-8')
+           urlcanon.canon.remove_fragment(url)
+           if hashtag and str(url) == page_url:
+               _hashtags.add(hashtag)
+       # could inject a script that listens for HashChangeEvent to figure
+       # out which hashtags were visited already and skip those
+       for hashtag in _hashtags:
+           # navigate_to_hashtag (nothing to wait for so no timeout?)
+           self.logger.debug('navigating to hashtag %s', hashtag)
+           url = urlcanon.whatwg(page_url)
+           url.hash_sign = b'#'
+           url.fragment = hashtag[1:].encode('utf-8')
+           self.send_to_chrome(
+                   method='Page.navigate', params={'url': str(url)})
+           time.sleep(5) # um.. wait for idleness or something?
+           # take another screenshot?
+           # run behavior again with short timeout?
+           # retrieve outlinks again and append to list?
+
    def navigate_to_page(
            self, page_url, extra_headers=None, user_agent=None, timeout=300):
        headers = extra_headers or {}

View File

@ -266,8 +266,11 @@ class RethinkDbFrontier:
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
+           hashtag = (url_for_crawling.hash_sign
+                   + url_for_crawling.fragment).decode('utf-8')
+           urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-               if brozzler.is_permitted_by_robots(site, url):
+               if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1
@ -283,9 +286,17 @@ class RethinkDbFrontier:
                                self.rr, new_child_page.id)
                        if existing_child_page:
                            existing_child_page.priority += new_child_page.priority
+                           if hashtag and existing_child_page.hashtags:
+                               hashtags = set(existing_child_page.hashtags)
+                               hashtags.add(hashtag)
+                               existing_child_page.hashtags = list(hashtags)
+                           elif hashtag:
+                               existing_child_page.hashtags = [hashtag]
                            existing_child_page.save()
                            counts["updated"] += 1
                        else:
+                           if hashtag:
+                               new_child_page.hashtags = [hashtag,]
                            new_child_page.save()
                            counts["added"] += 1
                        decisions["accepted"].add(str(url_for_crawling))

View File

@ -27,6 +27,7 @@ import doublethink
import os
import cerberus
import urllib
+import urlcanon

def load_schema():
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
+   # insert the Page into the database before the Site, to avoid situation
+   # where a brozzler worker immediately claims the site, finds no pages
+   # to crawl, and decides the site is finished
    try:
-       # insert the Page into the database before the Site, to avoid situation
-       # where a brozzler worker immediately claims the site, finds no pages
-       # to crawl, and decides the site is finished
-       try:
-           page = brozzler.Page(frontier.rr, {
-               "url": site.seed, "site_id": site.get("id"),
-               "job_id": site.get("job_id"), "hops_from_seed": 0,
-               "priority": 1000, "needs_robots_check": True})
-           page.save()
-           logging.info("queued page %s", page)
-       finally:
-           # finally block because we want to insert the Site no matter what
-           site.save()
-   except brozzler.ReachedLimit as e:
-       frontier.reached_limit(site, e)
+       url = urlcanon.parse_url(site.seed)
+       hashtag = (url.hash_sign + url.fragment).decode("utf-8")
+       urlcanon.canon.remove_fragment(url)
+       page = brozzler.Page(frontier.rr, {
+           "url": str(url), "site_id": site.get("id"),
+           "job_id": site.get("job_id"), "hops_from_seed": 0,
+           "priority": 1000, "needs_robots_check": True})
+       if hashtag:
+           page.hashtags = [hashtag,]
+       page.save()
+       logging.info("queued page %s", page)
+   finally:
+       # finally block because we want to insert the Site no matter what
+       site.save()

class Job(doublethink.Document):
    logger = logging.getLogger(__module__ + "." + __qualname__)
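Seeds get the same treatment: a fragment on the seed URL is stripped before the seed Page is created and is stored as that page's first hashtag (the page is still saved before the site, so a worker can never claim a site that has no pages). A small sketch of what the seed handling does, with a hypothetical seed URL and expected results noted in comments:

import urlcanon

seed = 'http://example.org/#hash'          # hypothetical seed
url = urlcanon.parse_url(seed)
hashtag = (url.hash_sign + url.fragment).decode('utf-8')
urlcanon.canon.remove_fragment(url)

print(str(url))   # expected: http://example.org/  (stored as the seed page's url)
print(hashtag)    # expected: #hash                (page.hashtags == ['#hash'])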

View File

@ -364,7 +364,8 @@ class BrozzlerWorker:
                behavior_parameters=site.get('behavior_parameters'),
                username=site.get('username'), password=site.get('password'),
                user_agent=site.get('user_agent'),
-               on_screenshot=_on_screenshot, on_response=_on_response)
+               on_screenshot=_on_screenshot, on_response=_on_response,
+               hashtags=page.hashtags)
        if final_page_url != page.url:
            page.note_redirect(final_page_url)
        return outlinks

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
        name='brozzler',
-       version='1.1b10.dev223',
+       version='1.1b10.dev224',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',

View File

@ -0,0 +1 @@
I AM A POINTED LITTLE FILE

View File

@ -0,0 +1,36 @@
<html>
<head>
<title>hashtag url test</title>
<script>
(function() {
let lastHash = null;
setInterval(function() {
const hash = new URL(document.URL).hash;
if (hash != lastHash && (hash == '#whee' || hash == '#boosh')) {
lastHash = hash;
const httpRequest = new XMLHttpRequest();
httpRequest.onreadystatechange = function() {
if (httpRequest.readyState === XMLHttpRequest.DONE) {
const e = document.createElement('p');
e.textContent = 'loaded from ' + hash.substring(1) + ': ' + httpRequest.responseText;
document.body.appendChild(e);
}
};
httpRequest.open('GET', hash.substring(1) + '.txt', true);
httpRequest.send(null);
}
}, 1000);
})();
</script>
</head>
<body>
<h1>hashtag url test</h1>
<div><a href='#boosh'>#boosh</a></div>
<div><a href='#ignored'>#ignored</a></div>
<p>this page will ajax load ./whee.txt if it notices the url in the
location bar has fragment "#whee", and ./boosh.txt if it notices
"#boosh"</p>
</body>
</html>

View File

@ -0,0 +1,10 @@
<html>
<head>
<title>link to hashtag url test</title>
<script>
</script>
</head>
<body>
<a href="foo.html#whee">foo.html#whee</a>
</body>
</html>

View File

@ -0,0 +1 @@
I AM A POINTLESS LITTLE FILE

View File

@ -453,3 +453,50 @@ def test_seed_redirect(httpd):
    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port

def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls

    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url

View File

@ -591,3 +591,67 @@ def test_seed_page():
    page0.save()
    assert frontier.seed_page(site.id) == page0

def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]

def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3

    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
            'http://example.org/', 'http://example.org/bar',
            'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz','#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12

View File

@ -7,6 +7,8 @@
cd $(dirname "${BASH_SOURCE[0]}")

+vagrant up

echo service status:
vagrant ssh -- 'status warcprox ;
        status Xvnc ;