Mirror of https://github.com/internetarchive/brozzler.git
new model for crawling hashtags, each one is no longer a top-level page
parent a836269e95
commit 3d47805ec1
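The gist of the change: a URL fragment such as the "#baz" in http://example.org/bar#baz is no longer queued as its own top-level page. Instead the fragment is split off with urlcanon, the fragment-free URL becomes (or updates) the page record, the fragment is accumulated in that page's hashtags list, and the browser re-navigates to each recorded hashtag after brozzling the page. A minimal sketch of the fragment-splitting step, using the same urlcanon calls that appear in the diff below (the helper name split_hashtag is illustrative, not part of brozzler):

import urlcanon

def split_hashtag(url):
    # canonicalize, pull off the '#fragment' part, and return the
    # fragment-free url plus the hashtag ('' if there is none)
    parsed = urlcanon.whatwg(url)
    hashtag = (parsed.hash_sign + parsed.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(parsed)
    return str(parsed), hashtag

# expected: ('http://example.org/bar', '#baz')
print(split_hashtag('http://example.org/bar#baz'))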
@@ -30,6 +30,7 @@ import datetime
import base64
from brozzler.chrome import Chrome
import socket
import urlcanon

class BrowsingException(Exception):
    pass

@@ -374,7 +375,7 @@ class Browser:
            self, page_url, ignore_cert_errors=False, extra_headers=None,
            user_agent=None, behavior_parameters=None,
            on_request=None, on_response=None, on_screenshot=None,
            username=None, password=None):
            username=None, password=None, hashtags=None):
        '''
        Browses page in browser.

@@ -434,12 +435,7 @@ class Browser:
                    page_url, behavior_parameters)
            self.run_behavior(behavior_script, timeout=900)
            outlinks = self.extract_outlinks()
            ## for each hashtag not already visited:
            ## navigate_to_hashtag (nothing to wait for so no timeout?)
            ## if on_screenshot;
            ## take screenshot (30 sec)
            ## run behavior (3 min)
            ## outlinks += retrieve_outlinks (60 sec)
            self.visit_hashtags(page_url, hashtags, outlinks)
            final_page_url = self.url()
            return final_page_url, outlinks
        except brozzler.ReachedLimit:

@@ -454,6 +450,29 @@ class Browser:
        self.websock_thread.on_request = None
        self.websock_thread.on_response = None

    def visit_hashtags(self, page_url, hashtags, outlinks):
        _hashtags = set(hashtags or [])
        for outlink in outlinks:
            url = urlcanon.whatwg(outlink)
            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url)
            if hashtag and str(url) == page_url:
                _hashtags.add(hashtag)
        # could inject a script that listens for HashChangeEvent to figure
        # out which hashtags were visited already and skip those
        for hashtag in _hashtags:
            # navigate_to_hashtag (nothing to wait for so no timeout?)
            self.logger.debug('navigating to hashtag %s', hashtag)
            url = urlcanon.whatwg(page_url)
            url.hash_sign = b'#'
            url.fragment = hashtag[1:].encode('utf-8')
            self.send_to_chrome(
                    method='Page.navigate', params={'url': str(url)})
            time.sleep(5) # um.. wait for idleness or something?
            # take another screenshot?
            # run behavior again with short timeout?
            # retrieve outlinks again and append to list?

    def navigate_to_page(
            self, page_url, extra_headers=None, user_agent=None, timeout=300):
        headers = extra_headers or {}
@@ -266,8 +266,11 @@ class RethinkDbFrontier:
        for url in outlinks or []:
            url_for_scoping = urlcanon.semantic(url)
            url_for_crawling = urlcanon.whatwg(url)
            hashtag = (url_for_crawling.hash_sign
                    + url_for_crawling.fragment).decode('utf-8')
            urlcanon.canon.remove_fragment(url_for_crawling)
            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                if brozzler.is_permitted_by_robots(site, url):
                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                    if not url_for_scoping.surt().startswith(
                            site.scope["surt"].encode("utf-8")):
                        hops_off_surt = parent_page.hops_off_surt + 1

@@ -283,9 +286,17 @@ class RethinkDbFrontier:
                            self.rr, new_child_page.id)
                    if existing_child_page:
                        existing_child_page.priority += new_child_page.priority
                        if hashtag and existing_child_page.hashtags:
                            hashtags = set(existing_child_page.hashtags)
                            hashtags.add(hashtag)
                            existing_child_page.hashtags = list(hashtags)
                        elif hashtag:
                            existing_child_page.hashtags = [hashtag]
                        existing_child_page.save()
                        counts["updated"] += 1
                    else:
                        if hashtag:
                            new_child_page.hashtags = [hashtag,]
                        new_child_page.save()
                        counts["added"] += 1
                    decisions["accepted"].add(str(url_for_crawling))
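In other words, when an outlink's fragment-free URL matches a child page that is already queued, the new fragment is merged into that page's existing hashtags rather than producing another top-level page. A standalone sketch of that merge step (merge_hashtags is a hypothetical name, not code from this commit; the frontier keeps an unsorted list, sorting here just gives a stable result):

def merge_hashtags(existing_hashtags, hashtag):
    # an already-known child page accumulates any new fragment seen on
    # an outlink that points at the same fragment-free url
    hashtags = set(existing_hashtags or [])
    if hashtag:
        hashtags.add(hashtag)
    return sorted(hashtags)

assert merge_hashtags(['#baz'], '#quux') == ['#baz', '#quux']
assert merge_hashtags(None, '#foo') == ['#foo']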
@@ -27,6 +27,7 @@ import doublethink
import os
import cerberus
import urllib
import urlcanon

def load_schema():
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')

@@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        # insert the Page into the database before the Site, to avoid situation
        # where a brozzler worker immediately claims the site, finds no pages
        # to crawl, and decides the site is finished
        try:
            page = brozzler.Page(frontier.rr, {
                "url": site.seed, "site_id": site.get("id"),
                "job_id": site.get("job_id"), "hops_from_seed": 0,
                "priority": 1000, "needs_robots_check": True})
            page.save()
            logging.info("queued page %s", page)
        finally:
            # finally block because we want to insert the Site no matter what
            site.save()
    except brozzler.ReachedLimit as e:
        frontier.reached_limit(site, e)
            url = urlcanon.parse_url(site.seed)
            hashtag = (url.hash_sign + url.fragment).decode("utf-8")
            urlcanon.canon.remove_fragment(url)
            page = brozzler.Page(frontier.rr, {
                "url": str(url), "site_id": site.get("id"),
                "job_id": site.get("job_id"), "hops_from_seed": 0,
                "priority": 1000, "needs_robots_check": True})
            if hashtag:
                page.hashtags = [hashtag,]
            page.save()
            logging.info("queued page %s", page)
        finally:
            # finally block because we want to insert the Site no matter what
            site.save()

class Job(doublethink.Document):
    logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -364,7 +364,8 @@ class BrozzlerWorker:
                    behavior_parameters=site.get('behavior_parameters'),
                    username=site.get('username'), password=site.get('password'),
                    user_agent=site.get('user_agent'),
                    on_screenshot=_on_screenshot, on_response=_on_response)
                    on_screenshot=_on_screenshot, on_response=_on_response,
                    hashtags=page.hashtags)
            if final_page_url != page.url:
                page.note_redirect(final_page_url)
            return outlinks
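Taken together with the browser changes above, the worker side of the new model is just one extra keyword argument: the page's recorded hashtags are forwarded into browse_page, roughly as follows (argument shapes taken from the hunks above, with the other site-level arguments omitted):

final_page_url, outlinks = browser.browse_page(
        page.url,
        on_screenshot=_on_screenshot, on_response=_on_response,
        hashtags=page.hashtags)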
setup.py (2 lines changed)

@@ -32,7 +32,7 @@ def find_package_data(package):

setuptools.setup(
        name='brozzler',
        version='1.1b10.dev223',
        version='1.1b10.dev224',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
tests/htdocs/site7/boosh.txt (new file, 1 line)

@@ -0,0 +1 @@
I AM A POINTED LITTLE FILE
tests/htdocs/site7/foo.html (new file, 36 lines)

@@ -0,0 +1,36 @@
<html>
<head>
<title>hashtag url test</title>
<script>
(function() {
    let lastHash = null;
    setInterval(function() {
        const hash = new URL(document.URL).hash;
        if (hash != lastHash && (hash == '#whee' || hash == '#boosh')) {
            lastHash = hash;

            const httpRequest = new XMLHttpRequest();
            httpRequest.onreadystatechange = function() {
                if (httpRequest.readyState === XMLHttpRequest.DONE) {
                    const e = document.createElement('p');
                    e.textContent = 'loaded from ' + hash.substring(1) + ': ' + httpRequest.responseText;
                    document.body.appendChild(e);
                }
            };

            httpRequest.open('GET', hash.substring(1) + '.txt', true);
            httpRequest.send(null);
        }
    }, 1000);
})();
</script>
</head>
<body>
<h1>hashtag url test</h1>
<div><a href='#boosh'>#boosh</a></div>
<div><a href='#ignored'>#ignored</a></div>
<p>this page will ajax load ./whee.txt if it notices the url in the
location bar has fragment "#whee", and ./boosh.txt if it notices
"#boosh"</p>
</body>
</html>
tests/htdocs/site7/index.html (new file, 10 lines)

@@ -0,0 +1,10 @@
<html>
<head>
<title>link to hashtag url test</title>
<script>
</script>
</head>
<body>
<a href="foo.html#whee">foo.html#whee</a>
</body>
</html>
tests/htdocs/site7/whee.txt (new file, 1 line)

@@ -0,0 +1 @@
I AM A POINTLESS LITTLE FILE
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port

def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2) # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
@@ -591,3 +591,67 @@ def test_seed_page():
    page0.save()

    assert frontier.seed_page(site.id) == page0

def test_hashtag_seed():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]

def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz','#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
@@ -7,6 +7,8 @@

cd $(dirname "${BASH_SOURCE[0]}")

vagrant up

echo service status:
vagrant ssh -- 'status warcprox ;
        status Xvnc ;