mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00

commit 3d47805ec1 (parent a836269e95)
new model for crawling hashtags, each one is no longer a top-level page
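Under the old model, an outlink such as http://example.org/bar#baz would have been queued as its own top-level page. With this change the fragment is stripped before a page is queued or brozzled, and the hashtag is instead recorded in a hashtags list on the fragment-free Page; the browser later re-navigates to each recorded hashtag while it still has that page loaded. The same split-off-the-fragment pattern recurs in the frontier, in new_site, and in the new Browser.visit_hashtags method below. A minimal standalone sketch of that pattern, using only the urlcanon calls that appear in this diff (the helper name is illustrative, not part of the commit):

    import urlcanon

    def split_hashtag(url_str):
        # separate a url into its fragment-free form and its '#fragment'
        # (hashtag is '' when the url has no fragment)
        url = urlcanon.whatwg(url_str)
        hashtag = (url.hash_sign + url.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url)
        return str(url), hashtag

    # split_hashtag('http://example.org/bar#baz')
    #   -> ('http://example.org/bar', '#baz')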
@@ -30,6 +30,7 @@ import datetime
 import base64
 from brozzler.chrome import Chrome
 import socket
+import urlcanon

 class BrowsingException(Exception):
     pass
@@ -374,7 +375,7 @@ class Browser:
             self, page_url, ignore_cert_errors=False, extra_headers=None,
             user_agent=None, behavior_parameters=None,
             on_request=None, on_response=None, on_screenshot=None,
-            username=None, password=None):
+            username=None, password=None, hashtags=None):
         '''
         Browses page in browser.

@@ -434,12 +435,7 @@ class Browser:
                         page_url, behavior_parameters)
                 self.run_behavior(behavior_script, timeout=900)
             outlinks = self.extract_outlinks()
-            ## for each hashtag not already visited:
-            ##   navigate_to_hashtag (nothing to wait for so no timeout?)
-            ##   if on_screenshot;
-            ##     take screenshot (30 sec)
-            ##   run behavior (3 min)
-            ##   outlinks += retrieve_outlinks (60 sec)
+            self.visit_hashtags(page_url, hashtags, outlinks)
             final_page_url = self.url()
             return final_page_url, outlinks
         except brozzler.ReachedLimit:
@@ -454,6 +450,29 @@ class Browser:
             self.websock_thread.on_request = None
             self.websock_thread.on_response = None

+    def visit_hashtags(self, page_url, hashtags, outlinks):
+        _hashtags = set(hashtags or [])
+        for outlink in outlinks:
+            url = urlcanon.whatwg(outlink)
+            hashtag = (url.hash_sign + url.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url)
+            if hashtag and str(url) == page_url:
+                _hashtags.add(hashtag)
+        # could inject a script that listens for HashChangeEvent to figure
+        # out which hashtags were visited already and skip those
+        for hashtag in _hashtags:
+            # navigate_to_hashtag (nothing to wait for so no timeout?)
+            self.logger.debug('navigating to hashtag %s', hashtag)
+            url = urlcanon.whatwg(page_url)
+            url.hash_sign = b'#'
+            url.fragment = hashtag[1:].encode('utf-8')
+            self.send_to_chrome(
+                    method='Page.navigate', params={'url': str(url)})
+            time.sleep(5) # um.. wait for idleness or something?
+            # take another screenshot?
+            # run behavior again with short timeout?
+            # retrieve outlinks again and append to list?
+
     def navigate_to_page(
             self, page_url, extra_headers=None, user_agent=None, timeout=300):
         headers = extra_headers or {}
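visit_hashtags only issues fragment navigations against the page that is already loaded: it collects hashtags from outlinks whose fragment-free form matches page_url, merges them with any hashtags already recorded on the Page, and then tells Chrome via Page.navigate to go to the same url with each fragment appended, sleeping a fixed five seconds after each one. A small sketch of the url rebuilding, assuming the urlcanon attributes used above (the helper is hypothetical, for illustration only):

    import urlcanon

    def url_with_hashtag(page_url, hashtag):
        # rebuild page_url with the given '#hashtag' as its fragment,
        # mirroring the loop body in Browser.visit_hashtags
        url = urlcanon.whatwg(page_url)
        url.hash_sign = b'#'
        url.fragment = hashtag[1:].encode('utf-8')
        return str(url)

    # url_with_hashtag('http://example.org/page', '#boosh')
    #   -> 'http://example.org/page#boosh'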
@@ -266,8 +266,11 @@ class RethinkDbFrontier:
         for url in outlinks or []:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
+            hashtag = (url_for_crawling.hash_sign
+                    + url_for_crawling.fragment).decode('utf-8')
+            urlcanon.canon.remove_fragment(url_for_crawling)
             if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-                if brozzler.is_permitted_by_robots(site, url):
+                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                     if not url_for_scoping.surt().startswith(
                             site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
@@ -283,9 +286,17 @@ class RethinkDbFrontier:
                                 self.rr, new_child_page.id)
                         if existing_child_page:
                             existing_child_page.priority += new_child_page.priority
+                            if hashtag and existing_child_page.hashtags:
+                                hashtags = set(existing_child_page.hashtags)
+                                hashtags.add(hashtag)
+                                existing_child_page.hashtags = list(hashtags)
+                            elif hashtag:
+                                existing_child_page.hashtags = [hashtag]
                             existing_child_page.save()
                             counts["updated"] += 1
                         else:
+                            if hashtag:
+                                new_child_page.hashtags = [hashtag,]
                             new_child_page.save()
                             counts["added"] += 1
                         decisions["accepted"].add(str(url_for_crawling))
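The effect of this bookkeeping is that each fragment-free url is queued only once, and every distinct hashtag seen for it accumulates on that single Page record (see test_hashtag_links further down: the outlinks bar#baz and bar#quux produce one page for http://example.org/bar carrying both hashtags). A sketch of the merge logic in isolation, with illustrative names:

    def merged_hashtags(existing, hashtag):
        # existing: the page's current hashtags list (possibly None/empty)
        # hashtag:  a '#fragment' string, or '' if the outlink had none
        hashtags = set(existing or [])
        if hashtag:
            hashtags.add(hashtag)
        return list(hashtags)

    # merged_hashtags(['#baz'], '#quux') -> ['#baz', '#quux'] (in some order)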
@@ -27,6 +27,7 @@ import doublethink
 import os
 import cerberus
 import urllib
+import urlcanon

 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@@ -94,22 +95,24 @@ def new_job(frontier, job_conf):
 def new_site(frontier, site):
     site.id = str(uuid.uuid4())
     logging.info("new site {}".format(site))
+    # insert the Page into the database before the Site, to avoid situation
+    # where a brozzler worker immediately claims the site, finds no pages
+    # to crawl, and decides the site is finished
     try:
-        # insert the Page into the database before the Site, to avoid situation
-        # where a brozzler worker immediately claims the site, finds no pages
-        # to crawl, and decides the site is finished
-        try:
-            page = brozzler.Page(frontier.rr, {
-                "url": site.seed, "site_id": site.get("id"),
-                "job_id": site.get("job_id"), "hops_from_seed": 0,
-                "priority": 1000, "needs_robots_check": True})
-            page.save()
-            logging.info("queued page %s", page)
-        finally:
-            # finally block because we want to insert the Site no matter what
-            site.save()
-    except brozzler.ReachedLimit as e:
-        frontier.reached_limit(site, e)
+        url = urlcanon.parse_url(site.seed)
+        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
+        urlcanon.canon.remove_fragment(url)
+        page = brozzler.Page(frontier.rr, {
+            "url": str(url), "site_id": site.get("id"),
+            "job_id": site.get("job_id"), "hops_from_seed": 0,
+            "priority": 1000, "needs_robots_check": True})
+        if hashtag:
+            page.hashtags = [hashtag,]
+        page.save()
+        logging.info("queued page %s", page)
+    finally:
+        # finally block because we want to insert the Site no matter what
+        site.save()

 class Job(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
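Seed urls get the same treatment: new_site now stores the seed's fragment-free url on the seed Page and records the fragment, if any, as that page's first hashtag. A worked example of the steps above, using the values exercised by test_hashtag_seed later in this commit:

    import urlcanon

    url = urlcanon.parse_url('http://example.org/#hash')
    hashtag = (url.hash_sign + url.fragment).decode('utf-8')  # '#hash'
    urlcanon.canon.remove_fragment(url)
    str(url)  # 'http://example.org/' -- the url stored on the seed Page
    # the Page is then saved with hashtags == ['#hash']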
@@ -364,7 +364,8 @@ class BrozzlerWorker:
                     behavior_parameters=site.get('behavior_parameters'),
                     username=site.get('username'), password=site.get('password'),
                     user_agent=site.get('user_agent'),
-                    on_screenshot=_on_screenshot, on_response=_on_response)
+                    on_screenshot=_on_screenshot, on_response=_on_response,
+                    hashtags=page.hashtags)
             if final_page_url != page.url:
                 page.note_redirect(final_page_url)
             return outlinks
setup.py (2 lines changed)
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b10.dev223',
+        version='1.1b10.dev224',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
tests/htdocs/site7/boosh.txt (new file, 1 line)
@@ -0,0 +1 @@
+I AM A POINTED LITTLE FILE
tests/htdocs/site7/foo.html (new file, 36 lines)
@@ -0,0 +1,36 @@
+<html>
+<head>
+<title>hashtag url test</title>
+<script>
+(function() {
+    let lastHash = null;
+    setInterval(function() {
+        const hash = new URL(document.URL).hash;
+        if (hash != lastHash && (hash == '#whee' || hash == '#boosh')) {
+            lastHash = hash;
+
+            const httpRequest = new XMLHttpRequest();
+            httpRequest.onreadystatechange = function() {
+                if (httpRequest.readyState === XMLHttpRequest.DONE) {
+                    const e = document.createElement('p');
+                    e.textContent = 'loaded from ' + hash.substring(1) + ': ' + httpRequest.responseText;
+                    document.body.appendChild(e);
+                }
+            };
+
+            httpRequest.open('GET', hash.substring(1) + '.txt', true);
+            httpRequest.send(null);
+        }
+    }, 1000);
+})();
+</script>
+</head>
+<body>
+<h1>hashtag url test</h1>
+<div><a href='#boosh'>#boosh</a></div>
+<div><a href='#ignored'>#ignored</a></div>
+<p>this page will ajax load ./whee.txt if it notices the url in the
+location bar has fragment "#whee", and ./boosh.txt if it notices
+"#boosh"</p>
+</body>
+</html>
tests/htdocs/site7/index.html (new file, 10 lines)
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>link to hashtag url test</title>
+<script>
+</script>
+</head>
+<body>
+<a href="foo.html#whee">foo.html#whee</a>
+</body>
+</html>
tests/htdocs/site7/whee.txt (new file, 1 line)
@@ -0,0 +1 @@
+I AM A POINTLESS LITTLE FILE
@@ -453,3 +453,50 @@ def test_seed_redirect(httpd):

     # check that scope has been updated properly
     assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
+
+def test_hashtags(httpd):
+    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # check that we got the pages we expected
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 2
+    assert pages[0].url == seed_url
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].brozzle_count == 1
+    assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
+    assert not pages[0].hashtags
+    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].brozzle_count == 1
+    assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
+
+    time.sleep(2) # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = rr.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {
+            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
+    assert seed_url in captures_by_url
+    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
+    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
+    assert 'screenshot:%s' % seed_url in captures_by_url
+    assert 'thumbnail:%s' % seed_url in captures_by_url
+    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
+
@@ -591,3 +591,67 @@ def test_seed_page():
     page0.save()

     assert frontier.seed_page(site.id) == page0
+
+def test_hashtag_seed():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # no hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert not pages[0].hashtags
+
+    # yes hash tag
+    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
+    brozzler.new_site(frontier, site)
+
+    assert site.scope['surt'] == 'http://(org,example,)/'
+
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 1
+    assert pages[0].url == 'http://example.org/'
+    assert pages[0].hashtags == ['#hash',]
+
+def test_hashtag_links():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
+    brozzler.new_site(frontier, site)
+    parent_page = frontier.seed_page(site.id)
+    assert not parent_page.hashtags
+    outlinks = [
+        'http://example.org/#foo',
+        'http://example.org/bar',
+        'http://example.org/bar#baz',
+        'http://example.org/bar#quux',
+        'http://example.org/zuh#buh',
+    ]
+    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+
+    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
+    assert len(pages) == 3
+    assert pages[0].url == 'http://example.org/'
+    assert sorted(pages[0].outlinks['accepted']) == [
+        'http://example.org/', 'http://example.org/bar',
+        'http://example.org/zuh']
+    assert not pages[0].outlinks['blocked']
+    assert not pages[0].outlinks['rejected']
+    assert pages[0].hashtags == ['#foo',]
+    assert pages[0].hops_from_seed == 0
+
+    assert pages[1].url == 'http://example.org/bar'
+    assert sorted(pages[1].hashtags) == ['#baz','#quux']
+    assert pages[1].priority == 36
+    assert pages[1].hops_from_seed == 1
+
+    assert pages[2].url == 'http://example.org/zuh'
+    assert pages[2].hashtags == ['#buh']
+    assert pages[2].priority == 12
+
@@ -7,6 +7,8 @@

 cd $(dirname "${BASH_SOURCE[0]}")

+vagrant up
+
 echo service status:
 vagrant ssh -- 'status warcprox ;
         status Xvnc ;