mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-05-03 15:15:04 -04:00
more automated tests of frontier stuff
This commit is contained in:
parent
9e1e002a71
commit
479f0f7e09
2 changed files with 112 additions and 2 deletions
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev201',
|
version='1.1b9.dev202',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@ -202,10 +202,12 @@ def test_field_defaults():
|
||||||
|
|
||||||
# site
|
# site
|
||||||
brozzler.Site.table_ensure(rr)
|
brozzler.Site.table_ensure(rr)
|
||||||
site = brozzler.Site(rr, {'enable_warcprox_features': True})
|
site = brozzler.Site(rr, {
|
||||||
|
'seed': 'http://example.com/', 'enable_warcprox_features': True})
|
||||||
assert site.enable_warcprox_features is True
|
assert site.enable_warcprox_features is True
|
||||||
assert site.id is None
|
assert site.id is None
|
||||||
assert site.scope
|
assert site.scope
|
||||||
|
assert site.scope['surt'] == 'http://(com,example,)/'
|
||||||
site.save()
|
site.save()
|
||||||
assert site.id
|
assert site.id
|
||||||
assert site.scope
|
assert site.scope
|
||||||
|
@ -247,3 +249,111 @@ def test_field_defaults():
|
||||||
assert kob.id
|
assert kob.id
|
||||||
assert kob.starts_and_stops
|
assert kob.starts_and_stops
|
||||||
|
|
||||||
|
def test_scope_and_scheduled_outlinks():
    """Exercise RethinkDbFrontier.scope_and_schedule_outlinks end to end.

    A parent page one hop from the seed is handed a mix of https links and
    http links whose scheme/host are written in mixed case.  The test then
    checks the 'rejected', 'accepted' and 'blocked' lists recorded on the
    parent page, that the stored parent page matches the in-memory one, and
    that a Page row exists for every accepted url and for no rejected url.
    """
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
            rr, {'seed':'http://example.com/', 'remember_outlinks':True})
    parent_page = brozzler.Page(
            rr, {'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]

    # Temporarily swap in a robots check that permits everything
    # (presumably so the test never has to fetch a robots.txt -- confirm),
    # and always put the real function back afterwards.
    real_robots_check = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = real_robots_check

    expected_rejected = ['https://example.com/', 'https://example.com/foo']
    # the two spellings of .../bar show up only once, with scheme and host
    # lowercased and the path case left alone
    expected_accepted = [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar']
    assert sorted(parent_page.outlinks['rejected']) == expected_rejected
    assert sorted(parent_page.outlinks['accepted']) == expected_accepted
    assert parent_page.outlinks['blocked'] == []

    # what was stored in the database must match the in-memory parent page
    stored_parent = brozzler.Page.load(rr, parent_page.id)
    assert stored_parent == parent_page

    # rejected outlinks must not have been stored as pages...
    for rejected_url in parent_page.outlinks['rejected']:
        page_id = brozzler.Page.compute_id(site.id, rejected_url)
        assert brozzler.Page.load(rr, page_id) is None
    # ...while every accepted outlink must have been
    for accepted_url in parent_page.outlinks['accepted']:
        page_id = brozzler.Page.compute_id(site.id, accepted_url)
        assert brozzler.Page.load(rr, page_id)
|
||||||
|
|
||||||
|
def _check_completed_page(
        rr, frontier, page_url, hops_from_seed, redirect_url, expected_surt):
    """Run one completed_page scenario against a fresh site and page.

    Creates and saves a site seeded at http://example.com/a/ and a claimed,
    never-brozzled page with the given url, hop count and redirect url, then
    calls frontier.completed_page(site, page) and asserts the outcome.

    Args:
        rr: doublethink.Rethinker used to create the site and page
        frontier: brozzler.RethinkDbFrontier under test
        page_url: url of the page being completed
        hops_from_seed: hop count stored on the page
        redirect_url: redirect_url stored on the page
        expected_surt: scope surt the site is expected to have afterwards

    The leading underscore keeps pytest from collecting this as a test.
    """
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': page_url,
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': hops_from_seed,
        'redirect_url': redirect_url,
    })
    page.save()

    # before completion the scope is derived from the seed alone
    assert site.scope == {'surt': 'http://(com,example,)/a/'}

    frontier.completed_page(site, page)

    # check the in-memory objects first, then again after refresh()
    expected_scope = {'surt': expected_surt}
    assert site.scope == expected_scope
    site.refresh()
    assert site.scope == expected_scope

    assert page.brozzle_count == 1
    assert page.claimed is False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed is False


def test_completed_page():
    """Check RethinkDbFrontier.completed_page bookkeeping and scope updates.

    In every scenario the page must end up unclaimed with brozzle_count 1;
    the scenarios differ in whether the page's redirect is expected to
    change the site's scope surt.
    """
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # redirect that changes scope surt
    _check_completed_page(
            rr, frontier,
            page_url='http://example.com/a/',
            hops_from_seed=0,
            redirect_url='http://example.com/b/',
            expected_surt='http://(com,example,)/b/')

    # redirect that doesn't change scope surt because destination is covered
    # by the original surt
    _check_completed_page(
            rr, frontier,
            page_url='http://example.com/a/',
            hops_from_seed=0,
            redirect_url='http://example.com/a/x/',
            expected_surt='http://(com,example,)/a/')

    # redirect that doesn't change scope surt because page is not the seed
    # page
    _check_completed_page(
            rr, frontier,
            page_url='http://example.com/c/',
            hops_from_seed=1,
            redirect_url='http://example.com/d/',
            expected_surt='http://(com,example,)/a/')
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue