# Mirror of https://github.com/internetarchive/brozzler.git
# (synced 2025-02-24 16:49:56 -05:00; 747 lines, 26 KiB, Python)
#!/usr/bin/env python
|
|
'''
test_frontier.py - fairly narrow tests of frontier management, requires
rethinkdb running on localhost

Copyright (C) 2017 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
|
|
|
|
import brozzler
|
|
import logging
|
|
import argparse
|
|
import doublethink
|
|
import time
|
|
import datetime
|
|
import uuid
|
|
import pytest
|
|
|
|
# Module-level logging setup: hand brozzler's cli logging configuration a
# namespace whose only attribute is the INFO log level.
args = argparse.Namespace(log_level=logging.INFO)
brozzler.cli.configure_logging(args)
|
|
|
|
def test_rethinkdb_up():
    '''Sanity check that rethinkdb is listening and answers a basic query.'''
    # "rethinkdb" is the server's built-in db
    system_db = doublethink.Rethinker(db='rethinkdb')
    table_names = system_db.table_list().run()
    assert len(table_names) > 10
|
|
|
|
def test_basics():
    '''
    Creates a job with two seeds, then checks the job, site and page records
    visible through the frontier, and finally the "brozzled" filter of
    frontier.site_pages().
    '''
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    seed_urls = ['http://example.com', 'https://example.org/']
    job = brozzler.new_job(
            frontier, {'seeds': [{'url': seed} for seed in seed_urls]})
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    assert job == {
        'id': job.id,
        'conf': {'seeds': [{'url': seed} for seed in seed_urls]},
        'status': 'ACTIVE',
        'starts_and_stops': [
            {'start': job.starts_and_stops[0]['start'], 'stop': None},
        ],
    }

    # sorting by seed lines the sites up with seed_urls
    sites = sorted(frontier.job_sites(job.id), key=lambda s: s.seed)
    assert len(sites) == 2
    for site in sites:
        assert site.starts_and_stops[0]['start']
    expected_surts = ['http://(com,example,)/', 'https://(org,example,)/']
    for site, seed, surt in zip(sites, seed_urls, expected_surts):
        assert site == {
            'claimed': False,
            'id': site.id,
            'job_id': job.id,
            'last_claimed': brozzler.EPOCH_UTC,
            'last_disclaimed': brozzler.EPOCH_UTC,
            'scope': {'surt': surt},
            'seed': seed,
            'starts_and_stops': [
                {'start': site.starts_and_stops[0]['start'], 'stop': None},
            ],
            'status': 'ACTIVE',
        }

    # each site starts out with exactly one page, for its seed url
    seed_pages = []
    for site, seed in zip(sites, seed_urls):
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0] == {
            'brozzle_count': 0,
            'claimed': False,
            'hops_from_seed': 0,
            'hops_off_surt': 0,
            'id': brozzler.Page.compute_id(site.id, seed),
            'job_id': job.id,
            'needs_robots_check': True,
            'priority': 1000,
            'site_id': site.id,
            'url': seed,
        }
        seed_pages.append(site_pages[0])

    # test "brozzled" parameter of frontier.site_pages
    def page_counts():
        '''Page counts for sites[1]: (unfiltered, brozzled, not brozzled).'''
        return tuple(
                len(list(frontier.site_pages(sites[1].id, **filters)))
                for filters in ({}, {'brozzled': True}, {'brozzled': False}))

    page = seed_pages[1]
    assert page_counts() == (1, 0, 1)
    page.brozzle_count = 1
    page.save()
    assert page_counts() == (1, 1, 0)
    page.brozzle_count = 32819
    page.save()
    assert page_counts() == (1, 1, 0)
|
|
|
|
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    def assert_active(doc, n):
        '''`doc` is ACTIVE and its n-th start/stop interval is still open.'''
        assert doc.status == 'ACTIVE'
        assert len(doc.starts_and_stops) == n
        assert doc.starts_and_stops[n - 1]['start']
        assert doc.starts_and_stops[n - 1]['stop'] is None

    def assert_finished(doc, n):
        '''`doc` is FINISHED and its n-th start/stop interval is closed.'''
        assert doc.status == 'FINISHED'
        assert len(doc.starts_and_stops) == n
        assert doc.starts_and_stops[n - 1]['start']
        assert doc.starts_and_stops[n - 1]['stop']
        # the latest stop is compared against the very first start
        assert (doc.starts_and_stops[n - 1]['stop']
                > doc.starts_and_stops[0]['start'])

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job = brozzler.new_job(
            frontier, {'seeds': [{'url': 'http://example.com/'}]})
    job_sites = list(frontier.job_sites(job.id))
    assert len(job_sites) == 1
    site = job_sites[0]

    assert_active(job, 1)
    assert_active(site, 1)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_finished(job, 1)
    assert_finished(site, 1)

    frontier.resume_site(site)
    job.refresh()

    assert_active(job, 2)
    assert_active(site, 2)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_finished(job, 2)
    assert_finished(site, 2)

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert_active(job, 3)
    assert_active(site, 3)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_finished(job, 3)
    assert_finished(site, 3)
|
|
|
|
def test_time_limit():
    '''
    Walks a site through finish/resume and checks that
    frontier._enforce_time_limit() only finishes it once
    active_brozzling_time exceeds time_limit.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    def assert_active(n):
        '''The site is ACTIVE and its n-th interval has no stop time yet.'''
        assert site.status == 'ACTIVE'
        assert len(site.starts_and_stops) == n
        assert site.starts_and_stops[n - 1]['start']
        assert site.starts_and_stops[n - 1]['stop'] is None

    def assert_interval_closed(n):
        '''The site's n-th interval has a stop later than the first start.'''
        assert len(site.starts_and_stops) == n
        assert site.starts_and_stops[n - 1]['start']
        assert site.starts_and_stops[n - 1]['stop']
        assert (site.starts_and_stops[n - 1]['stop']
                > site.starts_and_stops[0]['start'])

    site.refresh() # get it back from the db
    assert_active(1)

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert_interval_closed(1)

    frontier.resume_site(site)

    assert_active(2)

    # time limit not reached yet
    frontier._enforce_time_limit(site)

    assert_active(2)

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert_active(2)

    site.active_brozzling_time = 0.2 # this is why the time limit will be hit

    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert_interval_closed(2)
|
|
|
|
def test_field_defaults():
    '''
    Checks the field values of freshly constructed Page, Site and Job
    objects, and that they hold across save(), load() and refresh().
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')

    def nothing():
        '''Placeholder step: check the object before any persistence call.'''

    # page
    brozzler.Page.table_ensure(rr)
    page = brozzler.Page(rr, {'hops_from_seed': 3})
    for step in (nothing, page.save):
        step()
        assert page.hops_from_seed == 3
        assert page.id
        assert page.brozzle_count == 0

    loaded_page = brozzler.Page.load(rr, page.id)
    for step in (nothing, loaded_page.save, loaded_page.refresh):
        step()
        assert loaded_page.hops_from_seed == 3
        assert loaded_page.id == page.id
        assert loaded_page.brozzle_count == 0

    # site
    brozzler.Site.table_ensure(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    assert site.id is None
    assert site.scope
    assert site.scope['surt'] == 'http://(com,example,)/'
    site.save()
    assert site.id
    assert site.scope

    loaded_site = brozzler.Site.load(rr, site.id)
    for step in (nothing, loaded_site.save, loaded_site.refresh):
        step()
        assert loaded_site.id == site.id
        assert loaded_site.scope == site.scope

    # job
    brozzler.Job.table_ensure(rr)
    job = brozzler.Job(rr, {'status': 'WHUUUT'})
    assert job.status == 'WHUUUT'
    assert job.id is None
    assert job.starts_and_stops
    job.save()
    assert job.status == 'WHUUUT'
    assert job.id
    assert job.starts_and_stops

    loaded_job = brozzler.Job.load(rr, job.id)
    for step in (nothing, loaded_job.save, loaded_job.refresh):
        step()
        assert loaded_job.status == 'WHUUUT'
        assert loaded_job.id
        assert loaded_job.starts_and_stops
|
|
|
|
def test_scope_and_schedule_outlinks():
    '''
    Feeds a mix of in-scope and out-of-scope outlinks to
    frontier.scope_and_schedule_outlinks() and checks how they are recorded
    on the parent page, and that only accepted urls end up as pages in the
    db.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',]
    # stub out the robots check for the duration of the call, and always put
    # the real function back
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # the https urls are rejected; scheme and host of the others come back
    # lowercased, so the two spellings of .../bar collapse into one entry
    assert sorted(parent_page.outlinks['rejected']) == [
        'https://example.com/', 'https://example.com/foo']
    assert sorted(parent_page.outlinks['accepted']) == [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar']
    assert parent_page.outlinks['blocked'] == []

    # the outlink decisions were persisted along with the parent page
    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    # `page_id` rather than `id`, to avoid shadowing the builtin
    for url in parent_page.outlinks['rejected']:
        page_id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, page_id) is None
    for url in parent_page.outlinks['accepted']:
        page_id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, page_id)
|
|
|
|
def test_parent_url_scoping():
    '''
    Scope rules that look at the parent page url should consider both the
    original url and the redirect url, if any, of the parent page.

    Each scenario builds its own parent page and passes that page to
    frontier.scope_and_schedule_outlinks(). (Previously the two redirect_url
    scenarios built a page but then scheduled with the parent page left over
    from the preceding scenario, so redirect_url was never exercised.)
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
        },
        'remember_outlinks': True})
    site.save()

    def schedule_outlinks(parent_page, outlinks):
        '''
        Runs frontier.scope_and_schedule_outlinks() with the robots check
        stubbed out to permit everything, restoring the real function
        afterwards even if the call raises.
        '''
        orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
        brozzler.is_permitted_by_robots = lambda *args: True
        try:
            frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
        finally:
            brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/acceptme/futz'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/blockme/futz'})
    schedule_outlinks(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []
|
|
|
|
def test_completed_page():
    '''
    Runs frontier.completed_page() on claimed pages that were redirected and
    checks the resulting site scope surt, plus the page's brozzle_count and
    claimed flag, both in memory and after refreshing from the db.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    seed_surt = 'http://(com,example,)/a/'
    # (page url, hops_from_seed, redirect_url, scope surt expected afterwards)
    scenarios = [
        # redirect that changes scope surt
        ('http://example.com/a/', 0, 'http://example.com/b/',
            'http://(com,example,)/b/'),
        # redirect that doesn't change scope surt because destination is
        # covered by the original surt
        ('http://example.com/a/', 0, 'http://example.com/a/x/', seed_surt),
        # redirect that doesn't change scope surt because page is not the
        # seed page
        ('http://example.com/c/', 1, 'http://example.com/d/', seed_surt),
    ]
    for url, hops_from_seed, redirect_url, surt_afterwards in scenarios:
        site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
        site.save()
        page = brozzler.Page(rr, {
            'site_id': site.id,
            'url': url,
            'claimed': True,
            'brozzle_count': 0,
            'hops_from_seed': hops_from_seed,
            'redirect_url': redirect_url, })
        page.save()
        assert site.scope == {'surt': seed_surt}
        frontier.completed_page(site, page)
        assert site.scope == {'surt': surt_afterwards}
        site.refresh()
        assert site.scope == {'surt': surt_afterwards}
        assert page.brozzle_count == 1
        assert page.claimed == False
        page.refresh()
        assert page.brozzle_count == 1
        assert page.claimed == False
|
|
|
|
def test_seed_page():
    '''
    frontier.seed_page() is None while the site has no page with
    hops_from_seed 0, and equals that page once it has been saved.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()

    def save_page(url, hops_from_seed):
        '''Saves and returns a new page of `site`.'''
        new_page = brozzler.Page(rr, {
            'site_id': site.id,
            'url': url,
            'hops_from_seed': hops_from_seed})
        new_page.save()
        return new_page

    # no pages at all yet
    assert frontier.seed_page(site.id) is None

    # a page one hop from the seed does not count
    save_page('http://example.com/a/b/', 1)
    assert frontier.seed_page(site.id) is None

    page0 = save_page('http://example.com/a/', 0)
    assert frontier.seed_page(site.id) == page0
|
|
|
|
def test_hashtag_seed():
    '''
    A "#fragment" on the seed url is kept out of the scope surt and the page
    url, and recorded in the seed page's hashtags instead.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def only_page_of_new_site(seed_url):
        '''Creates a site for `seed_url` and returns its single page.'''
        site = brozzler.Site(rr, {'seed': seed_url})
        brozzler.new_site(frontier, site)

        assert site.scope['surt'] == 'http://(org,example,)/'

        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == 'http://example.org/'
        return site_pages[0]

    # no hash tag
    assert not only_page_of_new_site('http://example.org/').hashtags

    # yes hash tag
    seed_page = only_page_of_new_site('http://example.org/#hash')
    assert seed_page.hashtags == ['#hash',]
|
|
|
|
def test_hashtag_links():
    '''
    Schedules outlinks that carry "#fragment"s and checks that the fragments
    are stripped from the page urls and collected in each page's hashtags.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    # stub out the robots check, as the other tests in this file that call
    # scope_and_schedule_outlinks do, so the outcome does not depend on the
    # real robots lookup; always restore the real function
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # sort by url so the pages can be addressed by index below
    pages = sorted(frontier.site_pages(site.id), key=lambda p: p.url)
    assert len(pages) == 3

    # the seed page: links to itself via '#foo', so it picks up the hashtag
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    # linked three times, twice with a fragment
    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz','#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    # linked once, with a fragment
    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
|
|
|
|
def test_honor_stop_request():
    '''
    frontier.honor_stop_request(site) returns quietly when no stop has been
    requested, and raises brozzler.CrawlStopped once stop_requested is set on
    either the site's job or the site itself.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def new_single_site_job():
        '''Creates a one-seed job; returns (job, its only site).'''
        job = brozzler.new_job(
                frontier, {'seeds': [{'url': 'http://example.com'}]})
        assert job.id
        sites = list(frontier.job_sites(job.id))
        assert len(sites) == 1
        site = sites[0]
        assert site.job_id == job.id
        return job, site

    # 1. test stop request on job
    job, site = new_single_site_job()

    # does not raise exception
    frontier.honor_stop_request(site)

    # set job.stop_requested; doublethink.utcnow() is used for both the job
    # and the site case instead of the deprecated datetime.utcnow()
    job.stop_requested = doublethink.utcnow()
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)

    # 2. test stop request on site
    job, site = new_single_site_job()

    # does not raise exception
    frontier.honor_stop_request(site)

    # set site.stop_requested
    site.stop_requested = doublethink.utcnow()
    site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)
|
|
|
|
def test_claim_site():
    '''
    Checks frontier.claim_site(): it raises NothingToClaim when there is no
    claimable site, and a claimed site only becomes claimable again once its
    last_claimed is more than an hour old.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def assert_nothing_to_claim():
        with pytest.raises(brozzler.NothingToClaim):
            frontier.claim_site(worker_id='test_claim_site')

    rr.table('sites').delete().run() # clean slate

    assert_nothing_to_claim()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    first_claim = frontier.claim_site(worker_id='test_claim_site')
    assert first_claim.id == site.id
    assert first_claim.claimed
    assert first_claim.last_claimed >= (
            doublethink.utcnow() - datetime.timedelta(minutes=1))
    assert_nothing_to_claim()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    first_claim.last_claimed = (
            doublethink.utcnow() - datetime.timedelta(minutes=55))
    first_claim.save()
    assert_nothing_to_claim()

    # site last_claimed more than 1 hour ago can be reclaimed
    first_claim.last_claimed = (
            doublethink.utcnow() - datetime.timedelta(minutes=65))
    first_claim.save()
    second_claim = frontier.claim_site(worker_id='test_claim_site')
    assert second_claim.id == first_claim.id

    # clean up
    rr.table('sites').get(second_claim.id).delete().run()
|
|
|