mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00

--warcprox-auto: distribute assigned sites evenly. When running with --warcprox-auto, choose the warcprox instance with the fewest assigned sites, instead of the one with the lowest load in the service registry. In practice we often start brozzling a whole bunch of sites at approximately the same time, and because it takes time for that to affect the "load" reported by warcprox instances, sites end up being distributed very unevenly.
933 lines
33 KiB
Python
933 lines
33 KiB
Python
#!/usr/bin/env python
|
|
'''
|
|
test_frontier.py - fairly narrow tests of frontier management, requires
|
|
rethinkdb running on localhost
|
|
|
|
Copyright (C) 2017 Internet Archive
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
'''
|
|
|
|
import argparse
|
|
import datetime
|
|
import logging
|
|
|
|
import doublethink
|
|
import pytest
|
|
|
|
import brozzler
|
|
|
|
# set up brozzler's logging once, at import time, with log_level INFO
args = argparse.Namespace(log_level=logging.INFO)
brozzler.cli.configure_logging(args)
|
|
|
|
def test_rethinkdb_up():
    '''Sanity check that rethinkdb answers and lists more than 10 tables.'''
    # "rethinkdb" is the built-in db
    rethinker = doublethink.Rethinker(db='rethinkdb')
    table_names = rethinker.table_list().run()
    assert len(table_names) > 10
|
|
|
|
def test_basics():
    '''
    Creates a job with two seeds and checks the job, site and seed page
    documents the frontier ends up with, then exercises the "brozzled"
    parameter of frontier.site_pages().
    '''
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    seed_urls = ['http://example.com', 'https://example.org/']
    surts = ['http://(com,example,)/', 'https://(org,example,)/']

    job = brozzler.new_job(
            frontier, {'seeds': [{'url': url} for url in seed_urls]})
    assert job.id
    assert job.starts_and_stops
    job_started = job.starts_and_stops[0]['start']
    assert job_started
    assert job == {
        'id': job.id,
        'conf': {'seeds': [{'url': url} for url in seed_urls]},
        'status': 'ACTIVE',
        'starts_and_stops': [{'start': job_started, 'stop': None}],
    }

    # one site per seed; sorted by seed so they line up with seed_urls
    sites = sorted(frontier.job_sites(job.id), key=lambda site: site.seed)
    assert len(sites) == 2
    for site, seed_url, surt in zip(sites, seed_urls, surts):
        site_started = site.starts_and_stops[0]['start']
        assert site_started
        assert site == {
            'claimed': False,
            'id': site.id,
            'job_id': job.id,
            'last_claimed': brozzler.EPOCH_UTC,
            'last_disclaimed': brozzler.EPOCH_UTC,
            'scope': {'surt': surt},
            'seed': seed_url,
            'starts_and_stops': [{'start': site_started, 'stop': None}],
            'status': 'ACTIVE',
        }

    # each site starts out with exactly one page, the one for its seed url
    seed_pages = []
    for site, seed_url in zip(sites, seed_urls):
        pages = list(frontier.site_pages(site.id))
        assert len(pages) == 1
        assert pages[0] == {
            'brozzle_count': 0,
            'claimed': False,
            'hops_from_seed': 0,
            'hops_off_surt': 0,
            'id': brozzler.Page.compute_id(site.id, seed_url),
            'job_id': job.id,
            'needs_robots_check': True,
            'priority': 1000,
            'site_id': site.id,
            'url': seed_url,
        }
        seed_pages.append(pages[0])

    # test "brozzled" parameter of frontier.site_pages, using the second site
    org_site, org_page = sites[1], seed_pages[1]

    def count_pages(**kwargs):
        return len(list(frontier.site_pages(org_site.id, **kwargs)))

    assert count_pages() == 1
    assert count_pages(brozzled=True) == 0
    assert count_pages(brozzled=False) == 1
    # any brozzle_count >= 1 should make the page count as brozzled
    for brozzle_count in (1, 32819):
        org_page.brozzle_count = brozzle_count
        org_page.save()
        assert count_pages() == 1
        assert count_pages(brozzled=True) == 1
        assert count_pages(brozzled=False) == 0
|
|
|
|
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.

    Covers finishing and resuming a single site, resuming a whole job, and
    stop requests made on a job and on an individual site.
    '''
    def assert_running(doc, runs):
        # doc (a job or a site) is ACTIVE, has `runs` start/stop intervals,
        # and the latest interval has a start but no stop yet
        assert doc.status == 'ACTIVE'
        assert len(doc.starts_and_stops) == runs
        assert doc.starts_and_stops[-1]['start']
        assert doc.starts_and_stops[-1]['stop'] is None

    def assert_stopped(doc, status, runs):
        # doc (a job or a site) has the given status, has `runs` start/stop
        # intervals, and the latest interval is closed with stop after start
        assert doc.status == status
        assert len(doc.starts_and_stops) == runs
        latest = doc.starts_and_stops[-1]
        assert latest['start']
        assert latest['stop']
        assert latest['stop'] > latest['start']

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert_running(job, 1)
    assert_running(site, 1)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 'FINISHED', 1)
    assert_stopped(site, 'FINISHED', 1)

    frontier.resume_site(site)
    job.refresh()

    assert_running(job, 2)
    assert_running(site, 2)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 'FINISHED', 2)
    assert_stopped(site, 'FINISHED', 2)

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert_running(job, 3)
    assert_running(site, 3)

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert_stopped(job, 'FINISHED', 3)
    assert_stopped(site, 'FINISHED', 3)

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert_running(job, 4)
    assert_running(site, 4)

    # simulate a job stop request
    job_conf = {'seeds': [{'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    job.stop_requested = doublethink.utcnow()
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.stop_requested
    assert_stopped(job, 'FINISHED', 1)
    # each site's interval is checked against its own start time, not
    # against the unrelated `site` left over from the first job above
    assert_stopped(site1, 'FINISHED_STOP_REQUESTED', 1)
    assert_stopped(site2, 'FINISHED_STOP_REQUESTED', 1)

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.stop_requested is None
    assert_running(job, 2)
    assert_running(site1, 2)
    assert_running(site2, 2)

    # simulate a site stop request
    site1.stop_requested = doublethink.utcnow()
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    # only site1 stopped, so the job itself keeps running
    assert job.stop_requested is None
    assert_running(job, 2)
    assert_stopped(site1, 'FINISHED_STOP_REQUESTED', 2)
    assert_running(site2, 2)

    # simulate site resume after a stop request
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.stop_requested is None
    assert_running(job, 2)
    assert site1.stop_requested is None
    assert_running(site1, 3)
    assert_running(site2, 2)
|
|
|
|
def test_time_limit():
    '''
    Checks that frontier._enforce_time_limit() leaves a site ACTIVE while its
    time limit has not been hit, and finishes it with status
    FINISHED_TIME_LIMIT (and unclaims it) once active_brozzling_time is past
    time_limit.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    def assert_second_run_open():
        # site is ACTIVE and its second start/stop interval is still open
        assert site.status == 'ACTIVE'
        assert len(site.starts_and_stops) == 2
        assert site.starts_and_stops[1]['start']
        assert site.starts_and_stops[1]['stop'] is None

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh() # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')

    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert_second_run_open()

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert_second_run_open()

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert_second_run_open()

    site.active_brozzling_time = 0.2 # this is why the time limit will be hit

    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    # compare within the second interval (the one that just got closed)
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']
|
|
|
|
def test_field_defaults():
    '''
    Checks the field values of Page, Site and Job documents right after
    construction, after save(), and on a copy loaded from the db (as loaded,
    after save() and after refresh()).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')

    # page
    brozzler.Page.table_ensure(rr)
    page = brozzler.Page(rr, {'hops_from_seed': 3})

    def check_page():
        assert page.hops_from_seed == 3
        assert page.id
        assert page.brozzle_count == 0

    check_page()
    page.save()
    check_page()

    page_copy = brozzler.Page.load(rr, page.id)

    def check_page_copy():
        assert page_copy.hops_from_seed == 3
        assert page_copy.id == page.id
        assert page_copy.brozzle_count == 0

    check_page_copy()
    page_copy.save()
    check_page_copy()
    page_copy.refresh()
    check_page_copy()

    # site
    brozzler.Site.table_ensure(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/'})
    # no id until the site has been saved
    assert site.id is None
    assert site.scope
    assert site.scope['surt'] == 'http://(com,example,)/'
    site.save()
    assert site.id
    assert site.scope

    site_copy = brozzler.Site.load(rr, site.id)

    def check_site_copy():
        assert site_copy.id == site.id
        assert site_copy.scope == site.scope

    check_site_copy()
    site_copy.save()
    check_site_copy()
    site_copy.refresh()
    check_site_copy()

    # job
    brozzler.Job.table_ensure(rr)
    job = brozzler.Job(rr, {'status': 'WHUUUT'})
    assert job.status == 'WHUUUT'
    # no id until the job has been saved
    assert job.id is None
    assert job.starts_and_stops
    job.save()
    assert job.status == 'WHUUUT'
    assert job.id
    assert job.starts_and_stops

    job_copy = brozzler.Job.load(rr, job.id)

    def check_job_copy():
        assert job_copy.status == 'WHUUUT'
        assert job_copy.id
        assert job_copy.starts_and_stops

    check_job_copy()
    job_copy.save()
    check_job_copy()
    job_copy.refresh()
    check_job_copy()
|
|
|
|
def test_scope_and_schedule_outlinks():
    '''
    Feeds a mix of https and oddly-capitalized http outlinks through
    frontier.scope_and_schedule_outlinks() and checks which ones the parent
    page records as accepted/rejected/blocked, and which pages exist in the
    db afterwards.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/'})
    parent_page = brozzler.Page(
            rr, {'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',
    ]

    # swap in a robots check that always says yes, for this one call only
    real_robots_check = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = real_robots_check

    recorded = parent_page.outlinks
    assert sorted(recorded['rejected']) == [
            'https://example.com/', 'https://example.com/foo']
    assert sorted(recorded['accepted']) == [
            'http://example.com/BAZZZZ', 'http://example.com/BAr',
            'http://example.com/bar']
    assert recorded['blocked'] == []

    # what is in the db should match the in-memory parent page
    stored_parent = brozzler.Page.load(rr, parent_page.id)
    assert stored_parent == parent_page

    # rejected outlinks have no page in the db, accepted ones do
    for rejected_url in parent_page.outlinks['rejected']:
        page_id = brozzler.Page.compute_id(site.id, rejected_url)
        assert brozzler.Page.load(rr, page_id) is None
    for accepted_url in parent_page.outlinks['accepted']:
        page_id = brozzler.Page.compute_id(site.id, accepted_url)
        assert brozzler.Page.load(rr, page_id)
|
|
|
|
def test_parent_url_scoping():
    '''
    Checks scope rules that use parent_url_regex: an outlink's fate depends
    on whether the parent page's url, or its redirect_url, matches an accept
    or block rule.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
            },
        'remember_outlinks': True})
    site.save()

    def new_parent_page(url, redirect_url=None):
        # builds an unsaved page of `site`; the redirect_url field is only
        # included when one is given
        fields = {'site_id': site.id, 'url': url}
        if redirect_url is not None:
            fields['redirect_url'] = redirect_url
        return brozzler.Page(rr, fields)

    def schedule(parent_page, outlinks):
        # runs scope_and_schedule_outlinks with
        # brozzler.is_permitted_by_robots temporarily replaced by a stub that
        # always returns True; the real function is restored afterwards
        orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
        brozzler.is_permitted_by_robots = lambda *args: True
        try:
            frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
        finally:
            brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = new_parent_page('http://example.com/foo/spluh')
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = new_parent_page('http://example.com/acceptme/futz')
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page_c = new_parent_page(
            'http://example.com/toot/blah',
            redirect_url='http://example.com/acceptme/futz')
    # schedule and check the redirecting page itself, not the previous
    # parent_page, so that the redirect_url rule is what gets exercised
    schedule(parent_page_c, outlinks)
    assert parent_page_c.outlinks['rejected'] == []
    assert parent_page_c.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = new_parent_page('http://example.com/foo/spluh')
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = new_parent_page('http://example.com/blockme/futz')
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page_c = new_parent_page(
            'http://example.com/toot/blah',
            redirect_url='http://example.com/blockme/futz')
    schedule(parent_page_c, outlinks)
    assert parent_page_c.outlinks['rejected'] == outlinks
    assert parent_page_c.outlinks['accepted'] == []
|
|
|
|
def test_completed_page():
    '''
    Checks what frontier.completed_page() does to the site scope surt and to
    the page's brozzle_count/claimed fields for three kinds of redirected
    page.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    original_surt = 'http://(com,example,)/a/'
    # (page url, hops_from_seed, redirect_url, expected surt afterwards)
    scenarios = [
        # redirect that changes scope surt
        ('http://example.com/a/', 0, 'http://example.com/b/',
            'http://(com,example,)/b/'),
        # redirect that doesn't change scope surt because destination is
        # covered by the original surt
        ('http://example.com/a/', 0, 'http://example.com/a/x/',
            original_surt),
        # redirect that doesn't change scope surt because page is not the
        # seed page
        ('http://example.com/c/', 1, 'http://example.com/d/',
            original_surt),
    ]

    for url, hops_from_seed, redirect_url, surt_afterwards in scenarios:
        # every scenario gets a fresh site with the same seed
        site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
        site.save()
        page = brozzler.Page(rr, {
            'site_id': site.id,
            'url': url,
            'claimed': True,
            'brozzle_count': 0,
            'hops_from_seed': hops_from_seed,
            'redirect_url': redirect_url, })
        page.save()
        assert site.scope == {'surt': original_surt}

        frontier.completed_page(site, page)

        # check the in-memory objects first, then what the db has
        assert site.scope == {'surt': surt_afterwards}
        site.refresh()
        assert site.scope == {'surt': surt_afterwards}
        assert page.brozzle_count == 1
        assert page.claimed == False
        page.refresh()
        assert page.brozzle_count == 1
        assert page.claimed == False
|
|
|
|
def test_seed_page():
    '''
    Checks that frontier.seed_page() returns None until the site has a page
    with hops_from_seed 0, and then returns that page.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()

    def save_page(url, hops_from_seed):
        new_page = brozzler.Page(rr, {
            'site_id': site.id,
            'url': url,
            'hops_from_seed': hops_from_seed})
        new_page.save()
        return new_page

    # site has no pages at all yet
    assert frontier.seed_page(site.id) is None

    # a page one hop away from the seed is not a seed page
    save_page('http://example.com/a/b/', 1)
    assert frontier.seed_page(site.id) is None

    seed = save_page('http://example.com/a/', 0)
    assert frontier.seed_page(site.id) == seed
|
|
|
|
def test_hashtag_seed():
    '''
    Checks that a "#hash" on the seed url ends up in the seed page's
    hashtags rather than in the page url or the scope surt.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def only_page_of_new_site(seed):
        # creates a site for `seed`, checks the parts that are the same with
        # and without a hash tag, and hands back the site's single page
        site = brozzler.Site(rr, {'seed': seed})
        brozzler.new_site(frontier, site)
        assert site.scope['surt'] == 'http://(org,example,)/'
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == 'http://example.org/'
        return site_pages[0]

    # no hash tag
    plain = only_page_of_new_site('http://example.org/')
    assert not plain.hashtags

    # yes hash tag
    tagged = only_page_of_new_site('http://example.org/#hash')
    assert tagged.hashtags == ['#hash']
|
|
|
|
def test_hashtag_links():
    '''
    Schedules outlinks that differ only by "#fragment" and checks that they
    end up as one page per fragment-less url, with the fragments collected
    in each page's hashtags.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    seed = frontier.seed_page(site.id)
    assert not seed.hashtags

    frontier.scope_and_schedule_outlinks(site, seed, [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ])

    # sorted by url: the seed page, then /bar, then /zuh
    pages = sorted(frontier.site_pages(site.id), key=lambda page: page.url)
    assert len(pages) == 3
    root_page, bar_page, zuh_page = pages

    assert root_page.url == 'http://example.org/'
    assert sorted(root_page.outlinks['accepted']) == [
            'http://example.org/', 'http://example.org/bar',
            'http://example.org/zuh']
    assert not root_page.outlinks['blocked']
    assert not root_page.outlinks['rejected']
    assert root_page.hashtags == ['#foo']
    assert root_page.hops_from_seed == 0

    assert bar_page.url == 'http://example.org/bar'
    assert sorted(bar_page.hashtags) == ['#baz', '#quux']
    assert bar_page.priority == 36
    assert bar_page.hops_from_seed == 1

    assert zuh_page.url == 'http://example.org/zuh'
    assert zuh_page.hashtags == ['#buh']
    assert zuh_page.priority == 12
|
|
|
|
def test_honor_stop_request():
    '''
    Checks that frontier.honor_stop_request(site) raises brozzler.CrawlStopped
    once stop_requested has been set on the site's job or on the site itself,
    and does not raise before that.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def new_job_and_site():
        # creates a fresh single-seed job and returns it with its only site
        job_conf = {'seeds': [{'url': 'http://example.com'}]}
        job = brozzler.new_job(frontier, job_conf)
        assert job.id
        sites = list(frontier.job_sites(job.id))
        assert len(sites) == 1
        site = sites[0]
        assert site.job_id == job.id

        # does not raise exception
        frontier.honor_stop_request(site)
        return job, site

    # 1. test stop request on job
    job, site = new_job_and_site()

    # set job.stop_requested
    job.stop_requested = doublethink.utcnow()
    job.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)

    # 2. test stop request on site
    job, site = new_job_and_site()

    # set site.stop_requested
    site.stop_requested = doublethink.utcnow()
    site.save()
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site)
|
|
|
|
def test_claim_site():
    '''
    Checks frontier.claim_site(): nothing to claim when there are no sites,
    a new site can be claimed once, and a claimed site only becomes
    claimable again when its last_claimed is more than an hour old.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    def claim():
        return frontier.claim_site(worker_id='test_claim_site')

    def minutes_ago(minutes):
        return doublethink.utcnow() - datetime.timedelta(minutes=minutes)

    rr.table('sites').delete().run() # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claim()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    first_claim = claim()
    assert first_claim.id == site.id
    assert first_claim.claimed
    assert first_claim.last_claimed >= minutes_ago(1)
    # the only site is already claimed
    with pytest.raises(brozzler.NothingToClaim):
        claim()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    first_claim.last_claimed = minutes_ago(55)
    first_claim.save()
    with pytest.raises(brozzler.NothingToClaim):
        claim()

    # site last_claimed more than 1 hour ago can be reclaimed
    first_claim.last_claimed = minutes_ago(65)
    first_claim.save()
    second_claim = claim()
    assert second_claim.id == first_claim.id

    # clean up
    rr.table('sites').get(second_claim.id).delete().run()
|
|
|
|
def test_choose_warcprox():
    '''
    Checks BrozzlerWorker._choose_warcprox(): None when no warcprox is in
    the service registry; otherwise, with the data used here, the instance
    without any ACTIVE site assigned is chosen, and among those the one
    with the lower load.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # wait for both tables and their indexes; avoids this sort of error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    for table_name in ('sites', 'services'):
        rr.table(table_name).wait().run()
    for table_name in ('sites', 'services'):
        rr.table(table_name).index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    # register five warcprox instances; only host4 reports a nonzero load
    for host, port, load in (
            ('host1', 8000, 0),
            ('host2', 8000, 0),
            ('host2', 8001, 0),
            ('host3', 8000, 0),
            ('host4', 8000, 1)):
        rr.table('services').insert({
            'role': 'warcprox',
            'first_heartbeat': doublethink.utcnow(),
            'last_heartbeat': doublethink.utcnow(),
            'host': host, 'port': port,
            'load': load, 'ttl': 60}).run()

    def assign_active_site(proxy):
        rr.table('sites').insert({
            'proxy': proxy, 'status': 'ACTIVE',
            'last_disclaimed': doublethink.utcnow()}).run()

    # host1 gets two sites, each host2 instance gets one, host3 and host4
    # get none
    for proxy in ('host1:8000', 'host1:8000', 'host2:8000', 'host2:8001'):
        assign_active_site(proxy)

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host3'
    assert chosen['port'] == 8000

    # once host3 has a site, host4 is the only instance without one
    assign_active_site('host3:8000')

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host4'
    assert chosen['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
|