Merge branch 'master' into qa

* master:
  three-value "brozzled" parameter for frontier.site_pages(); fix bug where every Site got a list of all the seeds from the job; add more frontier tests to catch these kinds of things
Noah Levitt 2017-03-20 17:28:24 -07:00
commit a334ff5e69
6 changed files with 135 additions and 9 deletions

View File

@@ -153,6 +153,11 @@ def _remove_query(url):
site_surt_canon = urlcanon.Canonicalizer(
urlcanon.semantic.steps + [_remove_query])
import doublethink
import datetime
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC)
from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots

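For context: the EPOCH_UTC constant added here is a timezone-aware datetime for the Unix epoch, which the Site changes further down use as the "never yet" default for last_claimed and last_disclaimed. A minimal standalone sketch of the same expression, assuming only that doublethink.UTC is the tzinfo instance the diff already passes to replace():

    import datetime
    import doublethink

    # same expression as in the diff above: an aware 1970-01-01T00:00:00+00:00
    EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
            tzinfo=doublethink.UTC)

    # being timezone-aware, it compares cleanly with other aware datetimes,
    # e.g. a freshly recorded claim time (assumes doublethink.UTC is a tzinfo)
    assert EPOCH_UTC < datetime.datetime.now(doublethink.UTC)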
View File

@@ -337,11 +337,20 @@ class RethinkDbFrontier:
return None
return brozzler.Page(self.rr, pages[0])
-def site_pages(self, site_id, unbrozzled_only=False):
+def site_pages(self, site_id, brozzled=None):
'''
Args:
site_id (str or int):
brozzled (bool): if true, results include only pages that have
been brozzled at least once; if false, only pages that have
not been brozzled; and if None (the default), all pages
Returns:
iterator of brozzler.Page
'''
results = self.rr.table("pages").between(
-[site_id, 0 if unbrozzled_only else r.minval,
+[site_id, 1 if brozzled is True else 0,
r.minval, r.minval],
-[site_id, 0 if unbrozzled_only else r.maxval,
+[site_id, 0 if brozzled is False else r.maxval,
r.maxval, r.maxval],
index="priority_by_site").run()
for result in results:

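For context, a usage sketch of the new three-value parameter (not part of the diff). The bounds above read as if the second component of the "priority_by_site" index is the page's brozzle_count: at least 1 for brozzled=True, exactly 0 for brozzled=False, unbounded for the default None. Setup mirrors the new test further down; the db name and site_id are placeholders:

    import doublethink
    import brozzler

    rr = doublethink.Rethinker(db='brozzler')      # db name assumed
    frontier = brozzler.RethinkDbFrontier(rr)
    site_id = '...'                                # id of an existing site (placeholder)

    all_pages = list(frontier.site_pages(site_id))                  # every page
    todo      = list(frontier.site_pages(site_id, brozzled=False))  # never brozzled
    done      = list(frontier.site_pages(site_id, brozzled=True))   # brozzled at least once
    assert len(all_pages) == len(todo) + len(done)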
View File

@@ -80,6 +80,7 @@ def new_job(frontier, job_conf):
sites = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
merged_conf.pop("seeds")
merged_conf["job_id"] = job.id
merged_conf["seed"] = merged_conf.pop("url")
site = brozzler.Site(frontier.rr, merged_conf)

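The one-line merged_conf.pop("seeds") above is the fix for the second item in the commit message: merge(seed_conf, job_conf) folds the job-level config into each seed's config, so without the pop every Site document inherited the full job-level "seeds" list. A rough dict-level illustration, with assumed values and merge() paraphrased:

    # roughly what merge(seed_conf, job_conf) yields for the first seed of a
    # two-seed job (illustration only; values assumed)
    merged_conf = {
        'url': 'http://example.com',
        'seeds': [{'url': 'http://example.com'},
                  {'url': 'https://example.org/'}],   # leaked in from job_conf
    }

    merged_conf.pop("seeds")                      # the fix: drop the job-level list
    merged_conf["job_id"] = 1                     # assumed job id
    merged_conf["seed"] = merged_conf.pop("url")

    # merged_conf now describes just this one seed:
    # {'job_id': 1, 'seed': 'http://example.com'}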
View File

@@ -26,9 +26,6 @@ import doublethink
import datetime
import re
-_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
-        tzinfo=doublethink.UTC)
class Site(doublethink.Document):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = 'sites'
@@ -41,9 +38,9 @@ class Site(doublethink.Document):
if not "claimed" in self:
self.claimed = False
if not "last_disclaimed" in self:
-self.last_disclaimed = _EPOCH_UTC
+self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
-self.last_claimed = _EPOCH_UTC
+self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
self.scope = {}
if not "surt" in self.scope and self.seed:

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
-version='1.1b9.dev211',
+version='1.1b9.dev212',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@@ -23,6 +23,7 @@ import logging
import argparse
import doublethink
import time
import datetime
args = argparse.Namespace()
args.log_level = logging.INFO
@@ -34,6 +35,119 @@ def test_rethinkdb_up():
tbls = rr.table_list().run()
assert len(tbls) > 10
def test_basics():
rr = doublethink.Rethinker(db='ignoreme')
frontier = brozzler.RethinkDbFrontier(rr)
job_conf = {'seeds': [
{'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
job = brozzler.new_job(frontier, job_conf)
assert job.id
assert job.starts_and_stops
assert job.starts_and_stops[0]['start']
assert job == {
'id': job.id,
'conf': {
'seeds': [
{'url': 'http://example.com'},
{'url': 'https://example.org/'}
]
},
'status': 'ACTIVE',
'starts_and_stops': [
{
'start': job.starts_and_stops[0]['start'],
'stop': None
}
]
}
sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
assert len(sites) == 2
assert sites[0].starts_and_stops[0]['start']
assert sites[1].starts_and_stops[0]['start']
assert sites[0] == {
'claimed': False,
'enable_warcprox_features': False,
'id': sites[0].id,
'job_id': job.id,
'last_claimed': brozzler.EPOCH_UTC,
'last_disclaimed': brozzler.EPOCH_UTC,
'scope': {
'surt': 'http://(com,example,)/'
},
'seed': 'http://example.com',
'starts_and_stops': [
{
'start': sites[0].starts_and_stops[0]['start'],
'stop': None
}
],
'status': 'ACTIVE'
}
assert sites[1] == {
'claimed': False,
'enable_warcprox_features': False,
'id': sites[1].id,
'job_id': job.id,
'last_claimed': brozzler.EPOCH_UTC,
'last_disclaimed': brozzler.EPOCH_UTC,
'scope': {
'surt': 'https://(org,example,)/',
},
'seed': 'https://example.org/',
'starts_and_stops': [
{
'start': sites[1].starts_and_stops[0]['start'],
'stop': None,
},
],
'status': 'ACTIVE',
}
pages = list(frontier.site_pages(sites[0].id))
assert len(pages) == 1
assert pages[0] == {
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
'job_id': job.id,
'needs_robots_check': True,
'priority': 1000,
'site_id': sites[0].id,
'url': 'http://example.com',
}
pages = list(frontier.site_pages(sites[1].id))
assert len(pages) == 1
assert pages[0] == {
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
'job_id': job.id,
'needs_robots_check': True,
'priority': 1000,
'site_id': sites[1].id,
'url': 'https://example.org/',
}
# test "brozzled" parameter of frontier.site_pages
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
pages[0].brozzle_count = 1
pages[0].save()
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
pages[0].brozzle_count = 32819
pages[0].save()
assert len(list(frontier.site_pages(sites[1].id))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
def test_resume_job():
'''
Tests that the right stuff gets twiddled in rethinkdb when we "start" and