mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: three-value "brozzled" parameter for frontier.site_pages(); fix thing where every Site got a list of all the seeds from the job; and some more frontier tests to catch these kinds of things
This commit is contained in:
commit
a334ff5e69
@ -153,6 +153,11 @@ def _remove_query(url):
|
||||
site_surt_canon = urlcanon.Canonicalizer(
|
||||
urlcanon.semantic.steps + [_remove_query])
|
||||
|
||||
import doublethink
|
||||
import datetime
|
||||
# The Unix epoch (1970-01-01T00:00:00) as a timezone-aware UTC datetime.
# Site uses it as the initial value of last_claimed / last_disclaimed.
# Built directly from calendar fields instead of going through
# datetime.utcfromtimestamp(), which yields a naive datetime and is deprecated
# as of Python 3.12; the resulting value is identical.
EPOCH_UTC = datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)
|
||||
|
||||
from brozzler.site import Page, Site
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
|
@ -337,11 +337,20 @@ class RethinkDbFrontier:
|
||||
return None
|
||||
return brozzler.Page(self.rr, pages[0])
|
||||
|
||||
def site_pages(self, site_id, unbrozzled_only=False):
|
||||
def site_pages(self, site_id, brozzled=None):
|
||||
'''
|
||||
Args:
|
||||
site_id (str or int):
|
||||
brozzled (bool): if true, results include only pages that have
|
||||
been brozzled at least once; if false, only pages that have
|
||||
not been brozzled; and if None (the default), all pages
|
||||
Returns:
|
||||
iterator of brozzler.Page
|
||||
'''
|
||||
results = self.rr.table("pages").between(
|
||||
[site_id, 0 if unbrozzled_only else r.minval,
|
||||
[site_id, 1 if brozzled is True else 0,
|
||||
r.minval, r.minval],
|
||||
[site_id, 0 if unbrozzled_only else r.maxval,
|
||||
[site_id, 0 if brozzled is False else r.maxval,
|
||||
r.maxval, r.maxval],
|
||||
index="priority_by_site").run()
|
||||
for result in results:
|
||||
|
@ -80,6 +80,7 @@ def new_job(frontier, job_conf):
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
merged_conf.pop("seeds")
|
||||
merged_conf["job_id"] = job.id
|
||||
merged_conf["seed"] = merged_conf.pop("url")
|
||||
site = brozzler.Site(frontier.rr, merged_conf)
|
||||
|
@ -26,9 +26,6 @@ import doublethink
|
||||
import datetime
|
||||
import re
|
||||
|
||||
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
|
||||
tzinfo=doublethink.UTC)
|
||||
|
||||
class Site(doublethink.Document):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
table = 'sites'
|
||||
@ -41,9 +38,9 @@ class Site(doublethink.Document):
|
||||
if not "claimed" in self:
|
||||
self.claimed = False
|
||||
if not "last_disclaimed" in self:
|
||||
self.last_disclaimed = _EPOCH_UTC
|
||||
self.last_disclaimed = brozzler.EPOCH_UTC
|
||||
if not "last_claimed" in self:
|
||||
self.last_claimed = _EPOCH_UTC
|
||||
self.last_claimed = brozzler.EPOCH_UTC
|
||||
if not "scope" in self:
|
||||
self.scope = {}
|
||||
if not "surt" in self.scope and self.seed:
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev211',
|
||||
version='1.1b9.dev212',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -23,6 +23,7 @@ import logging
|
||||
import argparse
|
||||
import doublethink
|
||||
import time
|
||||
import datetime
|
||||
|
||||
args = argparse.Namespace()
|
||||
args.log_level = logging.INFO
|
||||
@ -34,6 +35,119 @@ def test_rethinkdb_up():
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
def test_basics():
    '''
    Creates a job with two seeds and checks the documents the frontier ends
    up with: the job itself, one site per seed, and one seed page per site.
    Then exercises the three-value "brozzled" parameter of
    frontier.site_pages().

    All document comparisons are exact-match, so an unexpected extra key
    (for instance a site carrying the job's whole "seeds" list) fails the
    test.

    Uses the database named "ignoreme" through doublethink.Rethinker;
    presumably needs a reachable rethinkdb server -- confirm against the
    test setup.
    '''
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)

    def expected_site(site, seed, surt):
        # id and start time are generated, so they are echoed back from the
        # document under test; everything else is a fixed expectation
        return {
            'claimed': False,
            'enable_warcprox_features': False,
            'id': site.id,
            'job_id': job.id,
            'last_claimed': brozzler.EPOCH_UTC,
            'last_disclaimed': brozzler.EPOCH_UTC,
            'scope': {
                'surt': surt,
            },
            'seed': seed,
            'starts_and_stops': [
                {
                    'start': site.starts_and_stops[0]['start'],
                    'stop': None,
                },
            ],
            'status': 'ACTIVE',
        }

    def expected_seed_page(site, url):
        # the single page a freshly created site is expected to have
        return {
            'brozzle_count': 0,
            'claimed': False,
            'hops_from_seed': 0,
            'hops_off_surt': 0,
            'id': brozzler.Page.compute_id(site.id, url),
            'job_id': job.id,
            'needs_robots_check': True,
            'priority': 1000,
            'site_id': site.id,
            'url': url,
        }

    def page_counts(site_id):
        # (all pages, brozzled at least once, never brozzled)
        return tuple(
                len(list(frontier.site_pages(site_id, **kwargs)))
                for kwargs in ({}, {'brozzled': True}, {'brozzled': False}))

    # the job document
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    assert job == {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'},
            ],
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {
                'start': job.starts_and_stops[0]['start'],
                'stop': None,
            },
        ],
    }

    # one site per seed; sorted by seed so the order is deterministic
    sites = sorted(frontier.job_sites(job.id), key=lambda x: x.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    assert sites[0] == expected_site(
            sites[0], 'http://example.com', 'http://(com,example,)/')
    assert sites[1] == expected_site(
            sites[1], 'https://example.org/', 'https://(org,example,)/')

    # each site starts out with exactly one page, for its seed url
    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    assert pages[0] == expected_seed_page(sites[0], 'http://example.com')
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    assert pages[0] == expected_seed_page(sites[1], 'https://example.org/')

    # test "brozzled" parameter of frontier.site_pages
    assert page_counts(sites[1].id) == (1, 0, 1)
    pages[0].brozzle_count = 1
    pages[0].save()
    assert page_counts(sites[1].id) == (1, 1, 0)
    # a large brozzle_count must still count as "brozzled"
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert page_counts(sites[1].id) == (1, 1, 0)
|
||||
|
||||
def test_resume_job():
|
||||
'''
|
||||
Tests that the right stuff gets twiddled in rethinkdb when we "start" and
|
||||
|
Loading…
x
Reference in New Issue
Block a user