Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-05-02 14:46:18 -04:00

commit eeee523b18 (parent 0e9f4a0c26)

    three-value "brozzled" parameter for frontier.site_pages(); fix a bug where
    every Site got a list of all the seeds from the job; and some more frontier
    tests to catch these kinds of things

6 changed files with 135 additions and 9 deletions

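The headline change replaces site_pages()'s boolean unbrozzled_only flag with a
three-valued brozzled argument. A minimal usage sketch, assuming a reachable
RethinkDB; the db name and site_id below are placeholders, not from this commit:

    import doublethink
    import brozzler

    rr = doublethink.Rethinker(db='brozzler')    # assumed db name
    frontier = brozzler.RethinkDbFrontier(rr)
    site_id = 'some-site-id'                     # placeholder

    all_pages = list(frontier.site_pages(site_id))                  # brozzled=None: every page
    brozzled  = list(frontier.site_pages(site_id, brozzled=True))   # brozzled at least once
    pending   = list(frontier.site_pages(site_id, brozzled=False))  # never brozzled
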
@@ -153,6 +153,11 @@ def _remove_query(url):
 site_surt_canon = urlcanon.Canonicalizer(
         urlcanon.semantic.steps + [_remove_query])
 
+import doublethink
+import datetime
+EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
+        tzinfo=doublethink.UTC)
+
 from brozzler.site import Page, Site
 from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots

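This hunk adds a package-level EPOCH_UTC constant next to the existing seed-SURT
canonicalizer; the Site class further down switches to it in place of a
module-private copy. A minimal sketch of what the constant holds, assuming
doublethink is installed:

    import datetime
    import doublethink

    # the Unix epoch as a timezone-aware datetime, using doublethink's UTC tzinfo
    EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
            tzinfo=doublethink.UTC)
    assert EPOCH_UTC == datetime.datetime(1970, 1, 1, tzinfo=doublethink.UTC)
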
@@ -337,11 +337,20 @@ class RethinkDbFrontier:
             return None
         return brozzler.Page(self.rr, pages[0])
 
-    def site_pages(self, site_id, unbrozzled_only=False):
+    def site_pages(self, site_id, brozzled=None):
+        '''
+        Args:
+            site_id (str or int):
+            brozzled (bool): if true, results include only pages that have
+                been brozzled at least once; if false, only pages that have
+                not been brozzled; and if None (the default), all pages
+        Returns:
+            iterator of brozzler.Page
+        '''
         results = self.rr.table("pages").between(
-                [site_id, 0 if unbrozzled_only else r.minval,
+                [site_id, 1 if brozzled is True else 0,
                     r.minval, r.minval],
-                [site_id, 0 if unbrozzled_only else r.maxval,
+                [site_id, 0 if brozzled is False else r.maxval,
                     r.maxval, r.maxval],
                 index="priority_by_site").run()
         for result in results:

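The three cases are expressed purely as bounds on the second component of the
"priority_by_site" index in the between() query. A standalone sketch of that
mapping, under the assumption that the component is 0 for pages never brozzled
and >= 1 otherwise ("max" stands in for r.maxval so the sketch runs without
rethinkdb):

    def brozzled_bounds(brozzled):
        # mirrors the bound expressions in site_pages()
        lower = 1 if brozzled is True else 0
        upper = 0 if brozzled is False else "max"
        return lower, upper

    assert brozzled_bounds(None) == (0, "max")   # all pages
    assert brozzled_bounds(True) == (1, "max")   # brozzled at least once
    assert brozzled_bounds(False) == (0, 0)      # never brozzled

The new assertions at the end of test_basics (below) exercise exactly these
three cases.
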
@@ -80,6 +80,7 @@ def new_job(frontier, job_conf):
     sites = []
     for seed_conf in job_conf["seeds"]:
         merged_conf = merge(seed_conf, job_conf)
+        merged_conf.pop("seeds")
         merged_conf["job_id"] = job.id
         merged_conf["seed"] = merged_conf.pop("url")
         site = brozzler.Site(frontier.rr, merged_conf)

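This one-line addition is the fix for every Site carrying the whole seed list:
merge() folds the job configuration, including its "seeds" key, into each
per-seed configuration, so the key has to be dropped before the Site document
is created. A toy illustration, with plain dict unpacking standing in for
brozzler's merge():

    job_conf = {'seeds': [{'url': 'http://example.com'},
                          {'url': 'https://example.org/'}]}
    for seed_conf in job_conf['seeds']:
        merged_conf = {**job_conf, **seed_conf}   # stand-in for merge(seed_conf, job_conf)
        merged_conf.pop('seeds')                  # the fix: no job-wide seed list per site
        assert 'seeds' not in merged_conf
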
@@ -26,9 +26,6 @@ import doublethink
 import datetime
 import re
 
-_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
-        tzinfo=doublethink.UTC)
-
 class Site(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = 'sites'
@@ -41,9 +38,9 @@ class Site(doublethink.Document):
         if not "claimed" in self:
             self.claimed = False
         if not "last_disclaimed" in self:
-            self.last_disclaimed = _EPOCH_UTC
+            self.last_disclaimed = brozzler.EPOCH_UTC
         if not "last_claimed" in self:
-            self.last_claimed = _EPOCH_UTC
+            self.last_claimed = brozzler.EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
         if not "surt" in self.scope and self.seed:

setup.py (2 changed lines)
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev211',
+        version='1.1b9.dev212',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

@@ -23,6 +23,7 @@ import logging
 import argparse
 import doublethink
 import time
+import datetime
 
 args = argparse.Namespace()
 args.log_level = logging.INFO

@@ -34,6 +35,119 @@ def test_rethinkdb_up():
     tbls = rr.table_list().run()
     assert len(tbls) > 10
 
+def test_basics():
+    rr = doublethink.Rethinker(db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+    job_conf = {'seeds': [
+        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
+    job = brozzler.new_job(frontier, job_conf)
+    assert job.id
+    assert job.starts_and_stops
+    assert job.starts_and_stops[0]['start']
+    assert job == {
+        'id': job.id,
+        'conf': {
+            'seeds': [
+                {'url': 'http://example.com'},
+                {'url': 'https://example.org/'}
+            ]
+        },
+        'status': 'ACTIVE',
+        'starts_and_stops': [
+            {
+                'start': job.starts_and_stops[0]['start'],
+                'stop': None
+            }
+        ]
+    }
+
+    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
+    assert len(sites) == 2
+    assert sites[0].starts_and_stops[0]['start']
+    assert sites[1].starts_and_stops[0]['start']
+    assert sites[0] == {
+        'claimed': False,
+        'enable_warcprox_features': False,
+        'id': sites[0].id,
+        'job_id': job.id,
+        'last_claimed': brozzler.EPOCH_UTC,
+        'last_disclaimed': brozzler.EPOCH_UTC,
+        'scope': {
+            'surt': 'http://(com,example,)/'
+        },
+        'seed': 'http://example.com',
+        'starts_and_stops': [
+            {
+                'start': sites[0].starts_and_stops[0]['start'],
+                'stop': None
+            }
+        ],
+        'status': 'ACTIVE'
+    }
+    assert sites[1] == {
+        'claimed': False,
+        'enable_warcprox_features': False,
+        'id': sites[1].id,
+        'job_id': job.id,
+        'last_claimed': brozzler.EPOCH_UTC,
+        'last_disclaimed': brozzler.EPOCH_UTC,
+        'scope': {
+            'surt': 'https://(org,example,)/',
+        },
+        'seed': 'https://example.org/',
+        'starts_and_stops': [
+            {
+                'start': sites[1].starts_and_stops[0]['start'],
+                'stop': None,
+            },
+        ],
+        'status': 'ACTIVE',
+    }
+
+    pages = list(frontier.site_pages(sites[0].id))
+    assert len(pages) == 1
+    assert pages[0] == {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hops_from_seed': 0,
+        'hops_off_surt': 0,
+        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
+        'job_id': job.id,
+        'needs_robots_check': True,
+        'priority': 1000,
+        'site_id': sites[0].id,
+        'url': 'http://example.com',
+    }
+    pages = list(frontier.site_pages(sites[1].id))
+    assert len(pages) == 1
+    assert pages[0] == {
+        'brozzle_count': 0,
+        'claimed': False,
+        'hops_from_seed': 0,
+        'hops_off_surt': 0,
+        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
+        'job_id': job.id,
+        'needs_robots_check': True,
+        'priority': 1000,
+        'site_id': sites[1].id,
+        'url': 'https://example.org/',
+    }
+
+    # test "brozzled" parameter of frontier.site_pages
+    assert len(list(frontier.site_pages(sites[1].id))) == 1
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
+    pages[0].brozzle_count = 1
+    pages[0].save()
+    assert len(list(frontier.site_pages(sites[1].id))) == 1
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
+    pages[0].brozzle_count = 32819
+    pages[0].save()
+    assert len(list(frontier.site_pages(sites[1].id))) == 1
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
+    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
+
 def test_resume_job():
     '''
     Tests that the right stuff gets twiddled in rethinkdb when we "start" and