mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 20:34:09 -04:00
Use the updated doublethink library's populate_defaults() hook to avoid a problem where, under certain circumstances, field values loaded from the database would be overwritten by defaults.
This commit is contained in:
parent
242ff51ec7
commit
01653c01d7
4 changed files with 123 additions and 54 deletions
|
@ -71,8 +71,8 @@ def new_job(frontier, job_conf):
|
||||||
'''Returns new Job.'''
|
'''Returns new Job.'''
|
||||||
validate_conf(job_conf)
|
validate_conf(job_conf)
|
||||||
job = Job(frontier.rr, {
|
job = Job(frontier.rr, {
|
||||||
"conf": job_conf,
|
"conf": job_conf, "status": "ACTIVE",
|
||||||
"status": "ACTIVE", "started": doublethink.utcnow()})
|
"started": doublethink.utcnow()})
|
||||||
if "id" in job_conf:
|
if "id" in job_conf:
|
||||||
job.id = job_conf["id"]
|
job.id = job_conf["id"]
|
||||||
job.save()
|
job.save()
|
||||||
|
@ -80,22 +80,9 @@ def new_job(frontier, job_conf):
|
||||||
sites = []
|
sites = []
|
||||||
for seed_conf in job_conf["seeds"]:
|
for seed_conf in job_conf["seeds"]:
|
||||||
merged_conf = merge(seed_conf, job_conf)
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
site = brozzler.Site(frontier.rr, {
|
merged_conf["job_id"] = job.id
|
||||||
"job_id": job.id,
|
merged_conf["seed"] = merged_conf.pop("url")
|
||||||
"seed": merged_conf["url"],
|
site = brozzler.Site(frontier.rr, merged_conf)
|
||||||
"scope": merged_conf.get("scope"),
|
|
||||||
"time_limit": merged_conf.get("time_limit"),
|
|
||||||
"proxy": merged_conf.get("proxy"),
|
|
||||||
"ignore_robots": merged_conf.get("ignore_robots"),
|
|
||||||
"enable_warcprox_features": merged_conf.get(
|
|
||||||
"enable_warcprox_features"),
|
|
||||||
"warcprox_meta": merged_conf.get("warcprox_meta"),
|
|
||||||
"metadata": merged_conf.get("metadata"),
|
|
||||||
"remember_outlinks": merged_conf.get("remember_outlinks"),
|
|
||||||
"user_agent": merged_conf.get("user_agent"),
|
|
||||||
"behavior_parameters": merged_conf.get("behavior_parameters"),
|
|
||||||
"username": merged_conf.get("username"),
|
|
||||||
"password": merged_conf.get("password")})
|
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
for site in sites:
|
for site in sites:
|
||||||
|
@ -127,9 +114,9 @@ class Job(doublethink.Document):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
table = "jobs"
|
table = "jobs"
|
||||||
|
|
||||||
def __init__(self, rr, d={}):
|
def populate_defaults(self):
|
||||||
doublethink.Document.__init__(self, rr, d)
|
if not "status" in self:
|
||||||
self.status = self.get("status", "ACTIVE")
|
self.status = "ACTIVE"
|
||||||
if not "starts_and_stops" in self:
|
if not "starts_and_stops" in self:
|
||||||
if self.get("started"): # backward compatibility
|
if self.get("started"): # backward compatibility
|
||||||
self.starts_and_stops = [{
|
self.starts_and_stops = [{
|
||||||
|
|
|
@ -92,28 +92,32 @@ class Site(doublethink.Document):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
table = 'sites'
|
table = 'sites'
|
||||||
|
|
||||||
def __init__(self, rr, d={}):
|
def populate_defaults(self):
|
||||||
doublethink.Document.__init__(self, rr, d)
|
if not "status" in self:
|
||||||
if not self.get('status'):
|
self.status = "ACTIVE"
|
||||||
self.status = 'ACTIVE'
|
if not "enable_warcprox_features" in self:
|
||||||
self.enable_warcprox_features = bool(self.get('enable_warcprox_features'))
|
self.enable_warcprox_features = False
|
||||||
self.claimed = bool(self.get('claimed'))
|
if not "claimed" in self:
|
||||||
self.last_disclaimed = self.get('last_disclaimed', _EPOCH_UTC)
|
self.claimed = False
|
||||||
self.last_claimed = self.get('last_claimed', _EPOCH_UTC)
|
if not "last_disclaimed" in self:
|
||||||
if not self.get('starts_and_stops'):
|
self.last_disclaimed = _EPOCH_UTC
|
||||||
if self.get('start_time'): # backward compatibility
|
if not "last_claimed" in self:
|
||||||
|
self.last_claimed = _EPOCH_UTC
|
||||||
|
if not "scope" in self:
|
||||||
|
self.scope = {}
|
||||||
|
if not "surt" in self.scope:
|
||||||
|
self.scope["surt"] = Url(self.seed).surt
|
||||||
|
|
||||||
|
if not "starts_and_stops" in self:
|
||||||
|
if self.get("start_time"): # backward compatibility
|
||||||
self.starts_and_stops = [{
|
self.starts_and_stops = [{
|
||||||
'start':self.get('start_time'),'stop':None}]
|
"start":self.get("start_time"),"stop":None}]
|
||||||
if self.get('status') != 'ACTIVE':
|
if self.get("status") != "ACTIVE":
|
||||||
self.starts_and_stops[0]['stop'] = self.last_disclaimed
|
self.starts_and_stops[0]["stop"] = self.last_disclaimed
|
||||||
del self['start_time']
|
del self["start_time"]
|
||||||
else:
|
else:
|
||||||
self.starts_and_stops = [
|
self.starts_and_stops = [
|
||||||
{'start':doublethink.utcnow(),'stop':None}]
|
{"start":doublethink.utcnow(),"stop":None}]
|
||||||
if not self.scope:
|
|
||||||
self.scope = {}
|
|
||||||
if not 'surt' in self.scope:
|
|
||||||
self.scope['surt'] = Url(self.seed).surt
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)
|
||||||
|
@ -286,19 +290,22 @@ class Site(doublethink.Document):
|
||||||
|
|
||||||
class Page(doublethink.Document):
|
class Page(doublethink.Document):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
table = 'pages'
|
table = "pages"
|
||||||
|
|
||||||
def __init__(self, rr, d={}):
|
def populate_defaults(self):
|
||||||
doublethink.Document.__init__(self, rr, d)
|
if not "hops_from_seed" in self:
|
||||||
self.hops_from_seed = self.get('hops_from_seed', 0)
|
self.hops_from_seed = 0
|
||||||
self.brozzle_count = self.get('brozzle_count', 0)
|
if not "brozzle_count" in self:
|
||||||
self.claimed = self.get('claimed', False)
|
self.brozzle_count = 0
|
||||||
self.hops_off_surt = self.get('hops_off_surt', 0)
|
if not "claimed" in self:
|
||||||
self.needs_robots_check = self.get('needs_robots_check', False)
|
self.claimed = False
|
||||||
self._canon_hurl = None
|
if not "hops_off_surt" in self:
|
||||||
|
self.hops_off_surt = 0
|
||||||
self.priority = self.get('priority', self._calc_priority())
|
if not "needs_robots_check" in self:
|
||||||
if self.get('id') is None:
|
self.needs_robots_check = False
|
||||||
|
if not "priority" in self:
|
||||||
|
self.priority = self._calc_priority()
|
||||||
|
if not "id" in self:
|
||||||
digest_this = "site_id:%s,url:%s" % (self.site_id, self.url)
|
digest_this = "site_id:%s,url:%s" % (self.site_id, self.url)
|
||||||
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev199',
|
version='1.1b9.dev200',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -69,7 +69,7 @@ setuptools.setup(
|
||||||
'websocket-client!=0.39.0',
|
'websocket-client!=0.39.0',
|
||||||
'pillow==3.3.0',
|
'pillow==3.3.0',
|
||||||
'surt>=0.3.0',
|
'surt>=0.3.0',
|
||||||
'doublethink>=0.2.0.dev66',
|
'doublethink>=0.2.0.dev70',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'cerberus==1.0.1',
|
'cerberus==1.0.1',
|
||||||
'jinja2',
|
'jinja2',
|
||||||
|
|
|
@ -172,3 +172,78 @@ def test_time_limit():
|
||||||
assert site.starts_and_stops[1]['start']
|
assert site.starts_and_stops[1]['start']
|
||||||
assert site.starts_and_stops[1]['stop']
|
assert site.starts_and_stops[1]['stop']
|
||||||
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
|
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
|
||||||
|
|
||||||
|
def test_field_defaults():
|
||||||
|
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||||
|
|
||||||
|
# page
|
||||||
|
brozzler.Page.table_ensure(rr)
|
||||||
|
page = brozzler.Page(rr, {'hops_from_seed': 3})
|
||||||
|
assert page.hops_from_seed == 3
|
||||||
|
assert page.id is None
|
||||||
|
assert page.brozzle_count is None
|
||||||
|
page.save()
|
||||||
|
assert page.hops_from_seed == 3
|
||||||
|
assert page.id
|
||||||
|
assert page.brozzle_count == 0
|
||||||
|
|
||||||
|
qage = brozzler.Page.load(rr, page.id)
|
||||||
|
assert qage.hops_from_seed == 3
|
||||||
|
assert qage.id == page.id
|
||||||
|
assert qage.brozzle_count == 0
|
||||||
|
qage.save()
|
||||||
|
assert qage.hops_from_seed == 3
|
||||||
|
assert qage.id == page.id
|
||||||
|
assert qage.brozzle_count == 0
|
||||||
|
qage.refresh()
|
||||||
|
assert qage.hops_from_seed == 3
|
||||||
|
assert qage.id == page.id
|
||||||
|
assert qage.brozzle_count == 0
|
||||||
|
|
||||||
|
# site
|
||||||
|
brozzler.Site.table_ensure(rr)
|
||||||
|
site = brozzler.Site(rr, {'enable_warcprox_features': True})
|
||||||
|
assert site.enable_warcprox_features is True
|
||||||
|
assert site.id is None
|
||||||
|
assert site.scope is None
|
||||||
|
site.save()
|
||||||
|
assert site.id
|
||||||
|
assert site.scope
|
||||||
|
|
||||||
|
tite = brozzler.Site.load(rr, site.id)
|
||||||
|
assert tite.enable_warcprox_features is True
|
||||||
|
assert tite.id == site.id
|
||||||
|
assert tite.scope == site.scope
|
||||||
|
tite.save()
|
||||||
|
assert tite.enable_warcprox_features is True
|
||||||
|
assert tite.id == site.id
|
||||||
|
assert tite.scope == site.scope
|
||||||
|
tite.refresh()
|
||||||
|
assert tite.enable_warcprox_features is True
|
||||||
|
assert tite.id == site.id
|
||||||
|
assert tite.scope == site.scope
|
||||||
|
|
||||||
|
# job
|
||||||
|
brozzler.Job.table_ensure(rr)
|
||||||
|
job = brozzler.Job(rr, {'status': 'WHUUUT'})
|
||||||
|
assert job.status == 'WHUUUT'
|
||||||
|
assert job.id is None
|
||||||
|
assert job.starts_and_stops is None
|
||||||
|
job.save()
|
||||||
|
assert job.status == 'WHUUUT'
|
||||||
|
assert job.id
|
||||||
|
assert job.starts_and_stops
|
||||||
|
|
||||||
|
kob = brozzler.Job.load(rr, job.id)
|
||||||
|
assert kob.status == 'WHUUUT'
|
||||||
|
assert kob.id
|
||||||
|
assert kob.starts_and_stops
|
||||||
|
kob.save()
|
||||||
|
assert kob.status == 'WHUUUT'
|
||||||
|
assert kob.id
|
||||||
|
assert kob.starts_and_stops
|
||||||
|
kob.refresh()
|
||||||
|
assert kob.status == 'WHUUUT'
|
||||||
|
assert kob.id
|
||||||
|
assert kob.starts_and_stops
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue