Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-04-21 16:16:28 -04:00

commit da6e07fb61

Merge branch 'master' into qa

* master:
  use %r instead of calling repr()
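
The change is mechanical throughout: wherever a log or format string interpolated an explicit repr(...) call via %s, it now uses the %r conversion and passes the value directly. A minimal sketch of the two equivalent styles (the job_id value below is illustrative, not taken from the diff):

    import logging

    logging.basicConfig(level=logging.INFO)
    job_id = 'some-job-id'  # illustrative value, not from the diff

    # old style: call repr() eagerly and interpolate the result with %s
    logging.error('no such job with id %s', repr(job_id))

    # new style: let the %r conversion apply repr() during formatting;
    # the logging module formats lazily, so repr() only runs if the
    # record is actually emitted
    logging.error('no such job with id %r', job_id)

    # both emit: ERROR:root:no such job with id 'some-job-id'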

@@ -47,9 +47,9 @@ class ReachedLimit(Exception):
         self.http_payload = http_payload

     def __repr__(self):
-        return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % (
-                repr(self.warcprox_meta) if hasattr(self, 'warcprox_meta') else None,
-                repr(self.http_payload) if hasattr(self, 'http_payload') else None)
+        return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % (
+                self.warcprox_meta if hasattr(self, 'warcprox_meta') else None,
+                self.http_payload if hasattr(self, 'http_payload') else None)

     def __str__(self):
         return self.__repr__()

@@ -94,8 +94,8 @@ def behavior_script(url, template_parameters=None):
                     behavior['behavior_js_template'])
             script = template.render(parameters)
             logging.info(
-                    'using template=%s populated with parameters=%s for %s',
-                    repr(behavior['behavior_js_template']), parameters, url)
+                    'using template=%r populated with parameters=%r for %r',
+                    behavior['behavior_js_template'], parameters, url)
             return script
     return None

@@ -129,8 +129,8 @@ class ThreadExceptionGate:
         with self.lock:
             if self.pending_exception:
                 self.logger.warn(
-                        '%s already pending for thread %s, discarding %s',
-                        repr(self.pending_exception), self.thread, repr(e))
+                        '%r already pending for thread %r, discarding %r',
+                        self.pending_exception, self.thread, e)
             else:
                 self.pending_exception = e

@@ -137,8 +137,7 @@ class Chrome:
         if proxy:
             chrome_args.append('--proxy-server=%s' % proxy)
         chrome_args.append('about:blank')
-        self.logger.info(
-                'running: %s', repr(subprocess.list2cmdline(chrome_args)))
+        self.logger.info('running: %r', subprocess.list2cmdline(chrome_args))
         # start_new_session - new process group so we can kill the whole group
         self.chrome_process = subprocess.Popen(
                 chrome_args, env=new_env, start_new_session=True,

@@ -173,7 +172,7 @@ class Chrome:
                     return url
             except brozzler.ShutdownRequested:
                 raise
-            except BaseException as e:
+            except Exception as e:
                 if time.time() - self._last_warning > 30:
                     self.logger.warn(
                             'problem with %s (will keep trying until timeout '
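
Note that this hunk changes more than the repr() style: the retry loop now catches Exception rather than BaseException, so BaseException subclasses such as KeyboardInterrupt and SystemExit propagate instead of being swallowed and retried. A quick check of the relevant hierarchy:

    # KeyboardInterrupt and SystemExit derive from BaseException, not Exception,
    # so an `except Exception` handler lets them propagate out of the loop.
    print(issubclass(KeyboardInterrupt, BaseException))  # True
    print(issubclass(KeyboardInterrupt, Exception))      # False
    print(issubclass(SystemExit, Exception))             # False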

@@ -413,7 +413,7 @@ def brozzler_list_jobs(argv=None):
         if result:
             results = [reql.run()]
         else:
-            logging.error('no such job with id %s', repr(job_id))
+            logging.error('no such job with id %r', job_id)
             sys.exit(1)
     else:
         reql = rr.table('jobs').order_by('id')

@@ -657,7 +657,7 @@ def brozzler_stop_crawl(argv=None):
             job_id = args.job_id
         job = brozzler.Job.load(rr, job_id)
         if not job:
-            logging.fatal('job not found with id=%s', repr(job_id))
+            logging.fatal('job not found with id=%r', job_id)
             sys.exit(1)
         job.stop_requested = doublethink.utcnow()
         job.save()

@@ -668,7 +668,7 @@ def brozzler_stop_crawl(argv=None):
             site_id = args.site_id
         site = brozzler.Site.load(rr, site_id)
         if not site:
-            logging.fatal('site not found with id=%s', repr(site_id))
+            logging.fatal('site not found with id=%r', site_id)
             sys.exit(1)
         site.stop_requested = doublethink.utcnow()
         site.save()

@@ -40,14 +40,13 @@ class RethinkDbFrontier:
     def _ensure_db(self):
         dbs = self.rr.db_list().run()
         if not self.rr.dbname in dbs:
-            self.logger.info(
-                    "creating rethinkdb database %s", repr(self.rr.dbname))
+            self.logger.info("creating rethinkdb database %r", self.rr.dbname)
             self.rr.db_create(self.rr.dbname).run()
         tables = self.rr.table_list().run()
         if not "sites" in tables:
             self.logger.info(
-                    "creating rethinkdb table 'sites' in database %s",
-                    repr(self.rr.dbname))
+                    "creating rethinkdb table 'sites' in database %r",
+                    self.rr.dbname)
             self.rr.table_create(
                     "sites", shards=self.shards, replicas=self.replicas).run()
             self.rr.table("sites").index_create("sites_last_disclaimed", [

@@ -55,8 +54,8 @@ class RethinkDbFrontier:
             self.rr.table("sites").index_create("job_id").run()
         if not "pages" in tables:
             self.logger.info(
-                    "creating rethinkdb table 'pages' in database %s",
-                    repr(self.rr.dbname))
+                    "creating rethinkdb table 'pages' in database %r",
+                    self.rr.dbname)
             self.rr.table_create(
                     "pages", shards=self.shards, replicas=self.replicas).run()
             self.rr.table("pages").index_create("priority_by_site", [

@@ -69,8 +68,8 @@ class RethinkDbFrontier:
                 r.row["hops_from_seed"]]).run()
         if not "jobs" in tables:
             self.logger.info(
-                    "creating rethinkdb table 'jobs' in database %s",
-                    repr(self.rr.dbname))
+                    "creating rethinkdb table 'jobs' in database %r",
+                    self.rr.dbname)
             self.rr.table_create(
                     "jobs", shards=self.shards, replicas=self.replicas).run()

@@ -86,10 +85,13 @@ class RethinkDbFrontier:
                 expected = 0
             if isinstance(expected, list):
                 if result.get(k) not in kwargs[k]:
-                    raise UnexpectedDbResult("expected {} to be one of {} in {}".format(repr(k), expected, result))
+                    raise UnexpectedDbResult(
+                            "expected %r to be one of %r in %r" % (
+                                k, expected, result))
             else:
                 if result.get(k) != expected:
-                    raise UnexpectedDbResult("expected {} to be {} in {}".format(repr(k), expected, result))
+                    raise UnexpectedDbResult("expected %r to be %r in %r" % (
+                        k, expected, result))

     def claim_site(self, worker_id):
         # XXX keep track of aggregate priority and prioritize sites accordingly?
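
The UnexpectedDbResult hunk above also moves from str.format() with an explicit repr() call to %-style formatting. The new form applies repr() to all three interpolated values via %r, not just to the key; a small sketch with made-up values:

    k, expected, result = 'skipped', 0, {'skipped': 1}  # made-up sample values

    # old: only k was repr()'d explicitly; expected and result went through str()
    old = "expected {} to be {} in {}".format(repr(k), expected, result)
    # new: %r applies repr() to every argument
    new = "expected %r to be %r in %r" % (k, expected, result)

    print(old)  # expected 'skipped' to be 0 in {'skipped': 1}
    print(new)  # expected 'skipped' to be 0 in {'skipped': 1}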

@@ -112,12 +112,12 @@ def is_permitted_by_robots(site, url, proxy=None):
             else:
                 if tries_left > 0:
                     logging.warn(
-                            "caught exception fetching robots.txt (%s tries "
-                            "left) for %s: %s", tries_left, url, repr(e))
+                            "caught exception fetching robots.txt (%r tries "
+                            "left) for %r: %r", tries_left, url, e)
                     tries_left -= 1
                 else:
                     logging.error(
                             "caught exception fetching robots.txt (0 tries "
-                            "left) for %s: %s", url, repr(e), exc_info=True)
+                            "left) for %r: %r", url, e, exc_info=True)
                     return False

@@ -135,8 +135,8 @@ class BrozzlerWorker:
             site.proxy = '%s:%s' % (svc['host'], svc['port'])
             site.save()
             self.logger.info(
-                    'chose warcprox instance %s from service registry for %s',
-                    repr(site.proxy), site)
+                    'chose warcprox instance %r from service registry for %r',
+                    site.proxy, site)
             return site.proxy
         return None

@@ -428,8 +428,8 @@ class BrozzlerWorker:
             page = None
             self._frontier.honor_stop_request(site)
             self.logger.info(
-                    "brozzling site (proxy=%s) %s",
-                    repr(self._proxy_for(site)), site)
+                    "brozzling site (proxy=%r) %r",
+                    self._proxy_for(site), site)
             start = time.time()
             while time.time() - start < 7 * 60:
                 site.refresh()

@@ -473,7 +473,7 @@ class BrozzlerWorker:
                 # using brozzler-worker --proxy, nothing to do but try the
                 # same proxy again next time
                 logging.error(
-                        'proxy error (site.proxy=%s): %s', repr(site.proxy), e)
+                        'proxy error (site.proxy=%r): %r', site.proxy, e)
         except:
             self.logger.critical("unexpected exception", exc_info=True)
         finally: