diff --git a/brozzler/__init__.py b/brozzler/__init__.py index aefbaf0..1c410ab 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -47,9 +47,9 @@ class ReachedLimit(Exception): self.http_payload = http_payload def __repr__(self): - return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % ( - repr(self.warcprox_meta) if hasattr(self, 'warcprox_meta') else None, - repr(self.http_payload) if hasattr(self, 'http_payload') else None) + return "ReachedLimit(warcprox_meta=%r,http_payload=%r)" % ( + self.warcprox_meta if hasattr(self, 'warcprox_meta') else None, + self.http_payload if hasattr(self, 'http_payload') else None) def __str__(self): return self.__repr__() @@ -94,8 +94,8 @@ def behavior_script(url, template_parameters=None): behavior['behavior_js_template']) script = template.render(parameters) logging.info( - 'using template=%s populated with parameters=%s for %s', - repr(behavior['behavior_js_template']), parameters, url) + 'using template=%r populated with parameters=%r for %r', + behavior['behavior_js_template'], parameters, url) return script return None @@ -129,8 +129,8 @@ class ThreadExceptionGate: with self.lock: if self.pending_exception: self.logger.warn( - '%s already pending for thread %s, discarding %s', - repr(self.pending_exception), self.thread, repr(e)) + '%r already pending for thread %r, discarding %r', + self.pending_exception, self.thread, e) else: self.pending_exception = e diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 87316cf..23708d7 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -137,8 +137,7 @@ class Chrome: if proxy: chrome_args.append('--proxy-server=%s' % proxy) chrome_args.append('about:blank') - self.logger.info( - 'running: %s', repr(subprocess.list2cmdline(chrome_args))) + self.logger.info('running: %r', subprocess.list2cmdline(chrome_args)) # start_new_session - new process group so we can kill the whole group self.chrome_process = subprocess.Popen( chrome_args, env=new_env, start_new_session=True, @@ -173,7 +172,7 @@ class Chrome: return url except brozzler.ShutdownRequested: raise - except BaseException as e: + except Exception as e: if time.time() - self._last_warning > 30: self.logger.warn( 'problem with %s (will keep trying until timeout ' diff --git a/brozzler/cli.py b/brozzler/cli.py index 8c9faaa..6dab55b 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -413,7 +413,7 @@ def brozzler_list_jobs(argv=None): if result: results = [reql.run()] else: - logging.error('no such job with id %s', repr(job_id)) + logging.error('no such job with id %r', job_id) sys.exit(1) else: reql = rr.table('jobs').order_by('id') @@ -657,7 +657,7 @@ def brozzler_stop_crawl(argv=None): job_id = args.job_id job = brozzler.Job.load(rr, job_id) if not job: - logging.fatal('job not found with id=%s', repr(job_id)) + logging.fatal('job not found with id=%r', job_id) sys.exit(1) job.stop_requested = doublethink.utcnow() job.save() @@ -668,7 +668,7 @@ def brozzler_stop_crawl(argv=None): site_id = args.site_id site = brozzler.Site.load(rr, site_id) if not site: - logging.fatal('site not found with id=%s', repr(site_id)) + logging.fatal('site not found with id=%r', site_id) sys.exit(1) site.stop_requested = doublethink.utcnow() site.save() diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 8adaab1..86bfe48 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -40,14 +40,13 @@ class RethinkDbFrontier: def _ensure_db(self): dbs = self.rr.db_list().run() if not self.rr.dbname in dbs: - self.logger.info( - "creating rethinkdb database %s", repr(self.rr.dbname)) + self.logger.info("creating rethinkdb database %r", self.rr.dbname) self.rr.db_create(self.rr.dbname).run() tables = self.rr.table_list().run() if not "sites" in tables: self.logger.info( - "creating rethinkdb table 'sites' in database %s", - repr(self.rr.dbname)) + "creating rethinkdb table 'sites' in database %r", + self.rr.dbname) self.rr.table_create( "sites", shards=self.shards, replicas=self.replicas).run() self.rr.table("sites").index_create("sites_last_disclaimed", [ @@ -55,8 +54,8 @@ class RethinkDbFrontier: self.rr.table("sites").index_create("job_id").run() if not "pages" in tables: self.logger.info( - "creating rethinkdb table 'pages' in database %s", - repr(self.rr.dbname)) + "creating rethinkdb table 'pages' in database %r", + self.rr.dbname) self.rr.table_create( "pages", shards=self.shards, replicas=self.replicas).run() self.rr.table("pages").index_create("priority_by_site", [ @@ -69,8 +68,8 @@ class RethinkDbFrontier: r.row["hops_from_seed"]]).run() if not "jobs" in tables: self.logger.info( - "creating rethinkdb table 'jobs' in database %s", - repr(self.rr.dbname)) + "creating rethinkdb table 'jobs' in database %r", + self.rr.dbname) self.rr.table_create( "jobs", shards=self.shards, replicas=self.replicas).run() @@ -86,10 +85,13 @@ class RethinkDbFrontier: expected = 0 if isinstance(expected, list): if result.get(k) not in kwargs[k]: - raise UnexpectedDbResult("expected {} to be one of {} in {}".format(repr(k), expected, result)) + raise UnexpectedDbResult( + "expected %r to be one of %r in %r" % ( + k, expected, result)) else: if result.get(k) != expected: - raise UnexpectedDbResult("expected {} to be {} in {}".format(repr(k), expected, result)) + raise UnexpectedDbResult("expected %r to be %r in %r" % ( + k, expected, result)) def claim_site(self, worker_id): # XXX keep track of aggregate priority and prioritize sites accordingly? diff --git a/brozzler/robots.py b/brozzler/robots.py index 046ef22..aef9913 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -112,12 +112,12 @@ def is_permitted_by_robots(site, url, proxy=None): else: if tries_left > 0: logging.warn( - "caught exception fetching robots.txt (%s tries " - "left) for %s: %s", tries_left, url, repr(e)) + "caught exception fetching robots.txt (%r tries " + "left) for %r: %r", tries_left, url, e) tries_left -= 1 else: logging.error( "caught exception fetching robots.txt (0 tries " - "left) for %s: %s", url, repr(e), exc_info=True) + "left) for %r: %r", url, e, exc_info=True) return False diff --git a/brozzler/worker.py b/brozzler/worker.py index 6cf6cd7..19a8d34 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -135,8 +135,8 @@ class BrozzlerWorker: site.proxy = '%s:%s' % (svc['host'], svc['port']) site.save() self.logger.info( - 'chose warcprox instance %s from service registry for %s', - repr(site.proxy), site) + 'chose warcprox instance %r from service registry for %r', + site.proxy, site) return site.proxy return None @@ -428,8 +428,8 @@ class BrozzlerWorker: page = None self._frontier.honor_stop_request(site) self.logger.info( - "brozzling site (proxy=%s) %s", - repr(self._proxy_for(site)), site) + "brozzling site (proxy=%r) %r", + self._proxy_for(site), site) start = time.time() while time.time() - start < 7 * 60: site.refresh() @@ -473,7 +473,7 @@ class BrozzlerWorker: # using brozzler-worker --proxy, nothing to do but try the # same proxy again next time logging.error( - 'proxy error (site.proxy=%s): %s', repr(site.proxy), e) + 'proxy error (site.proxy=%r): %r', site.proxy, e) except: self.logger.critical("unexpected exception", exc_info=True) finally: