mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
order the page thumbnails on site page by least number of hops, so the seed shows up first
This commit is contained in:
parent
2038598f41
commit
510456eef2
@@ -39,21 +39,43 @@ class RethinkDbFrontier:
|
||||
def _ensure_db(self):
|
||||
dbs = self.r.db_list().run()
|
||||
if not self.r.dbname in dbs:
|
||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
||||
self.logger.info(
|
||||
"creating rethinkdb database %s", repr(self.r.dbname))
|
||||
self.r.db_create(self.r.dbname).run()
|
||||
tables = self.r.table_list().run()
|
||||
if not "sites" in tables:
|
||||
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'sites' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"sites", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("sites").index_create(
|
||||
"sites_last_disclaimed", [
|
||||
self.r.row["status"],
|
||||
self.r.row["last_disclaimed"]]).run()
|
||||
self.r.table("sites").index_create("job_id").run()
|
||||
if not "pages" in tables:
|
||||
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'pages' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"pages", shards=self.shards, replicas=self.replicas).run()
|
||||
self.r.table("pages").index_create(
|
||||
"priority_by_site", [
|
||||
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||
self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||
# this index is for displaying pages in a sensible order in the web
|
||||
# console
|
||||
self.r.table("pages").index_create(
|
||||
"least_hops", [
|
||||
r.row["site_id"], r.row["brozzle_count"],
|
||||
r.row["hops_from_seed"]])
|
||||
if not "jobs" in tables:
|
||||
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
|
||||
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
|
||||
self.logger.info(
|
||||
"creating rethinkdb table 'jobs' in database %s",
|
||||
repr(self.r.dbname))
|
||||
self.r.table_create(
|
||||
"jobs", shards=self.shards, replicas=self.replicas).run()
|
||||
|
||||
def _vet_result(self, result, **kwargs):
|
||||
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
||||
|
2
setup.py
2
setup.py
@@ -21,7 +21,7 @@ import glob
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1.dev17',
|
||||
version='1.1.dev18',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@@ -24,14 +24,23 @@ import sys
|
||||
import os
|
||||
import importlib
|
||||
import rethinkdb
|
||||
import logging
|
||||
|
||||
# XXX flask does its own logging config
|
||||
# import logging
|
||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
||||
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
# flask does its own logging config
|
||||
# logging.basicConfig(
|
||||
# stream=sys.stdout, level=logging.INFO,
|
||||
# format=(
|
||||
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
||||
|
||||
app = flask.Flask(__name__)
|
||||
|
||||
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
|
||||
gunicorn_error_logger = logging.getLogger('gunicorn.error')
|
||||
app.logger.handlers.extend(gunicorn_error_logger.handlers)
|
||||
app.logger.setLevel(logging.INFO)
|
||||
app.logger.info('will this show in the log?')
|
||||
|
||||
# configure with environment variables
|
||||
SETTINGS = {
|
||||
'RETHINKDB_SERVERS': os.environ.get(
|
||||
@@ -82,9 +91,8 @@ def pages(site_id):
|
||||
start = int(flask.request.args.get("start", 0))
|
||||
end = int(flask.request.args.get("end", start + 90))
|
||||
pages_ = r.table("pages").between(
|
||||
[site_id, 1, False, r.minval],
|
||||
[site_id, r.maxval, False, r.maxval],
|
||||
index="priority_by_site")[start:end].run()
|
||||
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||
index="least_hops").order_by(index="least_hops")[start:end].run()
|
||||
return flask.jsonify(pages=list(pages_))
|
||||
|
||||
@app.route("/api/sites/<site_id>")
|
||||
|
Loading…
x
Reference in New Issue
Block a user