order the page thumbnails on site page by least number of hops, so the seed shows up first

This commit is contained in:
Noah Levitt 2016-06-22 21:20:00 +00:00
parent 2038598f41
commit 510456eef2
3 changed files with 47 additions and 17 deletions

View File

@ -39,21 +39,43 @@ class RethinkDbFrontier:
def _ensure_db(self): def _ensure_db(self):
dbs = self.r.db_list().run() dbs = self.r.db_list().run()
if not self.r.dbname in dbs: if not self.r.dbname in dbs:
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname)) self.logger.info(
"creating rethinkdb database %s", repr(self.r.dbname))
self.r.db_create(self.r.dbname).run() self.r.db_create(self.r.dbname).run()
tables = self.r.table_list().run() tables = self.r.table_list().run()
if not "sites" in tables: if not "sites" in tables:
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname)) self.logger.info(
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run() "creating rethinkdb table 'sites' in database %s",
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run() repr(self.r.dbname))
self.r.table_create(
"sites", shards=self.shards, replicas=self.replicas).run()
self.r.table("sites").index_create(
"sites_last_disclaimed", [
self.r.row["status"],
self.r.row["last_disclaimed"]]).run()
self.r.table("sites").index_create("job_id").run() self.r.table("sites").index_create("job_id").run()
if not "pages" in tables: if not "pages" in tables:
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname)) self.logger.info(
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run() "creating rethinkdb table 'pages' in database %s",
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run() repr(self.r.dbname))
self.r.table_create(
"pages", shards=self.shards, replicas=self.replicas).run()
self.r.table("pages").index_create(
"priority_by_site", [
self.r.row["site_id"], self.r.row["brozzle_count"],
self.r.row["claimed"], self.r.row["priority"]]).run()
# This index is for displaying pages in a sensible order in the web
# console (fewest hops from the seed first within a site).
# ReQL terms are lazy: index_create() only builds the query, so .run() is
# required to actually create the index on the server. The key fields use
# self.r.row, like the other indexes created in this method.
self.r.table("pages").index_create(
        "least_hops", [
            self.r.row["site_id"], self.r.row["brozzle_count"],
            self.r.row["hops_from_seed"]]).run()
if not "jobs" in tables: if not "jobs" in tables:
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname)) self.logger.info(
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run() "creating rethinkdb table 'jobs' in database %s",
repr(self.r.dbname))
self.r.table_create(
"jobs", shards=self.shards, replicas=self.replicas).run()
def _vet_result(self, result, **kwargs): def _vet_result(self, result, **kwargs):
# self.logger.debug("vetting expected=%s result=%s", kwargs, result) # self.logger.debug("vetting expected=%s result=%s", kwargs, result)

View File

@ -21,7 +21,7 @@ import glob
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1.dev17', version='1.1.dev18',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View File

@ -24,14 +24,23 @@ import sys
import os import os
import importlib import importlib
import rethinkdb import rethinkdb
import logging
# XXX flask does its own logging config # flask does its own logging config
# import logging # logging.basicConfig(
# logging.basicConfig(stream=sys.stdout, level=logging.INFO, # stream=sys.stdout, level=logging.INFO,
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") # format=(
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
app = flask.Flask(__name__) app = flask.Flask(__name__)
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
# Attach the handlers of gunicorn's 'gunicorn.error' logger to the flask app
# logger, so messages logged through app.logger are emitted by those handlers
# too, and let INFO-level messages through.
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)
# NOTE(review): this looks like a leftover probe to check the wiring above;
# it is logged once at import time -- consider removing or rewording it.
app.logger.info('will this show in the log?')
# configure with environment variables # configure with environment variables
SETTINGS = { SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get( 'RETHINKDB_SERVERS': os.environ.get(
@ -82,9 +91,8 @@ def pages(site_id):
start = int(flask.request.args.get("start", 0)) start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90)) end = int(flask.request.args.get("end", start + 90))
pages_ = r.table("pages").between( pages_ = r.table("pages").between(
[site_id, 1, False, r.minval], [site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
[site_id, r.maxval, False, r.maxval], index="least_hops").order_by(index="least_hops")[start:end].run()
index="priority_by_site")[start:end].run()
return flask.jsonify(pages=list(pages_)) return flask.jsonify(pages=list(pages_))
@app.route("/api/sites/<site_id>") @app.route("/api/sites/<site_id>")