mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
Order the page thumbnails on the site page by fewest hops from the seed, so that the seed shows up first
This commit is contained in:
parent
2038598f41
commit
510456eef2
@ -39,21 +39,43 @@ class RethinkDbFrontier:
|
|||||||
def _ensure_db(self):
|
def _ensure_db(self):
|
||||||
dbs = self.r.db_list().run()
|
dbs = self.r.db_list().run()
|
||||||
if not self.r.dbname in dbs:
|
if not self.r.dbname in dbs:
|
||||||
self.logger.info("creating rethinkdb database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
|
"creating rethinkdb database %s", repr(self.r.dbname))
|
||||||
self.r.db_create(self.r.dbname).run()
|
self.r.db_create(self.r.dbname).run()
|
||||||
tables = self.r.table_list().run()
|
tables = self.r.table_list().run()
|
||||||
if not "sites" in tables:
|
if not "sites" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'sites' in database %s",
|
||||||
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["last_disclaimed"]]).run()
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"sites", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
self.r.table("sites").index_create(
|
||||||
|
"sites_last_disclaimed", [
|
||||||
|
self.r.row["status"],
|
||||||
|
self.r.row["last_disclaimed"]]).run()
|
||||||
self.r.table("sites").index_create("job_id").run()
|
self.r.table("sites").index_create("job_id").run()
|
||||||
if not "pages" in tables:
|
if not "pages" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'pages' in database %s",
|
||||||
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"pages", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
self.r.table("pages").index_create(
|
||||||
|
"priority_by_site", [
|
||||||
|
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||||
|
self.r.row["claimed"], self.r.row["priority"]]).run()
|
||||||
|
# this index is for displaying pages in a sensible order in the web
|
||||||
|
# console
|
||||||
|
self.r.table("pages").index_create(
|
||||||
|
"least_hops", [
|
||||||
|
self.r.row["site_id"], self.r.row["brozzle_count"],
|
||||||
|
self.r.row["hops_from_seed"]]).run()
|
||||||
if not "jobs" in tables:
|
if not "jobs" in tables:
|
||||||
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
|
self.logger.info(
|
||||||
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
|
"creating rethinkdb table 'jobs' in database %s",
|
||||||
|
repr(self.r.dbname))
|
||||||
|
self.r.table_create(
|
||||||
|
"jobs", shards=self.shards, replicas=self.replicas).run()
|
||||||
|
|
||||||
def _vet_result(self, result, **kwargs):
|
def _vet_result(self, result, **kwargs):
|
||||||
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
# self.logger.debug("vetting expected=%s result=%s", kwargs, result)
|
||||||
|
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import glob
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev17',
|
version='1.1.dev18',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -24,14 +24,23 @@ import sys
|
|||||||
import os
|
import os
|
||||||
import importlib
|
import importlib
|
||||||
import rethinkdb
|
import rethinkdb
|
||||||
|
import logging
|
||||||
|
|
||||||
# XXX flask does its own logging config
|
# flask does its own logging config
|
||||||
# import logging
|
# logging.basicConfig(
|
||||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
# stream=sys.stdout, level=logging.INFO,
|
||||||
# format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
# format=(
|
||||||
|
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
||||||
|
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s"))
|
||||||
|
|
||||||
app = flask.Flask(__name__)
|
app = flask.Flask(__name__)
|
||||||
|
|
||||||
|
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
|
||||||
|
gunicorn_error_logger = logging.getLogger('gunicorn.error')
|
||||||
|
app.logger.handlers.extend(gunicorn_error_logger.handlers)
|
||||||
|
app.logger.setLevel(logging.INFO)
|
||||||
|
app.logger.info('will this show in the log?')
|
||||||
|
|
||||||
# configure with environment variables
|
# configure with environment variables
|
||||||
SETTINGS = {
|
SETTINGS = {
|
||||||
'RETHINKDB_SERVERS': os.environ.get(
|
'RETHINKDB_SERVERS': os.environ.get(
|
||||||
@ -82,9 +91,8 @@ def pages(site_id):
|
|||||||
start = int(flask.request.args.get("start", 0))
|
start = int(flask.request.args.get("start", 0))
|
||||||
end = int(flask.request.args.get("end", start + 90))
|
end = int(flask.request.args.get("end", start + 90))
|
||||||
pages_ = r.table("pages").between(
|
pages_ = r.table("pages").between(
|
||||||
[site_id, 1, False, r.minval],
|
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||||
[site_id, r.maxval, False, r.maxval],
|
index="least_hops").order_by(index="least_hops")[start:end].run()
|
||||||
index="priority_by_site")[start:end].run()
|
|
||||||
return flask.jsonify(pages=list(pages_))
|
return flask.jsonify(pages=list(pages_))
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>")
|
@app.route("/api/sites/<site_id>")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user