option to save list of outlinks (categorized as "accepted", "blocked" (by robots), or "rejected") per page in rethinkdb (to be used by archive-it for out-of-scope reporting)

Noah Levitt 2016-07-01 15:23:46 -05:00
parent 01e38ea8c7
commit 3e128d2b27
5 changed files with 21 additions and 7 deletions
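
In short: when a site opts in with the new remember_outlinks setting, the frontier records every outlink it evaluates on the parent page, bucketed the same way the existing counters are. A rough sketch of the structure that ends up on the page record in rethinkdb (the URLs are invented for illustration):

# Illustrative only: the outlinks field stored on a page when the owning
# site has remember_outlinks enabled (URLs are made up).
outlinks = {
    "accepted": ["https://example.org/about"],   # in scope, scheduled for crawling
    "blocked": ["https://example.org/private"],  # disallowed by robots.txt
    "rejected": ["https://offsite.example/"],    # out of scope for this site
}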

View File

@@ -163,7 +163,6 @@ def brozzler_new_job():
     frontier = brozzler.RethinkDbFrontier(r)
     brozzler.job.new_job_file(frontier, args.job_conf_file)
 
-
 def brozzler_new_site():
     '''
     Command line utility entry point for queuing a new brozzler site.

View File

@@ -292,6 +292,8 @@ class RethinkDbFrontier:
         self.update_page(page)
 
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
+        if site.remember_outlinks:
+            parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
             u = brozzler.site.Url(url)
@@ -314,10 +316,19 @@ class RethinkDbFrontier:
                     else:
                         self.new_page(new_child_page)
                         counts["added"] += 1
+                        if site.remember_outlinks:
+                            parent_page.outlinks["accepted"].append(url)
                 else:
                     counts["blocked"] += 1
+                    if site.remember_outlinks:
+                        parent_page.outlinks["blocked"].append(url)
             else:
                 counts["rejected"] += 1
+                if site.remember_outlinks:
+                    parent_page.outlinks["rejected"].append(url)
+
+        if site.remember_outlinks:
+            self.update_page(parent_page)
 
         self.logger.info(
                 "%s new links added, %s existing links updated, %s links "

View File

@@ -54,8 +54,8 @@ def new_job(frontier, job_conf):
 
         merged_conf = merge(seed_conf, job_conf)
         # XXX check for unknown settings, invalid url, etc
-        site = brozzler.Site(job_id=job.id,
-                seed=merged_conf["url"],
+        site = brozzler.Site(
+                job_id=job.id, seed=merged_conf["url"],
                 scope=merged_conf.get("scope"),
                 time_limit=merged_conf.get("time_limit"),
                 proxy=merged_conf.get("proxy"),
@@ -63,7 +63,8 @@ def new_job(frontier, job_conf):
                 enable_warcprox_features=merged_conf.get(
                     "enable_warcprox_features"),
                 warcprox_meta=merged_conf.get("warcprox_meta"),
-                metadata=merged_conf.get("metadata"))
+                metadata=merged_conf.get("metadata"),
+                remember_outlinks=merged_conf.get("remember_outlinks"))
         sites.append(site)
 
     # insert all the sites into database before the job
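
The new setting simply rides along with the rest of the merged seed/job configuration. A minimal sketch of the hand-off, assuming brozzler is importable; the merged_conf dict here is a stand-in for what new_job() builds from the YAML job configuration:

import brozzler

# Stand-in for the merged seed/job configuration; only the keys used in
# this sketch are shown.
merged_conf = {
    "url": "https://example.org/",
    "remember_outlinks": True,   # the new per-site switch
}

# Site fills in a default surt-based scope from the seed when none is given.
site = brozzler.Site(
        seed=merged_conf["url"],
        remember_outlinks=merged_conf.get("remember_outlinks"))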

View File

@@ -91,7 +91,7 @@ class Site(brozzler.BaseDictable):
            enable_warcprox_features=False, reached_limit=None,
            status="ACTIVE", claimed=False, start_time=None,
            last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
-            last_claimed=_EPOCH_UTC, metadata={}):
+            last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None):
 
         self.seed = seed
         self.id = id
@@ -109,6 +109,7 @@ class Site(brozzler.BaseDictable):
         self.last_disclaimed = last_disclaimed
         self.last_claimed = last_claimed
         self.metadata = metadata
+        self.remember_outlinks = remember_outlinks
 
         self.scope = scope or {}
         if not "surt" in self.scope:
@@ -218,7 +219,8 @@ class Page(brozzler.BaseDictable):
     def __init__(
             self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
             redirect_url=None, priority=None, claimed=False, brozzle_count=0,
-            via_page_id=None, last_claimed_by=None, hops_off_surt=0):
+            via_page_id=None, last_claimed_by=None, hops_off_surt=0,
+            outlinks=None):
         self.site_id = site_id
         self.job_id = job_id
         self.url = url
@@ -229,6 +231,7 @@ class Page(brozzler.BaseDictable):
         self.brozzle_count = brozzle_count
         self.via_page_id = via_page_id
         self.hops_off_surt = hops_off_surt
+        self.outlinks = outlinks
 
         self._canon_hurl = None
         if priority is not None:
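
Both new parameters default to None, so sites and pages created before this change, or with the feature left off, behave as before: the frontier's "if site.remember_outlinks:" checks are simply false and Page.outlinks stays None. A small sketch, assuming brozzler is importable:

import brozzler

# Defaults per the constructors above; nothing extra is recorded unless
# the site opts in.
site = brozzler.Site(seed="https://example.org/")
page = brozzler.Page("https://example.org/", site_id=site.id)

assert site.remember_outlinks is None
assert page.outlinks is None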

View File

@@ -21,7 +21,7 @@ import setuptools
 
 setuptools.setup(
         name='brozzler',
-        version='1.1.dev43',
+        version='1.1.dev44',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',