From 3e128d2b271b6bab98cce29ac4aef0f0f6a16be7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 1 Jul 2016 15:23:46 -0500 Subject: [PATCH] option to save list of outlinks (categorized as "accepted", "blocked" (by robots), or "rejected") per page in rethinkdb (to be used by archive-it for out-of-scope reporting) --- brozzler/cli.py | 1 - brozzler/frontier.py | 11 +++++++++++ brozzler/job.py | 7 ++++--- brozzler/site.py | 7 +++++-- setup.py | 2 +- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 972d521..76a33a4 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -163,7 +163,6 @@ def brozzler_new_job(): frontier = brozzler.RethinkDbFrontier(r) brozzler.job.new_job_file(frontier, args.job_conf_file) - def brozzler_new_site(): ''' Command line utility entry point for queuing a new brozzler site. diff --git a/brozzler/frontier.py b/brozzler/frontier.py index c6185b8..afa0858 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -292,6 +292,8 @@ class RethinkDbFrontier: self.update_page(page) def scope_and_schedule_outlinks(self, site, parent_page, outlinks): + if site.remember_outlinks: + parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]} counts = {"added":0,"updated":0,"rejected":0,"blocked":0} for url in outlinks or []: u = brozzler.site.Url(url) @@ -314,10 +316,19 @@ class RethinkDbFrontier: else: self.new_page(new_child_page) counts["added"] += 1 + if site.remember_outlinks: + parent_page.outlinks["accepted"].append(url) else: counts["blocked"] += 1 + if site.remember_outlinks: + parent_page.outlinks["blocked"].append(url) else: counts["rejected"] += 1 + if site.remember_outlinks: + parent_page.outlinks["rejected"].append(url) + + if site.remember_outlinks: + self.update_page(parent_page) self.logger.info( "%s new links added, %s existing links updated, %s links " diff --git a/brozzler/job.py b/brozzler/job.py index fb3d720..bfaef4d 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -54,8 +54,8 @@ def new_job(frontier, job_conf): merged_conf = merge(seed_conf, job_conf) # XXX check for unknown settings, invalid url, etc - site = brozzler.Site(job_id=job.id, - seed=merged_conf["url"], + site = brozzler.Site( + job_id=job.id, seed=merged_conf["url"], scope=merged_conf.get("scope"), time_limit=merged_conf.get("time_limit"), proxy=merged_conf.get("proxy"), @@ -63,7 +63,8 @@ def new_job(frontier, job_conf): enable_warcprox_features=merged_conf.get( "enable_warcprox_features"), warcprox_meta=merged_conf.get("warcprox_meta"), - metadata=merged_conf.get("metadata")) + metadata=merged_conf.get("metadata"), + remember_outlinks=merged_conf.get("remember_outlinks")) sites.append(site) # insert all the sites into database before the job diff --git a/brozzler/site.py b/brozzler/site.py index 53593de..8167924 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -91,7 +91,7 @@ class Site(brozzler.BaseDictable): enable_warcprox_features=False, reached_limit=None, status="ACTIVE", claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, - last_claimed=_EPOCH_UTC, metadata={}): + last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None): self.seed = seed self.id = id @@ -109,6 +109,7 @@ class Site(brozzler.BaseDictable): self.last_disclaimed = last_disclaimed self.last_claimed = last_claimed self.metadata = metadata + self.remember_outlinks = remember_outlinks self.scope = scope or {} if not "surt" in self.scope: @@ -218,7 +219,8 @@ class Page(brozzler.BaseDictable): def __init__( self, url, id=None, site_id=None, job_id=None, hops_from_seed=0, redirect_url=None, priority=None, claimed=False, brozzle_count=0, - via_page_id=None, last_claimed_by=None, hops_off_surt=0): + via_page_id=None, last_claimed_by=None, hops_off_surt=0, + outlinks=None): self.site_id = site_id self.job_id = job_id self.url = url @@ -229,6 +231,7 @@ class Page(brozzler.BaseDictable): self.brozzle_count = brozzle_count self.via_page_id = via_page_id self.hops_off_surt = hops_off_surt + self.outlinks = outlinks self._canon_hurl = None if priority is not None: diff --git a/setup.py b/setup.py index 7283aa2..ca81d44 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev43', + version='1.1.dev44', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',