mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Add an option to save the list of outlinks for each page in RethinkDB, categorized as "accepted", "blocked" (by robots), or "rejected" (to be used by Archive-It for out-of-scope reporting).
This commit is contained in:
parent
01e38ea8c7
commit
3e128d2b27
@ -163,7 +163,6 @@ def brozzler_new_job():
|
|||||||
frontier = brozzler.RethinkDbFrontier(r)
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||||
|
|
||||||
|
|
||||||
def brozzler_new_site():
|
def brozzler_new_site():
|
||||||
'''
|
'''
|
||||||
Command line utility entry point for queuing a new brozzler site.
|
Command line utility entry point for queuing a new brozzler site.
|
||||||
|
@ -292,6 +292,8 @@ class RethinkDbFrontier:
|
|||||||
self.update_page(page)
|
self.update_page(page)
|
||||||
|
|
||||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||||
|
if site.remember_outlinks:
|
||||||
|
parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
for url in outlinks or []:
|
for url in outlinks or []:
|
||||||
u = brozzler.site.Url(url)
|
u = brozzler.site.Url(url)
|
||||||
@ -314,10 +316,19 @@ class RethinkDbFrontier:
|
|||||||
else:
|
else:
|
||||||
self.new_page(new_child_page)
|
self.new_page(new_child_page)
|
||||||
counts["added"] += 1
|
counts["added"] += 1
|
||||||
|
if site.remember_outlinks:
|
||||||
|
parent_page.outlinks["accepted"].append(url)
|
||||||
else:
|
else:
|
||||||
counts["blocked"] += 1
|
counts["blocked"] += 1
|
||||||
|
if site.remember_outlinks:
|
||||||
|
parent_page.outlinks["blocked"].append(url)
|
||||||
else:
|
else:
|
||||||
counts["rejected"] += 1
|
counts["rejected"] += 1
|
||||||
|
if site.remember_outlinks:
|
||||||
|
parent_page.outlinks["rejected"].append(url)
|
||||||
|
|
||||||
|
if site.remember_outlinks:
|
||||||
|
self.update_page(parent_page)
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"%s new links added, %s existing links updated, %s links "
|
"%s new links added, %s existing links updated, %s links "
|
||||||
|
@ -54,8 +54,8 @@ def new_job(frontier, job_conf):
|
|||||||
merged_conf = merge(seed_conf, job_conf)
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
# XXX check for unknown settings, invalid url, etc
|
# XXX check for unknown settings, invalid url, etc
|
||||||
|
|
||||||
site = brozzler.Site(job_id=job.id,
|
site = brozzler.Site(
|
||||||
seed=merged_conf["url"],
|
job_id=job.id, seed=merged_conf["url"],
|
||||||
scope=merged_conf.get("scope"),
|
scope=merged_conf.get("scope"),
|
||||||
time_limit=merged_conf.get("time_limit"),
|
time_limit=merged_conf.get("time_limit"),
|
||||||
proxy=merged_conf.get("proxy"),
|
proxy=merged_conf.get("proxy"),
|
||||||
@ -63,7 +63,8 @@ def new_job(frontier, job_conf):
|
|||||||
enable_warcprox_features=merged_conf.get(
|
enable_warcprox_features=merged_conf.get(
|
||||||
"enable_warcprox_features"),
|
"enable_warcprox_features"),
|
||||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||||
metadata=merged_conf.get("metadata"))
|
metadata=merged_conf.get("metadata"),
|
||||||
|
remember_outlinks=merged_conf.get("remember_outlinks"))
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
# insert all the sites into database before the job
|
# insert all the sites into database before the job
|
||||||
|
@ -91,7 +91,7 @@ class Site(brozzler.BaseDictable):
|
|||||||
enable_warcprox_features=False, reached_limit=None,
|
enable_warcprox_features=False, reached_limit=None,
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}):
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
@ -109,6 +109,7 @@ class Site(brozzler.BaseDictable):
|
|||||||
self.last_disclaimed = last_disclaimed
|
self.last_disclaimed = last_disclaimed
|
||||||
self.last_claimed = last_claimed
|
self.last_claimed = last_claimed
|
||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
|
self.remember_outlinks = remember_outlinks
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
@ -218,7 +219,8 @@ class Page(brozzler.BaseDictable):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
|
||||||
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
|
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
|
||||||
via_page_id=None, last_claimed_by=None, hops_off_surt=0):
|
via_page_id=None, last_claimed_by=None, hops_off_surt=0,
|
||||||
|
outlinks=None):
|
||||||
self.site_id = site_id
|
self.site_id = site_id
|
||||||
self.job_id = job_id
|
self.job_id = job_id
|
||||||
self.url = url
|
self.url = url
|
||||||
@ -229,6 +231,7 @@ class Page(brozzler.BaseDictable):
|
|||||||
self.brozzle_count = brozzle_count
|
self.brozzle_count = brozzle_count
|
||||||
self.via_page_id = via_page_id
|
self.via_page_id = via_page_id
|
||||||
self.hops_off_surt = hops_off_surt
|
self.hops_off_surt = hops_off_surt
|
||||||
|
self.outlinks = outlinks
|
||||||
self._canon_hurl = None
|
self._canon_hurl = None
|
||||||
|
|
||||||
if priority is not None:
|
if priority is not None:
|
||||||
|
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import setuptools
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev43',
|
version='1.1.dev44',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user