mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-23 13:15:29 -04:00
Add support for supplying a JSON blob that defines a site and its configuration to brozzler-add-site
This commit is contained in:
parent
6b2ee9faee
commit
e04247c3f7
2 changed files with 38 additions and 12 deletions
|
@ -7,11 +7,16 @@ import sys
|
||||||
import logging
|
import logging
|
||||||
import brozzler
|
import brozzler
|
||||||
import kombu
|
import kombu
|
||||||
|
import json
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
description="brozzler-add-site - register site to crawl with brozzler-hq",
|
description="brozzler-add-site - register site to crawl with brozzler-hq",
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('urls', metavar='URL', nargs='+', help='seed URL(s) of sites to crawl')
|
# epilog="""Examples
|
||||||
|
# brozzler-add-site http://example.com/
|
||||||
|
# brozzler-add-site '{"seed":"http://example.com/","proxy":"localhost:8000","enable_warcprox_features":true,"ignore_robots":true,"time_limit":600}'
|
||||||
|
# """
|
||||||
|
arg_parser.add_argument('sites', metavar='SITE', nargs='+', help='sites to crawl, either a url or a json object')
|
||||||
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:guest@localhost:5672/%2f',
|
||||||
help='URL identifying the amqp server to talk to')
|
help='URL identifying the amqp server to talk to')
|
||||||
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
arg_parser.add_argument("-v", "--verbose", dest="log_level",
|
||||||
|
@ -25,7 +30,15 @@ logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
||||||
|
|
||||||
with kombu.Connection(args.amqp_url) as conn:
|
with kombu.Connection(args.amqp_url) as conn:
|
||||||
q = conn.SimpleQueue("brozzler.sites.new")
|
q = conn.SimpleQueue("brozzler.sites.new")
|
||||||
for url in args.urls:
|
for s in args.sites:
|
||||||
q.put({"seed":url})
|
site = None
|
||||||
|
try:
|
||||||
|
site = brozzler.Site(**json.loads(s))
|
||||||
|
except ValueError:
|
||||||
|
site = brozzler.Site(s)
|
||||||
|
# XXX check url syntax?
|
||||||
|
d = site.to_dict()
|
||||||
|
logging.info("""feeding amqp queue "{}" with {}""".format(q.queue.name, d))
|
||||||
|
q.put(d)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -14,16 +14,18 @@ def robots_url(url):
|
||||||
return hurl.geturl()
|
return hurl.geturl()
|
||||||
|
|
||||||
class RobotFileParser(urllib.robotparser.RobotFileParser):
|
class RobotFileParser(urllib.robotparser.RobotFileParser):
|
||||||
|
"""Adds support for fetching robots.txt through a proxy to
|
||||||
|
urllib.robotparser.RobotFileParser."""
|
||||||
|
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
"""Adds support for fetching robots.txt through a proxy to
|
|
||||||
urllib.robotparser.RobotFileParser."""
|
|
||||||
def __init__(self, url="", proxy=None):
|
def __init__(self, url="", proxy=None):
|
||||||
super(RobotFileParser, self).__init__(url)
|
super(RobotFileParser, self).__init__(url)
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
|
|
||||||
def read(self):
|
def read(self):
|
||||||
"""Reads the robots.txt URL and feeds it to the parser."""
|
"""Reads the robots.txt URL, perhaps through the configured proxy, and
|
||||||
|
feeds it to the parser."""
|
||||||
try:
|
try:
|
||||||
request = urllib.request.Request(self.url)
|
request = urllib.request.Request(self.url)
|
||||||
if self.proxy:
|
if self.proxy:
|
||||||
|
@ -46,11 +48,14 @@ class RobotFileParser(urllib.robotparser.RobotFileParser):
|
||||||
class Site:
|
class Site:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
|
|
||||||
def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
|
def __init__(self, seed, id=None, scope_surt=None, proxy=None,
|
||||||
|
ignore_robots=False, enable_warcprox_features=False, time_limit=None):
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.ignore_robots = ignore_robots
|
self.ignore_robots = ignore_robots
|
||||||
|
self.enable_warcprox_features = enable_warcprox_features
|
||||||
|
self.time_limit = time_limit
|
||||||
|
|
||||||
if scope_surt:
|
if scope_surt:
|
||||||
self.scope_surt = scope_surt
|
self.scope_surt = scope_surt
|
||||||
|
@ -71,7 +76,11 @@ class Site:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
|
d = dict(vars(self))
|
||||||
|
for k in vars(self):
|
||||||
|
if k.startswith("_"):
|
||||||
|
del d[k]
|
||||||
|
return d
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
return json.dumps(self.to_dict(), separators=(',', ':'))
|
return json.dumps(self.to_dict(), separators=(',', ':'))
|
||||||
|
@ -111,14 +120,18 @@ class CrawlUrl:
|
||||||
return self._canon_hurl.geturl()
|
return self._canon_hurl.geturl()
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
|
d = dict(vars(self))
|
||||||
|
|
||||||
|
for k in vars(self):
|
||||||
|
if k.startswith("_"):
|
||||||
|
del d[k]
|
||||||
|
|
||||||
if self.outlinks is not None and not isinstance(self.outlinks, list):
|
if self.outlinks is not None and not isinstance(self.outlinks, list):
|
||||||
outlinks = []
|
outlinks = []
|
||||||
outlinks.extend(self.outlinks)
|
outlinks.extend(self.outlinks)
|
||||||
else:
|
d["outlinks"] = outlinks
|
||||||
outlinks = self.outlinks
|
|
||||||
|
|
||||||
return dict(id=self.id, site_id=self.site_id, url=self.url,
|
return d
|
||||||
hops_from_seed=self.hops_from_seed, outlinks=outlinks)
|
|
||||||
|
|
||||||
def to_json(self):
|
def to_json(self):
|
||||||
return json.dumps(self.to_dict(), separators=(',', ':'))
|
return json.dumps(self.to_dict(), separators=(',', ':'))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue