mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'qa' of github.com:internetarchive/brozzler into qa
This commit is contained in:
commit
0ff6257bef
53
README.rst
53
README.rst
@ -1,10 +1,10 @@
|
||||
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
|
||||
:target: https://travis-ci.org/internetarchive/brozzler
|
||||
|
||||
|
||||
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
|
||||
:width: 7%
|
||||
|
||||
|logo| brozzler
|
||||
|logo| brozzler
|
||||
===============
|
||||
"browser" \| "crawler" = "brozzler"
|
||||
|
||||
@ -97,9 +97,8 @@ Job Configuration
|
||||
-----------------
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional. For details, see
|
||||
`<job-conf.rst>`_.
|
||||
top-level or on individual seeds. At least one seed url must be specified,
|
||||
everything else is optional. For details, see `<job-conf.rst>`_.
|
||||
|
||||
::
|
||||
|
||||
@ -139,6 +138,46 @@ To start the app, run
|
||||
|
||||
See ``brozzler-dashboard --help`` for configuration options.
|
||||
|
||||
Brozzler Wayback
|
||||
----------------
|
||||
|
||||
Brozzler comes with a customized version of
|
||||
`pywb <https://github.com/ikreymer/pywb>`_ which supports using the rethinkdb
|
||||
"captures" table (populated by warcprox) as its index.
|
||||
|
||||
To use, first install dependencies.
|
||||
|
||||
::
|
||||
|
||||
pip install brozzler[easy]
|
||||
|
||||
Write a configuration file pywb.yml.
|
||||
|
||||
::
|
||||
|
||||
# 'archive_paths' should point to the output directory of warcprox
|
||||
archive_paths: warcs/ # pywb will fail without a trailing slash
|
||||
collections:
|
||||
brozzler:
|
||||
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
||||
db: brozzler
|
||||
table: captures
|
||||
servers:
|
||||
- localhost
|
||||
enable_auto_colls: false
|
||||
enable_cdx_api: true
|
||||
framed_replay: true
|
||||
port: 8880
|
||||
|
||||
Run pywb like so:
|
||||
|
||||
::
|
||||
|
||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||
|
||||
Then browse http://localhost:8880/brozzler/.
|
||||
|
||||
|
||||
Headless Chromium
|
||||
-----------------
|
||||
|
||||
@ -198,7 +237,7 @@ option:
|
||||
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
|
||||
|
||||
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
|
||||
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
|
||||
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
|
||||
to load the plugin by adding this option to your wrapper script:
|
||||
|
||||
::
|
||||
@ -208,7 +247,7 @@ to load the plugin by adding this option to your wrapper script:
|
||||
License
|
||||
-------
|
||||
|
||||
Copyright 2015-2016 Internet Archive
|
||||
Copyright 2015-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
not use this software except in compliance with the License. You may
|
||||
|
@ -64,19 +64,33 @@ class BaseDictable:
|
||||
def __repr__(self):
|
||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||
|
||||
def fixup(url):
|
||||
def fixup(url, hash_strip=False):
|
||||
'''
|
||||
Does rudimentary canonicalization, such as converting IDN to punycode.
|
||||
'''
|
||||
import surt
|
||||
hurl = surt.handyurl.parse(url)
|
||||
if hash_strip:
|
||||
hurl.hash = None
|
||||
# handyurl.parse() already lowercases the scheme via urlsplit
|
||||
if hurl.host:
|
||||
hurl.host = hurl.host.encode('idna').decode('ascii').lower()
|
||||
return hurl.getURLString()
|
||||
|
||||
# logging level more fine-grained than logging.DEBUG==10
|
||||
# monkey-patch log level TRACE
|
||||
TRACE = 5
|
||||
import logging as _logging
|
||||
def _logging_trace(msg, *args, **kwargs):
|
||||
if len(_logging.root.handlers) == 0:
|
||||
basicConfig()
|
||||
_logging.root.trace(msg, *args, **kwargs)
|
||||
def _logger_trace(self, msg, *args, **kwargs):
|
||||
if self.isEnabledFor(TRACE):
|
||||
self._log(TRACE, msg, args, **kwargs)
|
||||
_logging.trace = _logging_trace
|
||||
_logging.Logger.trace = _logger_trace
|
||||
_logging._levelToName[TRACE] = 'TRACE'
|
||||
_logging._nameToLevel['TRACE'] = TRACE
|
||||
|
||||
_behaviors = None
|
||||
def behaviors():
|
||||
|
@ -200,8 +200,7 @@ class WebsockReceiverThread(threading.Thread):
|
||||
else:
|
||||
self.logger.info(
|
||||
'reached limit but self.reached_limit is already set, '
|
||||
'assuming the calling thread is already handling this',
|
||||
self.reached_limit)
|
||||
'assuming the calling thread is already handling this')
|
||||
if self.on_response:
|
||||
self.on_response(message)
|
||||
|
||||
|
@ -231,11 +231,11 @@ class Chrome:
|
||||
b'CERT_PKIXVerifyCert for [^ ]* failed|'
|
||||
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
|
||||
b'ERROR:gpu_child_thread.cc', buf):
|
||||
logging.log(
|
||||
brozzler.TRACE, 'chrome pid %s STDOUT %s',
|
||||
self.logger.trace(
|
||||
'chrome pid %s STDOUT %s',
|
||||
self.chrome_process.pid, buf)
|
||||
else:
|
||||
logging.debug(
|
||||
self.logger.debug(
|
||||
'chrome pid %s STDOUT %s',
|
||||
self.chrome_process.pid, buf)
|
||||
|
||||
@ -246,15 +246,15 @@ class Chrome:
|
||||
b'CERT_PKIXVerifyCert for [^ ]* failed|'
|
||||
b'^ALSA lib|ERROR:gl_surface_glx.cc|'
|
||||
b'ERROR:gpu_child_thread.cc', buf):
|
||||
logging.log(
|
||||
brozzler.TRACE, 'chrome pid %s STDOUT %s',
|
||||
self.logger.trace(
|
||||
'chrome pid %s STDOUT %s',
|
||||
self.chrome_process.pid, buf)
|
||||
else:
|
||||
logging.debug(
|
||||
self.logger.debug(
|
||||
'chrome pid %s STDERR %s',
|
||||
self.chrome_process.pid, buf)
|
||||
except:
|
||||
logging.error('unexpected exception', exc_info=True)
|
||||
self.logger.error('unexpected exception', exc_info=True)
|
||||
|
||||
def stop(self):
|
||||
if not self.chrome_process or self._shutdown.is_set():
|
||||
|
@ -91,8 +91,7 @@ def _add_proxy_options(arg_parser):
|
||||
|
||||
def configure_logging(args):
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=args.log_level,
|
||||
format=(
|
||||
stream=sys.stderr, level=args.log_level, format=(
|
||||
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
||||
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARN)
|
||||
@ -285,7 +284,7 @@ def brozzler_new_site():
|
||||
args.behavior_parameters) if args.behavior_parameters else None,
|
||||
username=args.username, password=args.password)
|
||||
|
||||
r = rethinker()
|
||||
r = rethinker(args)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
|
@ -149,6 +149,7 @@ class BrozzlerEasyController:
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
|
||||
brozzler.pywb.support_in_progress_warcs()
|
||||
brozzler.pywb.monkey_patch_wburl()
|
||||
|
||||
if args.warcs_dir.endswith('/'):
|
||||
warcs_dir = args.warcs_dir
|
||||
|
@ -1,7 +1,7 @@
|
||||
'''
|
||||
brozzler/frontier.py - RethinkDbFrontier manages crawl jobs, sites and pages
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -99,6 +99,7 @@ class RethinkDbFrontier:
|
||||
if not job.id:
|
||||
# only if "id" has not already been set
|
||||
job.id = result["generated_keys"][0]
|
||||
return job
|
||||
|
||||
def new_site(self, site):
|
||||
self.logger.info("inserting into 'sites' table %s", site)
|
||||
@ -119,7 +120,7 @@ class RethinkDbFrontier:
|
||||
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
||||
|
||||
def update_page(self, page):
|
||||
self.logger.debug("updating 'pages' table entry %s", page)
|
||||
self.logger.trace("updating 'pages' table entry %s", page)
|
||||
result = self.r.table("pages").get(page.id).replace(page.to_dict()).run()
|
||||
self._vet_result(result, replaced=[0,1], unchanged=[0,1])
|
||||
|
||||
@ -176,9 +177,10 @@ class RethinkDbFrontier:
|
||||
|
||||
def _enforce_time_limit(self, site):
|
||||
if (site.time_limit and site.time_limit > 0
|
||||
and (rethinkstuff.utcnow() - site.start_time).total_seconds() > site.time_limit):
|
||||
self.logger.debug("site FINISHED_TIME_LIMIT! time_limit=%s start_time=%s elapsed=%s %s",
|
||||
site.time_limit, site.start_time, rethinkstuff.utcnow() - site.start_time, site)
|
||||
and site.elapsed() > site.time_limit):
|
||||
self.logger.debug(
|
||||
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
|
||||
site.time_limit, site.elapsed(), site)
|
||||
self.finished(site, "FINISHED_TIME_LIMIT")
|
||||
return True
|
||||
else:
|
||||
@ -276,14 +278,16 @@ class RethinkDbFrontier:
|
||||
n += 1
|
||||
|
||||
self.logger.info("all %s sites finished, job %s is FINISHED!", n, job.id)
|
||||
job.status = "FINISHED"
|
||||
job.finished = rethinkstuff.utcnow()
|
||||
job.finish()
|
||||
self.update_job(job)
|
||||
return True
|
||||
|
||||
def finished(self, site, status):
|
||||
self.logger.info("%s %s", status, site)
|
||||
site.status = status
|
||||
site.claimed = False
|
||||
site.last_disclaimed = rethinkstuff.utcnow()
|
||||
site.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()
|
||||
self.update_site(site)
|
||||
if site.job_id:
|
||||
self._maybe_finish_job(site.job_id)
|
||||
@ -300,6 +304,30 @@ class RethinkDbFrontier:
|
||||
page.claimed = False
|
||||
self.update_page(page)
|
||||
|
||||
def resume_job(self, job):
|
||||
job.status = "ACTIVE"
|
||||
job.starts_and_stops.append(
|
||||
{"start":rethinkstuff.utcnow(), "stop":None})
|
||||
self.update_job(job)
|
||||
for site in self.job_sites(job.id):
|
||||
site.status = "ACTIVE"
|
||||
site.starts_and_stops.append(
|
||||
{"start":rethinkstuff.utcnow(), "stop":None})
|
||||
self.update_site(site)
|
||||
|
||||
def resume_site(self, site):
|
||||
if site.job_id:
|
||||
# can't call resume_job since that would resume jobs's other sites
|
||||
job = self.job(site.job_id)
|
||||
job.status = "ACTIVE"
|
||||
job.starts_and_stops.append(
|
||||
{"start":rethinkstuff.utcnow(), "stop":None})
|
||||
self.update_job(job)
|
||||
site.status = "ACTIVE"
|
||||
site.starts_and_stops.append(
|
||||
{"start":rethinkstuff.utcnow(), "stop":None})
|
||||
self.update_site(site)
|
||||
|
||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||
if site.remember_outlinks:
|
||||
parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
|
||||
|
@ -2,7 +2,7 @@
|
||||
brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||
for setting up a job with supplied configuration
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2014-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -61,17 +61,22 @@ def merge(a, b):
|
||||
return a
|
||||
|
||||
def new_job_file(frontier, job_conf_file):
|
||||
'''Returns new Job.'''
|
||||
logging.info("loading %s", job_conf_file)
|
||||
with open(job_conf_file) as f:
|
||||
job_conf = yaml.load(f)
|
||||
new_job(frontier, job_conf)
|
||||
return new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
'''Returns new Job.'''
|
||||
validate_conf(job_conf)
|
||||
job = Job(
|
||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||
started=rethinkstuff.utcnow())
|
||||
|
||||
# insert the job now to make sure it has an id
|
||||
job = frontier.new_job(job)
|
||||
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
@ -92,11 +97,10 @@ def new_job(frontier, job_conf):
|
||||
password=merged_conf.get("password"))
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
for site in sites:
|
||||
new_site(frontier, site)
|
||||
|
||||
frontier.new_job(job)
|
||||
return job
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
@ -120,14 +124,29 @@ def new_site(frontier, site):
|
||||
class Job(brozzler.BaseDictable):
|
||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||
|
||||
def __init__(self, id=None, conf=None, status="ACTIVE", started=None,
|
||||
finished=None, stop_requested=None):
|
||||
def __init__(
|
||||
self, id=None, conf=None, status="ACTIVE", started=None,
|
||||
finished=None, stop_requested=None, starts_and_stops=None):
|
||||
self.id = id
|
||||
self.conf = conf
|
||||
self.status = status
|
||||
self.started = started
|
||||
self.finished = finished
|
||||
self.stop_requested = stop_requested
|
||||
self.starts_and_stops = starts_and_stops
|
||||
if not self.starts_and_stops:
|
||||
if started: # backward compatibility
|
||||
self.starts_and_stops = [{"start":started,"stop":finished}]
|
||||
else:
|
||||
self.starts_and_stops = [
|
||||
{"start":rethinkstuff.utcnow(),"stop":None}]
|
||||
|
||||
def finish(self):
|
||||
if self.status == "FINISHED" or self.starts_and_stops[-1]["stop"]:
|
||||
self.logger.error(
|
||||
"job is already finished status=%s "
|
||||
"starts_and_stops[-1]['stop']=%s", self.status,
|
||||
self.starts_and_stops[-1]["stop"])
|
||||
self.status = "FINISHED"
|
||||
self.starts_and_stops[-1]["stop"] = rethinkstuff.utcnow()
|
||||
|
||||
def __str__(self):
|
||||
return 'Job(id=%s)' % self.id
|
||||
|
@ -2,7 +2,7 @@ id:
|
||||
type:
|
||||
- string
|
||||
- integer
|
||||
required: true
|
||||
required: false
|
||||
|
||||
<<: &multi_level_options
|
||||
time_limit:
|
||||
|
@ -1,9 +1,9 @@
|
||||
'''
|
||||
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
|
||||
loading from warcs still being written to, and canonicalization rules matching
|
||||
brozzler conventions
|
||||
loading from warcs still being written to, canonicalization rules matching
|
||||
brozzler conventions, support for screenshot: and thumbnail: urls
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
Copyright (C) 2016-2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
@ -26,6 +26,8 @@ try:
|
||||
import pywb.cdx.cdxobject
|
||||
import pywb.cdx.cdxserver
|
||||
import pywb.webapp.query_handler
|
||||
import pywb.framework.basehandlers
|
||||
import pywb.rewrite.wburl
|
||||
except ImportError as e:
|
||||
logging.critical(
|
||||
'%s: %s\n\nYou might need to run "pip install '
|
||||
@ -37,6 +39,7 @@ import rethinkdb
|
||||
import surt
|
||||
import json
|
||||
import brozzler
|
||||
import argparse
|
||||
|
||||
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
def __init__(self, servers, db, table):
|
||||
@ -65,7 +68,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
'url': record['url'],
|
||||
'status': str(record['response_code']),
|
||||
'digest': record['sha1base32'],
|
||||
'length': str(record['record_length']),
|
||||
'length': str(record.get('record_length', '-')),
|
||||
'offset': str(record['offset']),
|
||||
'filename': record['filename'],
|
||||
}
|
||||
@ -120,8 +123,7 @@ class TheGoodUrlCanonicalizer(object):
|
||||
# logging.debug('%s -> %s', url, key)
|
||||
return key
|
||||
except Exception as e:
|
||||
raise pywb.utils.canonicalize.UrlCanonicalizeException(
|
||||
'Invalid Url: ' + url)
|
||||
return url
|
||||
|
||||
def replace_default_canonicalizer():
|
||||
'''Replace parent class of CustomUrlCanonicalizer with this class.'''
|
||||
@ -193,11 +195,90 @@ def support_in_progress_warcs():
|
||||
return results
|
||||
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||
|
||||
class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
|
||||
def __init__(self, orig_url):
|
||||
import re
|
||||
import six
|
||||
|
||||
from six.moves.urllib.parse import urlsplit, urlunsplit
|
||||
from six.moves.urllib.parse import quote_plus, quote, unquote_plus
|
||||
|
||||
from pywb.utils.loaders import to_native_str
|
||||
from pywb.rewrite.wburl import WbUrl
|
||||
|
||||
pywb.rewrite.wburl.BaseWbUrl.__init__(self)
|
||||
|
||||
if six.PY2 and isinstance(orig_url, six.text_type):
|
||||
orig_url = orig_url.encode('utf-8')
|
||||
orig_url = quote(orig_url)
|
||||
|
||||
self._original_url = orig_url
|
||||
|
||||
if not self._init_query(orig_url):
|
||||
if not self._init_replay(orig_url):
|
||||
raise Exception('Invalid WbUrl: ', orig_url)
|
||||
|
||||
new_uri = WbUrl.to_uri(self.url)
|
||||
|
||||
self._do_percent_encode = True
|
||||
|
||||
self.url = new_uri
|
||||
|
||||
# begin brozzler changes
|
||||
if (self.url.startswith('urn:') or self.url.startswith('screenshot:')
|
||||
or self.url.startswith('thumbnail:')):
|
||||
return
|
||||
# end brozzler changes
|
||||
|
||||
# protocol agnostic url -> http://
|
||||
# no protocol -> http://
|
||||
#inx = self.url.find('://')
|
||||
inx = -1
|
||||
m = self.SCHEME_RX.match(self.url)
|
||||
if m:
|
||||
inx = m.span(1)[0]
|
||||
|
||||
#if inx < 0:
|
||||
# check for other partially encoded variants
|
||||
# m = self.PARTIAL_ENC_RX.match(self.url)
|
||||
# if m:
|
||||
# len_ = len(m.group(0))
|
||||
# self.url = (urllib.unquote_plus(self.url[:len_]) +
|
||||
# self.url[len_:])
|
||||
# inx = self.url.find(':/')
|
||||
|
||||
if inx < 0:
|
||||
self.url = self.DEFAULT_SCHEME + self.url
|
||||
else:
|
||||
inx += 2
|
||||
if inx < len(self.url) and self.url[inx] != '/':
|
||||
self.url = self.url[:inx] + '/' + self.url[inx:]
|
||||
|
||||
def _get_wburl_type(self):
|
||||
return SomeWbUrl
|
||||
|
||||
def monkey_patch_wburl():
|
||||
pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type
|
||||
|
||||
class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
|
||||
def _extend_parser(self, arg_parser):
|
||||
super()._extend_parser(arg_parser)
|
||||
arg_parser._actions[4].help = argparse.SUPPRESS # --autoindex
|
||||
arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
|
||||
arg_parser.epilog = '''
|
||||
Run pywb like so:
|
||||
|
||||
$ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
|
||||
|
||||
See README.rst for more information.
|
||||
'''
|
||||
|
||||
def main(argv=sys.argv):
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
|
||||
brozzler.pywb.support_in_progress_warcs()
|
||||
wayback_cli = pywb.apps.cli.WaybackCli(
|
||||
brozzler.pywb.monkey_patch_wburl()
|
||||
wayback_cli = BrozzlerWaybackCli(
|
||||
args=argv[1:], default_port=8880,
|
||||
desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
|
||||
'with brozzler)'))
|
||||
|
@ -99,7 +99,7 @@ class Site(brozzler.BaseDictable):
|
||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||
cookie_db=None, user_agent=None, behavior_parameters=None,
|
||||
username=None, password=None):
|
||||
username=None, password=None, starts_and_stops=None):
|
||||
|
||||
self.seed = seed
|
||||
self.id = id
|
||||
@ -113,7 +113,6 @@ class Site(brozzler.BaseDictable):
|
||||
self.status = status
|
||||
self.claimed = bool(claimed)
|
||||
self.last_claimed_by = last_claimed_by
|
||||
self.start_time = start_time or rethinkstuff.utcnow()
|
||||
self.last_disclaimed = last_disclaimed
|
||||
self.last_claimed = last_claimed
|
||||
self.metadata = metadata
|
||||
@ -123,11 +122,32 @@ class Site(brozzler.BaseDictable):
|
||||
self.behavior_parameters = behavior_parameters
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.starts_and_stops = starts_and_stops
|
||||
if not self.starts_and_stops:
|
||||
if start_time: # backward compatibility
|
||||
self.starts_and_stops = [{"start":start_time,"stop":None}]
|
||||
if self.status != "ACTIVE":
|
||||
self.starts_and_stops[0]["stop"] = self.last_disclaimed
|
||||
else:
|
||||
self.starts_and_stops = [
|
||||
{"start":rethinkstuff.utcnow(),"stop":None}]
|
||||
|
||||
self.scope = scope or {}
|
||||
if not "surt" in self.scope:
|
||||
self.scope["surt"] = Url(seed).surt
|
||||
|
||||
def elapsed(self):
|
||||
'''Returns elapsed crawl time as a float in seconds.'''
|
||||
dt = 0
|
||||
for ss in self.starts_and_stops[:-1]:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
ss = self.starts_and_stops[-1]
|
||||
if ss['stop']:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
else: # crawl is active
|
||||
dt += (rethinkstuff.utcnow() - ss['start']).total_seconds()
|
||||
return dt
|
||||
|
||||
def __str__(self):
|
||||
return "Site-%s-%s" % (self.id, self.seed)
|
||||
|
||||
|
@ -237,17 +237,20 @@ class BrozzlerWorker:
|
||||
if on_screenshot:
|
||||
on_screenshot(screenshot_png)
|
||||
elif self._proxy(site) and self._enable_warcprox_features(site):
|
||||
self.logger.info("sending WARCPROX_WRITE_RECORD request "
|
||||
"to warcprox with screenshot for %s", page)
|
||||
self.logger.info(
|
||||
"sending WARCPROX_WRITE_RECORD request to %s with "
|
||||
"screenshot for %s", self._proxy(site), page)
|
||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||
screenshot_png)
|
||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||
url="screenshot:%s" % brozzler.fixup(page.url),
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy(site),
|
||||
url="screenshot:%s" % brozzler.fixup(page.url, True),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||
url="thumbnail:%s" % brozzler.fixup(page.url),
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy(site),
|
||||
url="thumbnail:%s" % brozzler.fixup(page.url, True),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
@ -380,9 +383,8 @@ class BrozzlerWorker:
|
||||
|
||||
try:
|
||||
self.status_info = self._service_registry.heartbeat(status_info)
|
||||
self.logger.log(
|
||||
brozzler.TRACE, "status in service registry: %s",
|
||||
self.status_info)
|
||||
self.logger.trace(
|
||||
"status in service registry: %s", self.status_info)
|
||||
except rethinkdb.ReqlError as e:
|
||||
self.logger.error(
|
||||
"failed to send heartbeat and update service registry "
|
||||
|
14
job-conf.rst
14
job-conf.rst
@ -2,8 +2,8 @@ brozzler job configuration
|
||||
**************************
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional.
|
||||
top-level or on individual seeds. At least one seed url must be specified,
|
||||
everything else is optional.
|
||||
|
||||
an example
|
||||
==========
|
||||
@ -85,11 +85,11 @@ settings reference
|
||||
|
||||
id
|
||||
--
|
||||
+-----------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+=========+
|
||||
| top-level | string | yes? | *n/a* |
|
||||
+-----------+--------+----------+---------+
|
||||
+-----------+--------+----------+--------------------------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+==========================+
|
||||
| top-level | string | no | *generated by rethinkdb* |
|
||||
+-----------+--------+----------+--------------------------+
|
||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||
brozzler.
|
||||
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev176',
|
||||
version='1.1b9.dev185',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -155,6 +155,20 @@ def test_brozzle_site(httpd):
|
||||
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
|
||||
assert requests.get(wb_url).content == expected_payload
|
||||
|
||||
url = 'screenshot:%s' % page1
|
||||
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
|
||||
response = requests.get(wb_url)
|
||||
assert response.status_code == 200
|
||||
assert response.headers['content-type'] == 'image/jpeg'
|
||||
|
||||
url = 'thumbnail:%s' % page1
|
||||
t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
|
||||
response = requests.get(wb_url)
|
||||
assert response.status_code == 200
|
||||
assert response.headers['content-type'] == 'image/jpeg'
|
||||
|
||||
def test_warcprox_selection(httpd):
|
||||
''' When enable_warcprox_features is true, brozzler is expected to choose
|
||||
and instance of warcprox '''
|
||||
|
174
tests/test_frontier.py
Normal file
174
tests/test_frontier.py
Normal file
@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
test_frontier.py - fairly narrow tests of frontier management, requires
|
||||
rethinkdb running on localhost
|
||||
|
||||
Copyright (C) 2017 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import brozzler
|
||||
import logging
|
||||
import argparse
|
||||
import rethinkstuff
|
||||
import time
|
||||
|
||||
args = argparse.Namespace()
|
||||
args.log_level = logging.INFO
|
||||
brozzler.cli.configure_logging(args)
|
||||
|
||||
def test_rethinkdb_up():
|
||||
'''Checks that rethinkdb is listening and looks sane.'''
|
||||
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
|
||||
tbls = r.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
def test_resume_job():
|
||||
'''
|
||||
Tests that the right stuff gets twiddled in rethinkdb when we "start" and
|
||||
"finish" crawling a job. Doesn't actually crawl anything.
|
||||
'''
|
||||
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
|
||||
r = rethinkstuff.Rethinker(db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
job_conf = {'seeds': [{'url': 'http://example.com/'}]}
|
||||
job = brozzler.new_job(frontier, job_conf)
|
||||
assert len(list(frontier.job_sites(job.id))) == 1
|
||||
site = list(frontier.job_sites(job.id))[0]
|
||||
|
||||
assert job.status == 'ACTIVE'
|
||||
assert len(job.starts_and_stops) == 1
|
||||
assert job.starts_and_stops[0]['start']
|
||||
assert job.starts_and_stops[0]['stop'] is None
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
|
||||
frontier.finished(site, 'FINISHED')
|
||||
job = frontier.job(job.id)
|
||||
|
||||
assert job.status == 'FINISHED'
|
||||
assert len(job.starts_and_stops) == 1
|
||||
assert job.starts_and_stops[0]['start']
|
||||
assert job.starts_and_stops[0]['stop']
|
||||
assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
|
||||
assert site.status == 'FINISHED'
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop']
|
||||
assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
|
||||
|
||||
frontier.resume_site(site)
|
||||
job = frontier.job(job.id)
|
||||
|
||||
assert job.status == 'ACTIVE'
|
||||
assert len(job.starts_and_stops) == 2
|
||||
assert job.starts_and_stops[1]['start']
|
||||
assert job.starts_and_stops[1]['stop'] is None
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop'] is None
|
||||
|
||||
frontier.finished(site, 'FINISHED')
|
||||
job = frontier.job(job.id)
|
||||
|
||||
assert job.status == 'FINISHED'
|
||||
assert len(job.starts_and_stops) == 2
|
||||
assert job.starts_and_stops[1]['start']
|
||||
assert job.starts_and_stops[1]['stop']
|
||||
assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[0]['start']
|
||||
assert site.status == 'FINISHED'
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop']
|
||||
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
|
||||
|
||||
# resuming a job == resuming all of its sites
|
||||
frontier.resume_job(job)
|
||||
site = list(frontier.job_sites(job.id))[0]
|
||||
|
||||
assert job.status == 'ACTIVE'
|
||||
assert len(job.starts_and_stops) == 3
|
||||
assert job.starts_and_stops[2]['start']
|
||||
assert job.starts_and_stops[2]['stop'] is None
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 3
|
||||
assert site.starts_and_stops[2]['start']
|
||||
assert site.starts_and_stops[2]['stop'] is None
|
||||
|
||||
frontier.finished(site, 'FINISHED')
|
||||
job = frontier.job(job.id)
|
||||
|
||||
assert job.status == 'FINISHED'
|
||||
assert len(job.starts_and_stops) == 3
|
||||
assert job.starts_and_stops[2]['start']
|
||||
assert job.starts_and_stops[2]['stop']
|
||||
assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[0]['start']
|
||||
assert site.status == 'FINISHED'
|
||||
assert len(site.starts_and_stops) == 3
|
||||
assert site.starts_and_stops[2]['start']
|
||||
assert site.starts_and_stops[2]['stop']
|
||||
assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']
|
||||
|
||||
def test_time_limit():
|
||||
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
|
||||
r = rethinkstuff.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
site = brozzler.Site(seed='http://example.com/', time_limit=99999)
|
||||
brozzler.new_site(frontier, site)
|
||||
|
||||
site = frontier.site(site.id) # get it back from the db
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop'] is None
|
||||
|
||||
frontier.finished(site, 'FINISHED')
|
||||
|
||||
assert site.status == 'FINISHED'
|
||||
assert len(site.starts_and_stops) == 1
|
||||
assert site.starts_and_stops[0]['start']
|
||||
assert site.starts_and_stops[0]['stop']
|
||||
assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']
|
||||
|
||||
frontier.resume_site(site)
|
||||
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop'] is None
|
||||
|
||||
# time limit not reached yet
|
||||
frontier._enforce_time_limit(site)
|
||||
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop'] is None
|
||||
|
||||
site.time_limit = 0.1
|
||||
site.claimed = True
|
||||
frontier.update_site(site)
|
||||
|
||||
time.sleep(0.1)
|
||||
frontier._enforce_time_limit(site)
|
||||
|
||||
assert site.status == 'FINISHED_TIME_LIMIT'
|
||||
assert not site.claimed
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop']
|
||||
assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
|
Loading…
x
Reference in New Issue
Block a user