convert domain specific rule url prefixes to our style of surt

This commit is contained in:
Noah Levitt 2016-07-19 14:31:43 -05:00
parent 7d9f019e67
commit ac3a71742d
3 changed files with 59 additions and 14 deletions

View File

@ -134,6 +134,7 @@ class BrozzlerEasyController:
def _init_pywb(self, args): def _init_pywb(self, args):
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
brozzler.pywb.support_in_progress_warcs() brozzler.pywb.support_in_progress_warcs()
if args.warcs_dir.endswith('/'): if args.warcs_dir.endswith('/'):
@ -156,7 +157,6 @@ class BrozzlerEasyController:
'framed_replay': True, 'framed_replay': True,
'port': args.pywb_port, 'port': args.pywb_port,
'enable_auto_colls': False, 'enable_auto_colls': False,
# 'domain_specific_rules': '/Users/nlevitt/workspace/brozzler-easy/pywb-rethinkdb/rules.yaml'
} }
wsgi_app = pywb.framework.wsgi_wrappers.init_app( wsgi_app = pywb.framework.wsgi_wrappers.init_app(
pywb.webapp.pywb_init.create_wb_router, config=conf, pywb.webapp.pywb_init.create_wb_router, config=conf,
@ -266,4 +266,3 @@ def main():
signal.signal(signal.SIGQUIT, controller.dump_state) signal.signal(signal.SIGQUIT, controller.dump_state)
controller.start() controller.start()
controller.wait_for_shutdown_request() controller.wait_for_shutdown_request()

View File

@ -4,18 +4,17 @@ brozzler/pywb.py - pywb support for rethinkdb index
Copyright (C) 2016 Internet Archive Copyright (C) 2016 Internet Archive
This program is free software: you can redistribute it and/or modify Licensed under the Apache License, Version 2.0 (the "License");
it under the terms of the GNU Affero General Public License as you may not use this file except in compliance with the License.
published by the Free Software Foundation, either version 3 of the You may obtain a copy of the License at
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, http://www.apache.org/licenses/LICENSE-2.0
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License Unless required by applicable law or agreed to in writing, software
along with this program. If not, see <http://www.gnu.org/licenses/>. distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
''' '''
import sys import sys
@ -129,6 +128,54 @@ class TheGoodUrlCanonicalizer(object):
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
TheGoodUrlCanonicalizer,) TheGoodUrlCanonicalizer,)
def good_surts_from_default(default_surt):
    '''
    Expand a default-style surt prefix into the equivalent "good" surts.

    `default_surt` is a standard surt with no scheme and no trailing
    comma. The returned list of scheme-qualified surts is meant to match,
    between them, the same urls the default surt matched. For example:

        good_surts_from_default('com,example)/path')

    returns

        ['http://(com,example,)/path',
         'https://(com,example,)/path',
         'http://(com,example,www,)/path',
         'https://(com,example,www,)/path']

    An empty string is returned unchanged as `['']`. When there is no
    ')' in the input (host only, no path) just two open-ended prefixes
    are produced, 'http://(<host>' and 'https://(<host>'.
    '''
    if default_surt == '':
        return ['']

    # Split on the first ')' only; any later ')' stays in the path.
    host, closing_paren, path = default_surt.partition(')')

    if not closing_paren:
        # no path part: leave the host open-ended
        host_only_templates = (
            'http://(%s',
            'https://(%s',
        )
        return [template % host for template in host_only_templates]

    # Cover both schemes, each with and without the "www" label.
    host_and_path_templates = (
        'http://(%s,)%s',
        'https://(%s,)%s',
        'http://(%s,www,)%s',
        'https://(%s,www,)%s',
    )
    return [template % (host, path) for template in host_and_path_templates]
def monkey_patch_dsrules_init():
    '''
    Monkey-patch pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
    so that each rule's url prefixes are converted to our style of surt.

    The replacement initializer first runs the original `__init__`, then
    replaces `self.url_prefix` with the concatenation of
    `TheGoodUrlCanonicalizer.good_surts_from_default(prefix)` for every
    prefix the original initializer left there.
    '''
    rule_class = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule
    orig_init = rule_class.__init__

    def cdx_dsrule_init(self, url_prefix, rules):
        orig_init(self, url_prefix, rules)
        # self.url_prefix is iterated here, so it is assumed to be a
        # collection of default-style surts after the original __init__.
        self.url_prefix = [
            good_surt
            for default_surt in self.url_prefix
            for good_surt in
            TheGoodUrlCanonicalizer.good_surts_from_default(default_surt)
        ]

    rule_class.__init__ = cdx_dsrule_init
def support_in_progress_warcs(): def support_in_progress_warcs():
''' '''
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
@ -145,4 +192,3 @@ def support_in_progress_warcs():
results.append('%s.open' % warc_path) results.append('%s.open' % warc_path)
return results return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b3.dev53', version='1.1b3.dev54',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',