From ac3a71742defddabe2ecbe7c79b27a94901c0ee2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 19 Jul 2016 14:31:43 -0500 Subject: [PATCH] convert domain specific rule url prefixes to our style of surt --- brozzler/easy.py | 3 +-- brozzler/pywb.py | 68 ++++++++++++++++++++++++++++++++++++++++-------- setup.py | 2 +- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/brozzler/easy.py b/brozzler/easy.py index e8a0d21..197f39b 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -134,6 +134,7 @@ class BrozzlerEasyController: def _init_pywb(self, args): brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() + brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() brozzler.pywb.support_in_progress_warcs() if args.warcs_dir.endswith('/'): @@ -156,7 +157,6 @@ class BrozzlerEasyController: 'framed_replay': True, 'port': args.pywb_port, 'enable_auto_colls': False, - # 'domain_specific_rules': '/Users/nlevitt/workspace/brozzler-easy/pywb-rethinkdb/rules.yaml' } wsgi_app = pywb.framework.wsgi_wrappers.init_app( pywb.webapp.pywb_init.create_wb_router, config=conf, @@ -266,4 +266,3 @@ def main(): signal.signal(signal.SIGQUIT, controller.dump_state) controller.start() controller.wait_for_shutdown_request() - diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 633b959..8f1ece8 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -4,18 +4,17 @@ brozzler/pywb.py - pywb support for rethinkdb index Copyright (C) 2016 Internet Archive -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as -published by the Free Software Foundation, either version 3 of the -License, or (at your option) any later version. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. + http://www.apache.org/licenses/LICENSE-2.0 -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see <http://www.gnu.org/licenses/>. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. ''' import sys @@ -129,6 +128,54 @@ class TheGoodUrlCanonicalizer(object): pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = ( TheGoodUrlCanonicalizer,) + def good_surts_from_default(default_surt): + ''' + Takes a standard surt without scheme and without trailing comma, and + returns a list of "good" surts that together match the same set of + urls. 
For example: + + good_surts_from_default('com,example)/path') + + returns + + ['http://(com,example,)/path', + 'https://(com,example,)/path', + 'http://(com,example,www,)/path', + 'https://(com,example,www,)/path'] + + ''' + if default_surt == '': + return [''] + + parts = default_surt.split(')', 1) + if len(parts) == 2: + orig_host_part, path_part = parts + good_surts = [ + 'http://(%s,)%s' % (orig_host_part, path_part), + 'https://(%s,)%s' % (orig_host_part, path_part), + 'http://(%s,www,)%s' % (orig_host_part, path_part), + 'https://(%s,www,)%s' % (orig_host_part, path_part), + ] + else: # no path part + host_part = parts[0] + good_surts = [ + 'http://(%s' % host_part, + 'https://(%s' % host_part, + ] + return good_surts + + def monkey_patch_dsrules_init(): + orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ + def cdx_dsrule_init(self, url_prefix, rules): + orig_init(self, url_prefix, rules) + good_surts = [] + for url_prefix in self.url_prefix: + good_surts.extend( + TheGoodUrlCanonicalizer.good_surts_from_default( + url_prefix)) + self.url_prefix = good_surts + pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init + def support_in_progress_warcs(): ''' Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still @@ -145,4 +192,3 @@ def support_in_progress_warcs(): results.append('%s.open' % warc_path) return results pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call - diff --git a/setup.py b/setup.py index 9a57376..e7da824 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1b3.dev53', + version='1.1b3.dev54', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',