mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
convert domain specific rule url prefixes to our style of surt
This commit is contained in:
parent
7d9f019e67
commit
ac3a71742d
@ -134,6 +134,7 @@ class BrozzlerEasyController:
|
|||||||
|
|
||||||
def _init_pywb(self, args):
|
def _init_pywb(self, args):
|
||||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||||
|
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
|
||||||
brozzler.pywb.support_in_progress_warcs()
|
brozzler.pywb.support_in_progress_warcs()
|
||||||
|
|
||||||
if args.warcs_dir.endswith('/'):
|
if args.warcs_dir.endswith('/'):
|
||||||
@ -156,7 +157,6 @@ class BrozzlerEasyController:
|
|||||||
'framed_replay': True,
|
'framed_replay': True,
|
||||||
'port': args.pywb_port,
|
'port': args.pywb_port,
|
||||||
'enable_auto_colls': False,
|
'enable_auto_colls': False,
|
||||||
# 'domain_specific_rules': '/Users/nlevitt/workspace/brozzler-easy/pywb-rethinkdb/rules.yaml'
|
|
||||||
}
|
}
|
||||||
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
|
wsgi_app = pywb.framework.wsgi_wrappers.init_app(
|
||||||
pywb.webapp.pywb_init.create_wb_router, config=conf,
|
pywb.webapp.pywb_init.create_wb_router, config=conf,
|
||||||
@ -266,4 +266,3 @@ def main():
|
|||||||
signal.signal(signal.SIGQUIT, controller.dump_state)
|
signal.signal(signal.SIGQUIT, controller.dump_state)
|
||||||
controller.start()
|
controller.start()
|
||||||
controller.wait_for_shutdown_request()
|
controller.wait_for_shutdown_request()
|
||||||
|
|
||||||
|
@ -4,18 +4,17 @@ brozzler/pywb.py - pywb support for rethinkdb index
|
|||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016 Internet Archive
|
||||||
|
|
||||||
This program is free software: you can redistribute it and/or modify
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
it under the terms of the GNU Affero General Public License as
|
you may not use this file except in compliance with the License.
|
||||||
published by the Free Software Foundation, either version 3 of the
|
You may obtain a copy of the License at
|
||||||
License, or (at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU Affero General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Affero General Public License
|
Unless required by applicable law or agreed to in writing, software
|
||||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
@ -129,6 +128,54 @@ class TheGoodUrlCanonicalizer(object):
|
|||||||
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
|
||||||
TheGoodUrlCanonicalizer,)
|
TheGoodUrlCanonicalizer,)
|
||||||
|
|
||||||
|
def good_surts_from_default(default_surt):
|
||||||
|
'''
|
||||||
|
Takes a standard surt without scheme and without trailing comma, and
|
||||||
|
returns a list of "good" surts that together match the same set of
|
||||||
|
urls. For example:
|
||||||
|
|
||||||
|
good_surts_from_default('com,example)/path')
|
||||||
|
|
||||||
|
returns
|
||||||
|
|
||||||
|
['http://(com,example,)/path',
|
||||||
|
'https://(com,example,)/path',
|
||||||
|
'http://(com,example,www,)/path',
|
||||||
|
'https://(com,example,www,)/path']
|
||||||
|
|
||||||
|
'''
|
||||||
|
if default_surt == '':
|
||||||
|
return ['']
|
||||||
|
|
||||||
|
parts = default_surt.split(')', 1)
|
||||||
|
if len(parts) == 2:
|
||||||
|
orig_host_part, path_part = parts
|
||||||
|
good_surts = [
|
||||||
|
'http://(%s,)%s' % (orig_host_part, path_part),
|
||||||
|
'https://(%s,)%s' % (orig_host_part, path_part),
|
||||||
|
'http://(%s,www,)%s' % (orig_host_part, path_part),
|
||||||
|
'https://(%s,www,)%s' % (orig_host_part, path_part),
|
||||||
|
]
|
||||||
|
else: # no path part
|
||||||
|
host_part = parts[0]
|
||||||
|
good_surts = [
|
||||||
|
'http://(%s' % host_part,
|
||||||
|
'https://(%s' % host_part,
|
||||||
|
]
|
||||||
|
return good_surts
|
||||||
|
|
||||||
|
def monkey_patch_dsrules_init():
|
||||||
|
orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
|
||||||
|
def cdx_dsrule_init(self, url_prefix, rules):
|
||||||
|
orig_init(self, url_prefix, rules)
|
||||||
|
good_surts = []
|
||||||
|
for url_prefix in self.url_prefix:
|
||||||
|
good_surts.extend(
|
||||||
|
TheGoodUrlCanonicalizer.good_surts_from_default(
|
||||||
|
url_prefix))
|
||||||
|
self.url_prefix = good_surts
|
||||||
|
pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init
|
||||||
|
|
||||||
def support_in_progress_warcs():
|
def support_in_progress_warcs():
|
||||||
'''
|
'''
|
||||||
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
|
Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
|
||||||
@ -145,4 +192,3 @@ def support_in_progress_warcs():
|
|||||||
results.append('%s.open' % warc_path)
|
results.append('%s.open' % warc_path)
|
||||||
return results
|
return results
|
||||||
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||||
|
|
||||||
|
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import setuptools
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b3.dev53',
|
version='1.1b3.dev54',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user