Merge branch 'master' into qa

* master: more pywb monkey-patching to get at least some youtube videos captured by brozzler to play back
2025-07-14 18:49:32 -04:00 · 2017-02-23 10:43:15 -08:00 · 2017-02-23 10:43:15 -08:00 · b496bce320
commit b496bce320
parent cb75bb6e04 7417310d57
3 changed files with 146 additions and 5 deletions
--- a/brozzler/easy.py
+++ b/brozzler/easy.py
@ -150,6 +150,8 @@ class BrozzlerEasyController:
        brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
        brozzler.pywb.support_in_progress_warcs()
        brozzler.pywb.monkey_patch_wburl()
+        brozzler.pywb.monkey_patch_fuzzy_query()
+        brozzler.pywb.monkey_patch_calc_search_engine()

        if args.warcs_dir.endswith('/'):
            warcs_dir = args.warcs_dir
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@ -169,13 +169,16 @@ class TheGoodUrlCanonicalizer(object):
    def monkey_patch_dsrules_init():
        orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__
        def cdx_dsrule_init(self, url_prefix, rules):
-            orig_init(self, url_prefix, rules)
            good_surts = []
-            for url_prefix in self.url_prefix:
+            url_prefixes = [url_prefix] if isinstance(
+                    url_prefix, str) else url_prefix
+            for bad_surt in url_prefixes:
                good_surts.extend(
                        TheGoodUrlCanonicalizer.good_surts_from_default(
-                                url_prefix))
-            self.url_prefix = good_surts
+                                bad_surt))
+            if 'match' in rules and 'regex' in rules['match']:
+                rules['match']['regex'] = r'https?://\(' + rules['match']['regex']
+            orig_init(self, good_surts, rules)
        pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init

 def support_in_progress_warcs():
@ -273,11 +276,147 @@ Run pywb like so:
 See README.rst for more information.
 '''

+# copied and pasted from cdxdomainspecific.py, only changes are commented as
+# such below
+def _fuzzy_query_call(self, query):
+    # imports added here for brozzler
+    from pywb.utils.loaders import to_native_str
+    from six.moves.urllib.parse import urlsplit, urlunsplit
+
+    matched_rule = None
+
+    urlkey = to_native_str(query.key, 'utf-8')
+    url = query.url
+    filter_ = query.filters
+    output = query.output
+
+    for rule in self.rules.iter_matching(urlkey):
+        m = rule.regex.search(urlkey)
+        if not m:
+            continue
+
+        matched_rule = rule
+
+        groups = m.groups()
+        for g in groups:
+            for f in matched_rule.filter:
+                filter_.append(f.format(g))
+
+        break
+
+    if not matched_rule:
+        return None
+
+    repl = '?'
+    if matched_rule.replace:
+        repl = matched_rule.replace
+
+    inx = url.find(repl)
+    if inx > 0:
+        url = url[:inx + len(repl)]
+
+    # begin brozzler changes
+    if matched_rule.match_type == 'domain':
+        orig_split_url = urlsplit(url)
+        # remove the subdomain, path, query and fragment
+        host = orig_split_url.netloc.split('.', 1)[1]
+        new_split_url = (orig_split_url.scheme, host, '', '', '')
+        url = urlunsplit(new_split_url)
+    # end brozzler changes
+
+    params = query.params
+    params.update({'url': url,
+                   'matchType': matched_rule.match_type,
+                   'filter': filter_})
+
+    if 'reverse' in params:
+        del params['reverse']
+
+    if 'closest' in params:
+        del params['closest']
+
+    if 'end_key' in params:
+        del params['end_key']
+
+    return params
+
+def monkey_patch_fuzzy_query():
+    pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call
+
+# copied and pasted from pywb/utils/canonicalize.py, only changes are commented
+# as such
+def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
+    # imports added here for brozzler
+    from pywb.utils.canonicalize import UrlCanonicalizer, UrlCanonicalizeException
+    import six.moves.urllib.parse as urlparse
+
+    def inc_last_char(x):
+        return x[0:-1] + chr(ord(x[-1]) + 1)
+
+    if not url_canon:
+        # make new canon
+        url_canon = UrlCanonicalizer(surt_ordered)
+    else:
+        # ensure surt order matches url_canon
+        surt_ordered = url_canon.surt_ordered
+
+    start_key = url_canon(url)
+
+    if match_type == 'exact':
+        end_key = start_key + '!'
+
+    elif match_type == 'prefix':
+        # add trailing slash if url has it
+        if url.endswith('/') and not start_key.endswith('/'):
+            start_key += '/'
+
+        end_key = inc_last_char(start_key)
+
+    elif match_type == 'host':
+        if surt_ordered:
+            host = start_key.split(')/')[0]
+
+            start_key = host + ')/'
+            end_key = host + '*'
+        else:
+            host = urlparse.urlsplit(url).netloc
+
+            start_key = host + '/'
+            end_key = host + '0'
+
+    elif match_type == 'domain':
+        if not surt_ordered:
+            msg = 'matchType=domain unsupported for non-surt'
+            raise UrlCanonicalizeException(msg)
+
+        host = start_key.split(')/')[0]
+
+        # if tld, use com, as start_key
+        # otherwise, stick with com,example)/
+        if ',' not in host:
+            start_key = host + ','
+        else:
+            start_key = host + ')/'
+
+        # begin brozzler changes
+        end_key = host + '~'
+        # end brozzler changes
+    else:
+        raise UrlCanonicalizeException('Invalid match_type: ' + match_type)
+
+    return (start_key, end_key)
+
+def monkey_patch_calc_search_engine():
+    pywb.utils.canonicalize.calc_search_range = _calc_search_range
+    pywb.cdx.query.calc_search_range = _calc_search_range
+
 def main(argv=sys.argv):
    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()
    brozzler.pywb.monkey_patch_wburl()
+    brozzler.pywb.monkey_patch_fuzzy_query()
+    brozzler.pywb.monkey_patch_calc_search_engine()
    wayback_cli = BrozzlerWaybackCli(
            args=argv[1:], default_port=8880,
            desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b9.dev193',
+        version='1.1b9.dev194',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',