"""
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, canonicalization rules matching
brozzler conventions, support for screenshot: and thumbnail: urls

Copyright (C) 2016-2017 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import sys
import logging

try:
    import pywb.apps.cli
    import pywb.cdx.cdxdomainspecific
    import pywb.cdx.cdxobject
    import pywb.cdx.cdxserver
    import pywb.webapp.query_handler
    import pywb.framework.basehandlers
    import pywb.rewrite.wburl
except ImportError as e:
    logging.critical(
        '%s: %s\n\nYou might need to run "pip install '
        'brozzler[easy]".\nSee README.rst for more information.',
        type(e).__name__,
        e,
    )
    sys.exit(1)
import doublethink
import rethinkdb as rdb
import urlcanon
import json
import brozzler
import argparse

r = rdb.RethinkDB()


class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
    """CDX source that reads capture records from a rethinkdb table."""

    def __init__(self, servers, db, table):
        """
        Remember the connection settings; no connection is made here.

        Args:
            servers: passed as-is to `doublethink.Rethinker`
            db: passed as-is to `doublethink.Rethinker`
            table: name of the table queried by `load_cdx`
        """
        self.servers = servers
        self.db = db
        self.table = table

    @property
    def rr(self):
        """The `doublethink.Rethinker`, created on first access and cached."""
        try:
            return self._rr
        except AttributeError:
            self._rr = doublethink.Rethinker(self.servers, self.db)
            return self._rr

    def load_cdx(self, cdx_query):
        """
        Run `cdx_query` against rethinkdb.

        Returns:
            generator of utf-8 encoded cdx lines (bytes), one per record
        """
        # logging.debug('vars(cdx_query)=%s', vars(cdx_query))
        rethink_results = self._query_rethinkdb(cdx_query)
        return self._gen_cdx_lines(rethink_results)

    def _gen_cdx_lines(self, rethink_results):
        """
        Convert rethinkdb capture records to cdx lines.

        Each yielded line is `<canon_surt> <timestamp> <json blob>` encoded as
        utf-8; the timestamp is formatted as %Y%m%d%H%M%S.
        """
        for record in rethink_results:
            # XXX inefficient, it gets parsed later, figure out how to
            # short-circuit this step and create the CDXObject directly
            blob = {
                "url": record["url"],
                "status": str(record["response_code"]),
                "digest": record["sha1base32"],
                # "-" stands in for a missing record_length
                "length": str(record.get("record_length", "-")),
                "offset": str(record["offset"]),
                "filename": record["filename"],
            }
            if record["warc_type"] != "revisit":
                # empty/None content_type is reported as "-"
                blob["mime"] = record["content_type"] or "-"
            else:
                blob["mime"] = "warc/revisit"
            # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
            cdx_line = "{} {:%Y%m%d%H%M%S} {}".format(
                record["canon_surt"], record["timestamp"], json.dumps(blob)
            )
            yield cdx_line.encode("utf-8")

    def _query_rethinkdb(self, cdx_query):
        """
        Build and run the reql query for `cdx_query`.

        Reads `cdx_query.key` and `cdx_query.end_key` (bytes, decoded as
        utf-8) and `cdx_query.limit`.

        Returns:
            whatever `reql.run()` returns for the constructed query
        """
        start_key = cdx_query.key.decode("utf-8")
        end_key = cdx_query.end_key.decode("utf-8")
        # The index is queried with keys cut to 150 characters (presumably
        # matching how the index abbreviates canon_surt -- confirm against the
        # table definition); exact bounds are enforced by the filter below.
        reql = self.rr.table(self.table).between(
            [start_key[:150], r.minval],
            [end_key[:150], r.maxval],
            index="abbr_canon_surt_timestamp",
            right_bound="closed",
        )
        reql = reql.order_by(index="abbr_canon_surt_timestamp")
        # TODO support for POST, etc
        # http_method='WARCPROX_WRITE_RECORD' for screenshots, thumbnails
        reql = reql.filter(
            lambda capture: r.expr(["WARCPROX_WRITE_RECORD", "GET"]).contains(
                capture["http_method"]
            )
        )
        # re-check against the full (untruncated) keys: start inclusive,
        # end exclusive
        reql = reql.filter(
            lambda capture: (capture["canon_surt"] >= start_key)
            & (capture["canon_surt"] < end_key)
        )
        if cdx_query.limit:
            reql = reql.limit(cdx_query.limit)
        logging.debug("rethinkdb query: %s", reql)
        results = reql.run()
        return results


class TheGoodUrlCanonicalizer(object):
    """
    Replacement for pywb.utils.canonicalize.UrlCanonicalizer that produces
    surts with scheme and with trailing comma, and does not "massage"
    www.foo.org into foo.org.
    """

    def __init__(self, surt_ordered=True):
        """We are always surt ordered (surt_ordered param is ignored)"""
        self.surt_ordered = True

    def __call__(self, url):
        """
        Return the `urlcanon.semantic` surt of `url` as a str.

        If canonicalization raises for any reason, `url` is returned
        unchanged.
        """
        try:
            key = urlcanon.semantic(url).surt().decode("ascii")
            # logging.debug('%s -> %s', url, key)
            return key
        except Exception:
            # deliberate best effort: fall back to the url as given
            return url

    @staticmethod
    def replace_default_canonicalizer():
        """Replace parent class of CustomUrlCanonicalizer with this class."""
        pywb.cdx.cdxdomainspecific.CustomUrlCanonicalizer.__bases__ = (
            TheGoodUrlCanonicalizer,
        )

    @staticmethod
    def good_surts_from_default(default_surt):
        """
        Takes a standard surt without scheme and without trailing comma, and
        returns a list of "good" surts that together match the same set of
        urls. For example:

            good_surts_from_default('com,example)/path')

        returns

            ['http://(com,example,)/path',
             'https://(com,example,)/path',
             'http://(com,example,www,)/path',
             'https://(com,example,www,)/path']

        An empty `default_surt` yields `[""]`. A surt with no ")" (no path
        part) yields only the http and https prefixed forms, left open-ended.
        """
        if default_surt == "":
            return [""]

        parts = default_surt.split(")", 1)
        if len(parts) == 2:
            orig_host_part, path_part = parts
            good_surts = [
                "http://(%s,)%s" % (orig_host_part, path_part),
                "https://(%s,)%s" % (orig_host_part, path_part),
                "http://(%s,www,)%s" % (orig_host_part, path_part),
                "https://(%s,www,)%s" % (orig_host_part, path_part),
            ]
        else:  # no path part
            host_part = parts[0]
            good_surts = [
                "http://(%s" % host_part,
                "https://(%s" % host_part,
            ]
        return good_surts

    @staticmethod
    def monkey_patch_dsrules_init():
        """
        Wrap `CDXDomainSpecificRule.__init__` so that rule url prefixes are
        expanded with `good_surts_from_default` and a dict-style match regex
        is prefixed with a scheme pattern before the original `__init__` runs.
        """
        orig_init = pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__

        def cdx_dsrule_init(self, url_prefix, rules):
            good_surts = []
            # url_prefix may be a single str or an iterable of them
            url_prefixes = [url_prefix] if isinstance(url_prefix, str) else url_prefix
            for bad_surt in url_prefixes:
                good_surts.extend(
                    TheGoodUrlCanonicalizer.good_surts_from_default(bad_surt)
                )
            # Only rewrite the regex when both containers really are dicts;
            # `in` on a str or list would otherwise lead to a TypeError on
            # the item access below.
            match = rules.get("match") if isinstance(rules, dict) else None
            if isinstance(match, dict) and "regex" in match:
                match["regex"] = r"https?://\(" + match["regex"]
            orig_init(self, good_surts, rules)

        pywb.cdx.cdxdomainspecific.CDXDomainSpecificRule.__init__ = cdx_dsrule_init


def support_in_progress_warcs():
    """
    Monkey-patch pywb.warc.pathresolvers.PrefixResolver to include warcs still
    being written to (warcs having ".open" suffix). This way if a cdx entry
    references foo.warc.gz, pywb will try both foo.warc.gz and
    foo.warc.gz.open.
    """
    _orig_prefix_resolver_call = pywb.warc.pathresolvers.PrefixResolver.__call__

    def _prefix_resolver_call(self, filename, cdx=None):
        raw_results = _orig_prefix_resolver_call(self, filename, cdx)
        results = []
        # keep each original candidate, immediately followed by its ".open"
        # variant
        for warc_path in raw_results:
            results.append(warc_path)
            results.append("%s.open" % warc_path)
        return results

    pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call


class SomeWbUrl(pywb.rewrite.wburl.WbUrl):
    """WbUrl that leaves urn:, screenshot: and thumbnail: urls untouched."""

    def __init__(self, orig_url):
        """
        Parse `orig_url`; the brozzler-specific part is marked below.

        Raises:
            Exception: if `orig_url` is accepted by neither `_init_query` nor
                `_init_replay`
        """
        import six

        from six.moves.urllib.parse import quote

        from pywb.rewrite.wburl import WbUrl

        pywb.rewrite.wburl.BaseWbUrl.__init__(self)

        if six.PY2 and isinstance(orig_url, six.text_type):
            orig_url = orig_url.encode("utf-8")
            orig_url = quote(orig_url)

        self._original_url = orig_url

        if not self._init_query(orig_url):
            if not self._init_replay(orig_url):
                raise Exception("Invalid WbUrl: ", orig_url)

        new_uri = WbUrl.to_uri(self.url)

        self._do_percent_encode = True

        self.url = new_uri

        # begin brozzler changes
        # these are not http urls, so skip the scheme fix-ups below
        if self.url.startswith(("urn:", "screenshot:", "thumbnail:")):
            return
        # end brozzler changes

        # protocol agnostic url -> http://
        # no protocol -> http://
        # inx = self.url.find('://')
        inx = -1
        m = self.SCHEME_RX.match(self.url)
        if m:
            inx = m.span(1)[0]

        # if inx < 0:
        # check for other partially encoded variants
        #    m = self.PARTIAL_ENC_RX.match(self.url)
        #    if m:
        #        len_ = len(m.group(0))
        #        self.url = (urllib.unquote_plus(self.url[:len_]) +
        #                    self.url[len_:])
        #        inx = self.url.find(':/')

        if inx < 0:
            self.url = self.DEFAULT_SCHEME + self.url
        else:
            inx += 2
            # insert the second slash when only one follows the colon
            if inx < len(self.url) and self.url[inx] != "/":
                self.url = self.url[:inx] + "/" + self.url[inx:]


def _get_wburl_type(self):
    """Replacement for `WbUrlHandler.get_wburl_type`; returns `SomeWbUrl`."""
    return SomeWbUrl


def monkey_patch_wburl():
    """Make pywb's `WbUrlHandler` use `SomeWbUrl`."""
    pywb.framework.basehandlers.WbUrlHandler.get_wburl_type = _get_wburl_type


class BrozzlerWaybackCli(pywb.apps.cli.WaybackCli):
    """pywb wayback command line with brozzler-specific help text."""

    def _extend_parser(self, arg_parser):
        """Extend the parent parser: hide one option and add an epilog."""
        super()._extend_parser(arg_parser)
        # NOTE(review): positional index into a private argparse list; it
        # relies on the parent class adding --autoindex as the fifth action.
        arg_parser._actions[4].help = argparse.SUPPRESS  # --autoindex
        # raw formatter so the epilog's line breaks are printed as written
        arg_parser.formatter_class = argparse.RawDescriptionHelpFormatter
        arg_parser.epilog = """
Run pywb like so:

    $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback

See README.rst for more information.
"""


# copied and pasted from cdxdomainspecific.py, only changes are commented as
# such below
def _fuzzy_query_call(self, query):
    """
    Replacement for `FuzzyQuery.__call__`.

    Finds the first rule whose regex matches the query's url key, extends the
    query's filters from the match groups, trims the url after the rule's
    replace marker ("?" by default), and for "domain" rules reduces the url to
    scheme plus parent host.

    Returns:
        None if no rule matches, otherwise `query.params` updated in place
        with "url", "matchType" and "filter", and with "reverse", "closest"
        and "end_key" removed.
    """
    # imports added here for brozzler
    from pywb.utils.loaders import to_native_str
    from six.moves.urllib.parse import urlsplit, urlunsplit

    matched_rule = None

    urlkey = to_native_str(query.key, "utf-8")
    url = query.url
    filter_ = query.filters
    output = query.output

    for rule in self.rules.iter_matching(urlkey):
        m = rule.regex.search(urlkey)
        if not m:
            continue

        matched_rule = rule

        groups = m.groups()
        for g in groups:
            for f in matched_rule.filter:
                filter_.append(f.format(g))

        # only the first matching rule is applied
        break

    if not matched_rule:
        return None

    repl = "?"
    if matched_rule.replace:
        repl = matched_rule.replace

    inx = url.find(repl)
    if inx > 0:
        url = url[: inx + len(repl)]

    # begin brozzler changes
    if matched_rule.match_type == "domain":
        orig_split_url = urlsplit(url)
        # remove the subdomain, path, query and fragment
        netloc = orig_split_url.netloc
        _, dot, parent_host = netloc.partition(".")
        # a netloc without any "." has no subdomain to strip; keep it as is
        # rather than failing with IndexError
        host = parent_host if dot else netloc
        new_split_url = (orig_split_url.scheme, host, "", "", "")
        url = urlunsplit(new_split_url)
    # end brozzler changes

    params = query.params
    params.update(
        {"url": url, "matchType": matched_rule.match_type, "filter": filter_}
    )

    if "reverse" in params:
        del params["reverse"]

    if "closest" in params:
        del params["closest"]

    if "end_key" in params:
        del params["end_key"]

    return params


def monkey_patch_fuzzy_query():
    """Install `_fuzzy_query_call` as `FuzzyQuery.__call__`."""
    pywb.cdx.cdxdomainspecific.FuzzyQuery.__call__ = _fuzzy_query_call


# copied and pasted from pywb/utils/canonicalize.py, only changes are commented
# as such
def _calc_search_range(url, match_type, surt_ordered=True, url_canon=None):
    """
    Compute the `(start_key, end_key)` range for a cdx lookup.

    Args:
        url: url to look up
        match_type: one of "exact", "prefix", "host", "domain"
        surt_ordered: used only when `url_canon` is not given
        url_canon: canonicalizer to use; when given, its `surt_ordered`
            attribute overrides the `surt_ordered` argument

    Raises:
        UrlCanonicalizeException: for an unknown `match_type`, or for
            "domain" when not surt ordered
    """
    # imports added here for brozzler
    from pywb.utils.canonicalize import UrlCanonicalizer, UrlCanonicalizeException
    import six.moves.urllib.parse as urlparse

    def inc_last_char(x):
        # smallest string of the same length that sorts after every string
        # starting with x
        return x[0:-1] + chr(ord(x[-1]) + 1)

    if not url_canon:
        # make new canon
        url_canon = UrlCanonicalizer(surt_ordered)
    else:
        # ensure surt order matches url_canon
        surt_ordered = url_canon.surt_ordered

    start_key = url_canon(url)

    if match_type == "exact":
        end_key = start_key + "!"

    elif match_type == "prefix":
        # add trailing slash if url has it
        if url.endswith("/") and not start_key.endswith("/"):
            start_key += "/"

        end_key = inc_last_char(start_key)

    elif match_type == "host":
        if surt_ordered:
            host = start_key.split(")/")[0]

            start_key = host + ")/"
            end_key = host + "*"
        else:
            host = urlparse.urlsplit(url).netloc

            start_key = host + "/"
            end_key = host + "0"

    elif match_type == "domain":
        if not surt_ordered:
            msg = "matchType=domain unsupported for non-surt"
            raise UrlCanonicalizeException(msg)

        host = start_key.split(")/")[0]

        # if tld, use com, as start_key
        # otherwise, stick with com,example)/
        if "," not in host:
            start_key = host + ","
        else:
            start_key = host + ")/"

        # begin brozzler changes
        end_key = host + "~"
        # end brozzler changes
    else:
        raise UrlCanonicalizeException("Invalid match_type: " + match_type)

    return (start_key, end_key)


def monkey_patch_calc_search_range():
    """Install `_calc_search_range` in both pywb modules that reference it."""
    pywb.utils.canonicalize.calc_search_range = _calc_search_range
    pywb.cdx.query.calc_search_range = _calc_search_range


def main(argv=sys.argv):
    """
    Entry point: apply all monkey patches, then run the pywb wayback cli.

    Args:
        argv: full argument vector; `argv[0]` is skipped
    """
    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()
    brozzler.pywb.monkey_patch_wburl()
    brozzler.pywb.monkey_patch_fuzzy_query()
    brozzler.pywb.monkey_patch_calc_search_range()
    wayback_cli = BrozzlerWaybackCli(
        args=argv[1:],
        default_port=8880,
        desc=(
            "brozzler-wayback - pywb wayback (monkey-patched for use "
            "with brozzler)"
        ),
    )
    wayback_cli.run()