fix pywb/brozzler replay of revisit records

2025-04-19 23:35:54 -04:00 · 2016-10-14 19:15:23 -07:00 · 2016-10-14 19:15:23 -07:00 · 4044fcb647
commit 4044fcb647
parent 27452990ee
2 changed files with 5 additions and 2 deletions
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@@ -63,13 +63,16 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
            # short-circuit this step and create the CDXObject directly
            blob = {
                'url': record['url'],
-                'mime': record['content_type'],
                'status': str(record['response_code']),
                'digest': record['sha1base32'],
                'length': str(record['length']), # XXX is this the right length?
                'offset': str(record['offset']),
                'filename': record['filename'],
            }
+            if record['warc_type'] != 'revisit':
+                blob['mime'] = record['content_type']
+            else:
+                blob['mime'] = 'warc/revisit'
            # b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
            cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
                    record['canon_surt'], record['timestamp'],
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b7.dev99',
+        version='1.1b7.dev100',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',