mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-21 05:14:22 -04:00
fix pywb/brozzler replay of revisit records
This commit is contained in:
parent
27452990ee
commit
4044fcb647
2 changed files with 5 additions and 2 deletions
|
@@ -63,13 +63,16 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
# short-circuit this step and create the CDXObject directly
|
# short-circuit this step and create the CDXObject directly
|
||||||
blob = {
|
blob = {
|
||||||
'url': record['url'],
|
'url': record['url'],
|
||||||
'mime': record['content_type'],
|
|
||||||
'status': str(record['response_code']),
|
'status': str(record['response_code']),
|
||||||
'digest': record['sha1base32'],
|
'digest': record['sha1base32'],
|
||||||
'length': str(record['length']), # XXX is this the right length?
|
'length': str(record['length']), # XXX is this the right length?
|
||||||
'offset': str(record['offset']),
|
'offset': str(record['offset']),
|
||||||
'filename': record['filename'],
|
'filename': record['filename'],
|
||||||
}
|
}
|
||||||
|
if record['warc_type'] != 'revisit':
|
||||||
|
blob['mime'] = record['content_type']
|
||||||
|
else:
|
||||||
|
blob['mime'] = 'warc/revisit'
|
||||||
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
||||||
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
||||||
record['canon_surt'], record['timestamp'],
|
record['canon_surt'], record['timestamp'],
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b7.dev99',
|
version='1.1b7.dev100',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue