mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Use the uncanonicalized URL as part of the SHA-1 input when generating the page id. Canonicalization was stripping off the #fragment, and we might want to crawl the same URL with different fragments (GoogleURLCanonicalizer has no option to keep the fragment).
This commit is contained in:
parent
dd8f0d525d
commit
568a553432
@ -105,8 +105,7 @@ class Page(brozzler.BaseDictable):
|
|||||||
if id is not None:
|
if id is not None:
|
||||||
self.id = id
|
self.id = id
|
||||||
else:
|
else:
|
||||||
digest_this = "site_id:{},canon_url:{}".format(
|
digest_this = "site_id:{},url:{}".format(self.site_id, self.url)
|
||||||
self.site_id, self.canon_url())
|
|
||||||
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
|
2
setup.py
2
setup.py
@ -2,7 +2,7 @@ import setuptools
|
|||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev4',
|
version='1.1.dev5',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user