Use the uncanonicalized URL as part of the SHA-1 input when generating the page id. Canonicalization was stripping off the #fragment, but we might want to crawl the same URL with different fragments, and GoogleURLCanonicalizer has no option to keep the fragment.

This commit is contained in:
Noah Levitt 2016-04-21 22:01:49 +00:00
parent dd8f0d525d
commit 568a553432
2 changed files with 2 additions and 3 deletions

View File

@@ -105,8 +105,7 @@ class Page(brozzler.BaseDictable):
if id is not None:
self.id = id
else:
digest_this = "site_id:{},canon_url:{}".format(
self.site_id, self.canon_url())
digest_this = "site_id:{},url:{}".format(self.site_id, self.url)
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def __repr__(self):

View File

@@ -2,7 +2,7 @@ import setuptools
import glob
setuptools.setup(name='brozzler',
version='1.1.dev4',
version='1.1.dev5',
description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler',
author='Noah Levitt',