Use the uncanonicalized URL as part of the SHA-1 input when generating the page id. Canonicalization was stripping off the #fragment, but we might want to crawl the same URL with different fragments, and GoogleURLCanonicalizer has no option to keep the fragment.

This commit is contained in:
Noah Levitt 2016-04-21 22:01:49 +00:00
parent dd8f0d525d
commit 568a553432
2 changed files with 2 additions and 3 deletions

View File

@@ -105,8 +105,7 @@ class Page(brozzler.BaseDictable):
if id is not None: if id is not None:
self.id = id self.id = id
else: else:
digest_this = "site_id:{},canon_url:{}".format( digest_this = "site_id:{},url:{}".format(self.site_id, self.url)
self.site_id, self.canon_url())
self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest() self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
def __repr__(self): def __repr__(self):

View File

@@ -2,7 +2,7 @@ import setuptools
import glob import glob
setuptools.setup(name='brozzler', setuptools.setup(name='brozzler',
version='1.1.dev4', version='1.1.dev5',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler', url='https://github.com/nlevitt/brozzler',
author='Noah Levitt', author='Noah Levitt',