Use the uncanonicalized URL as part of the SHA-1 input when generating the page id. Canonicalization was stripping off the #fragment, and we might want to crawl the same URL with different fragments (GoogleURLCanonicalizer has no option to keep the fragment).

2025-08-07 14:02:24 -04:00 · 2016-04-21 22:01:49 +00:00 · 2016-04-21 22:01:49 +00:00 · 568a553432
commit 568a553432
parent dd8f0d525d
2 changed files with 2 additions and 3 deletions
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -105,8 +105,7 @@ class Page(brozzler.BaseDictable):
        if id is not None:
            self.id = id
        else:
-            digest_this = "site_id:{},canon_url:{}".format(
+            digest_this = "site_id:{},url:{}".format(self.site_id, self.url)
-                    self.site_id, self.canon_url())
            self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
    def __repr__(self):
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ import setuptools
 import glob
 setuptools.setup(name='brozzler',
-        version='1.1.dev4',
+        version='1.1.dev5',
        description='Distributed web crawling with browsers',
        url='https://github.com/nlevitt/brozzler',
        author='Noah Levitt',