diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 4978bbd..6715eb3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -313,9 +313,6 @@ class RethinkDbFrontier: representing the same url but with possibly different metadata. ''' existing_page.priority += fresh_page.priority - self.logger.trace( - 'adding hashtags %r to existing hashtags %r for page %s', - existing_page.hashtags, fresh_page.hashtags, fresh_page.url) existing_page.hashtags = list(set( (existing_page.hashtags or []) + (fresh_page.hashtags or []))) existing_page.hops_off = min( @@ -371,8 +368,6 @@ class RethinkDbFrontier: # get existing pages from rethinkdb results = self.rr.table('pages').get_all(*fresh_pages.keys()).run() pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results} - self.logger.trace('fresh_pages.keys()=%r', fresh_pages.keys()) - self.logger.trace('existing pages.keys()=%r', pages.keys()) # build list of pages to save, consisting of new pages, and existing # pages updated with higher priority and new hashtags @@ -422,10 +417,6 @@ class RethinkDbFrontier: counts['added'], counts['updated'], counts['rejected'], counts['blocked'], parent_page) - for page_id in pages: - page = self.rr.table('pages').get(page_id).run() - self.logger.trace('retrieved page after save: %r', page) - def reached_limit(self, site, e): self.logger.info("reached_limit site=%s e=%s", site, e) assert isinstance(e, brozzler.ReachedLimit) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index ac630f9..fcff145 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -510,10 +510,7 @@ def test_seed_redirect(httpd): {'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}, {'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]} -def test_hashtags(httpd, caplog): - caplog.set_level(0) # https://docs.pytest.org/en/latest/logging.html - logging.trace('here we are in test_hashtags') - +def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') seed_url = make_url(httpd, '/site7/') @@ -532,8 +529,6 @@ def test_hashtags(httpd, caplog): site.refresh() assert site.status == 'FINISHED' - print(datetime.datetime.utcnow().isoformat() + ' finished brozzling site, loading pages from rethinkdb') - # check that we the page we expected pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url) assert len(pages) == 2 diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 9cdcfe1..64f7ab5 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -732,9 +732,7 @@ def test_hashtag_seed(): assert pages[0].url == 'http://example.org/' assert pages[0].hashtags == ['#hash',] -def test_hashtag_links(caplog): - caplog.set_level(0) # https://docs.pytest.org/en/latest/logging.html - +def test_hashtag_links(): rr = doublethink.Rethinker('localhost', db='test_hashtag_links') frontier = brozzler.RethinkDbFrontier(rr)