From e23fa68d65181470b83f27ce9412324435955d97 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 17 Oct 2019 13:47:33 -0700
Subject: [PATCH] fix bug clobbering own changes to parent_page and some other
 tweaks (python 3.5+, pytest logging config, ...)

---
 .travis.yml            |  3 ++-
 README.rst             |  2 +-
 brozzler/frontier.py   | 26 ++++++++++++++++++--------
 pytest.ini             |  6 ++++++
 setup.py               |  1 -
 tests/test_cluster.py  |  1 +
 tests/test_frontier.py |  2 +-
 7 files changed, 29 insertions(+), 12 deletions(-)
 create mode 100644 pytest.ini

diff --git a/.travis.yml b/.travis.yml
index c20872e..b8a50cb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,6 @@
 dist: xenial
 language: python
 python:
-- 3.4
 - 3.5
 - 3.6
 - 3.7
@@ -24,6 +23,8 @@ script:
 - DISPLAY=:1 py.test --tb=native -v tests
 after_failure:
 - chromium-browser --version
+- sudo kill -QUIT $(sudo svstat /etc/service/warcprox | egrep -o 'pid [0-9]+' | awk '{print $2}')
+- sudo kill -QUIT $(sudo svstat /etc/service/brozzler-worker | egrep -o 'pid [0-9]+' | awk '{print $2}')
 - sudo cat /var/log/warcprox.log
 - sudo cat /var/log/brozzler-worker.log
 - sudo cat /var/log/pywb.log
diff --git a/README.rst b/README.rst
index 85fe1f3..9f9c28a 100644
--- a/README.rst
+++ b/README.rst
@@ -19,7 +19,7 @@ Brozzler is designed to work in conjuction with warcprox for web archiving.
 Requirements
 ------------
 
-- Python 3.4 or later
+- Python 3.5 or later
 - RethinkDB deployment
 - Chromium or Google Chrome >= version 64
 
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 0e3b777..6715eb3 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -314,7 +314,7 @@ class RethinkDbFrontier:
         '''
         existing_page.priority += fresh_page.priority
         existing_page.hashtags = list(set(
-            existing_page.hashtags + fresh_page.hashtags))
+            (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
         existing_page.hops_off = min(
                 existing_page.hops_off, fresh_page.hops_off)
 
@@ -375,14 +375,18 @@ class RethinkDbFrontier:
             decisions['accepted'].add(fresh_page.url)
             if fresh_page.id in pages:
                 page = pages[fresh_page.id]
-                page.hashtags = list(set((page.hashtags or [])
-                    + fresh_page.hashtags))
-                page.priority += fresh_page.priority
+                self._merge_page(page, fresh_page)
                 counts['updated'] += 1
             else:
                 pages[fresh_page.id] = fresh_page
                 counts['added'] += 1
 
+        # make sure we're not stepping on our own toes in case we have a link
+        # back to parent_page, which I think happens because of hashtags
+        if parent_page.id in pages:
+            self._merge_page(parent_page, pages[parent_page.id])
+            del pages[parent_page.id]
+
         # insert/replace in batches of 50 to try to avoid this error:
         # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
         # there can be many pages and each one can be very large (many videos,
@@ -392,8 +396,11 @@ class RethinkDbFrontier:
             try:
                 self.logger.debug(
                         'inserting/replacing batch of %s pages', len(batch))
-                result = self.rr.table('pages').insert(
-                        batch, conflict='replace').run()
+                reql = self.rr.table('pages').insert(batch, conflict='replace')
+                self.logger.trace(
+                        'running query self.rr.table("pages").insert(%r, '
+                        'conflict="replace")', batch)
+                result = reql.run()
             except Exception as e:
                 self.logger.error(
                         'problem inserting/replacing batch of %s pages',
@@ -450,12 +457,15 @@ class RethinkDbFrontier:
         Returns:
             iterator of brozzler.Page
         '''
-        results = self.rr.table("pages").between(
+        query = self.rr.table("pages").between(
                 [site_id, 1 if brozzled is True else 0, r.minval, r.minval],
                 [site_id, 0 if brozzled is False else r.maxval,
                     r.maxval, r.maxval],
-                index="priority_by_site").run()
+                index="priority_by_site")
+        self.logger.trace("running query: %r", query)
+        results = query.run()
         for result in results:
+            self.logger.trace("yielding result: %r", result)
             yield brozzler.Page(self.rr, result)
 
 
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..18b7f86
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+# https://docs.pytest.org/en/latest/logging.html
+# https://github.com/pytest-dev/pytest/issues/5296
+[pytest]
+log_format = %(asctime)s.%(msecs)03d %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
+
diff --git a/setup.py b/setup.py
index 2cece97..6bb888d 100644
--- a/setup.py
+++ b/setup.py
@@ -95,7 +95,6 @@ setuptools.setup(
         'Development Status :: 5 - Production/Stable',
         'Environment :: Console',
         'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index e04624b..fcff145 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -32,6 +32,7 @@ import requests
 import subprocess
 import http.server
 import logging
+import sys
 import warcprox
 
 # https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index 800da1e..64f7ab5 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -733,7 +733,7 @@ def test_hashtag_seed():
     assert pages[0].hashtags == ['#hash',]
 
 def test_hashtag_links():
-    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    rr = doublethink.Rethinker('localhost', db='test_hashtag_links')
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {'seed': 'http://example.org/'})
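
Note, not part of the patch: a minimal, self-contained sketch of the None-tolerant
merge that the _merge_page() changes above rely on, and that the new parent_page
handling calls so a queued duplicate of parent_page is folded into the in-memory
object rather than clobbered by the conflict='replace' batch insert. The standalone
merge_page() function and the SimpleNamespace pages are illustrative stand-ins, not
brozzler API.

    # illustrative sketch only -- mirrors the bookkeeping in the diff:
    # priorities accumulate, hashtags are unioned with None tolerated,
    # and hops_off keeps the smaller value
    from types import SimpleNamespace

    def merge_page(existing_page, fresh_page):
        existing_page.priority += fresh_page.priority
        existing_page.hashtags = list(set(
            (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
        existing_page.hops_off = min(
            existing_page.hops_off, fresh_page.hops_off)

    parent = SimpleNamespace(priority=10, hashtags=None, hops_off=0)
    duplicate = SimpleNamespace(priority=5, hashtags=['#hash'], hops_off=1)
    merge_page(parent, duplicate)
    assert parent.priority == 15
    assert parent.hashtags == ['#hash']
    assert parent.hops_off == 0
    # without the `or []` guards, hashtags=None on either side would raise a
    # TypeError when the lists are concatenated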