From 05fab8b909a7c1b346b9f5428b781c9d226bb955 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 12 Nov 2018 16:21:38 -0800 Subject: [PATCH 1/2] change time limit enforcement enforce time limit based on all the time that a site was in active rotation, including time it spent waiting for its turn to be brozzled; this undoes the change from b9640b8a30c934, because now it seems that was the wrong decision (brozzler jobs with many seeds and low max_claimed_sites hanging around forever) --- brozzler/frontier.py | 10 ++++------ brozzler/worker.py | 2 +- tests/test_frontier.py | 24 ++++++------------------ 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 5272a88..3826abf 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -152,17 +152,15 @@ class RethinkDbFrontier: else: raise brozzler.NothingToClaim - def enforce_time_limit(self, site, session_time=0): + def enforce_time_limit(self, site): ''' Raises `brozzler.ReachedTimeLimit` if appropriate. ''' - if (site.time_limit - and site.time_limit > 0 - and (site.active_brozzling_time or 0) + session_time > site.time_limit): + if (site.time_limit and site.time_limit > 0 + and site.elapsed() > site.time_limit): self.logger.debug( "site FINISHED_TIME_LIMIT! time_limit=%s " - "active_brozzling_time=%s %s", site.time_limit, - site.active_brozzling_time, site) + "elapsed=%s %s", site.time_limit, site.elapsed(), site) raise brozzler.ReachedTimeLimit def claim_page(self, site, worker_id): diff --git a/brozzler/worker.py b/brozzler/worker.py index 4aad2a3..8121770 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -345,7 +345,7 @@ class BrozzlerWorker: self._proxy_for(site), site) while time.time() - start < self.SITE_SESSION_MINUTES * 60: site.refresh() - self._frontier.enforce_time_limit(site, time.time() - start) + self._frontier.enforce_time_limit(site) self._frontier.honor_stop_request(site) page = self._frontier.claim_page(site, "%s:%s" % ( socket.gethostname(), browser.chrome.port)) diff --git a/tests/test_frontier.py b/tests/test_frontier.py index d66773e..800da1e 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -21,6 +21,7 @@ limitations under the License. import argparse import datetime import logging +import time import doublethink import pytest @@ -375,15 +376,10 @@ def test_time_limit(): assert site.starts_and_stops[1]['start'] assert site.starts_and_stops[1]['stop'] is None - # time limit not reached yet + # no time limit set frontier.enforce_time_limit(site) - assert site.status == 'ACTIVE' - assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] is None - - site.time_limit = 0.1 + site.time_limit = 10 site.claimed = True site.save() @@ -394,19 +390,11 @@ def test_time_limit(): assert site.starts_and_stops[1]['start'] assert site.starts_and_stops[1]['stop'] is None - site.active_brozzling_time = 0.2 # this is why the time limit will be hit + site.time_limit = 0.1 + time.sleep(0.1) - try: + with pytest.raises(brozzler.ReachedTimeLimit): frontier.enforce_time_limit(site) - except brozzler.ReachedTimeLimit: - frontier.finished(site, 'FINISHED_TIME_LIMIT') - - assert site.status == 'FINISHED_TIME_LIMIT' - assert not site.claimed - assert len(site.starts_and_stops) == 2 - assert site.starts_and_stops[1]['start'] - assert site.starts_and_stops[1]['stop'] - assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start'] def test_field_defaults(): rr = doublethink.Rethinker('localhost', db='ignoreme') From ebcc063fe28e4e6d1e590787be3676a358832d0e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Nov 2018 14:52:11 -0800 Subject: [PATCH 2/2] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d873804..bc4b868 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev314', + version='1.5.dev315', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',