diff --git a/brozzler/worker.py b/brozzler/worker.py
index 1cb6aa3..96a13cc 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 
-import os
 import logging
 import brozzler
 import brozzler.browser
 import threading
 import time
-import signal
 import youtube_dl
 import urllib.request
 import json
 import PIL.Image
 import io
 import socket
-import datetime
 import collections
 import requests
 import rethinkstuff
@@ -114,7 +111,23 @@ class BrozzlerWorker:
         self._browsing_threads = set()
 
     def _proxy(self, site):
-        return site.proxy or self.__proxy
+        if site.proxy:
+            return site.proxy
+        elif self.__proxy:
+            return self.__proxy
+        elif self._service_registry and (
+                site.enable_warcprox_features or
+                self.__enable_warcprox_features):
+            warcprox_service = self._service_registry.available_service('warcprox')
+            site.proxy = '%s:%s' % (warcprox_service['host'],
+                    warcprox_service['port'])
+            self._frontier.update_site(site)
+            self.logger.info(
+                    'chose warcprox %s from service registry for site %s',
+                    site.proxy, site)
+            return site.proxy
+        return None
+
 
     def _enable_warcprox_features(self, site):
         if site.enable_warcprox_features is not None:
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 29bb3d9..748950f 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -121,7 +121,68 @@ def test_brozzle_site(httpd):
     assert len(pages) == 2
     assert {page.url for page in pages} == {
             'http://localhost:%s/' % httpd.server_port,
-            'http://localhost:%s/file1.txt' % httpd.server_port }
+            'http://localhost:%s/file1.txt' % httpd.server_port}
+
+    # take a look at the captures table
+    captures = r.table('captures').filter({'test_id':test_id}).run()
+    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
+    assert page1 in captures_by_url
+    assert '%srobots.txt' % page1 in captures_by_url
+    assert page2 in captures_by_url
+    assert 'screenshot:%s' % page1 in captures_by_url
+    assert 'thumbnail:%s' % page1 in captures_by_url
+    # no screenshots of plaintext
+
+    # check pywb
+    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
+    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
+    expected_payload = open(os.path.join(
+        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
+    assert requests.get(wb_url).content == expected_payload
+
+
+def test_warcprox_selection(httpd):
+    ''' When enable_warcprox_features is true, brozzler is expected to choose
+    an instance of warcprox '''
+
+    test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
+
+    # the two pages we expect to be crawled
+    page1 = 'http://localhost:%s/' % httpd.server_port
+    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
+
+    site = brozzler.Site(
+            seed='http://localhost:%s/' % httpd.server_port,
+            enable_warcprox_features=True,
+            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
+
+    assert site.id is None
+    r = rethinkstuff.Rethinker('localhost', db='brozzler')
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.new_site(frontier, site)
+    assert site.id is not None
+    assert len(list(frontier.site_pages(site.id))) == 1
+
+    # check proxy is set in rethink
+    start = time.time()
+    while not site.proxy and time.time() - start < 20:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.proxy[-5:] == ':8000'
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site = frontier.site(site.id)
+    assert site.status == 'FINISHED'
+
+    # check that we got the two pages we expected
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 2
+    assert {page.url for page in pages} == {
+            'http://localhost:%s/' % httpd.server_port,
+            'http://localhost:%s/file1.txt' % httpd.server_port}
 
     # take a look at the captures table
     captures = r.table('captures').filter({'test_id':test_id}).run()
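
A note on the new proxy-selection logic in BrozzlerWorker._proxy: an explicit site.proxy takes precedence, then the worker-wide proxy, and only when warcprox features are enabled does brozzler look up a warcprox instance in the service registry and persist it back to the site. Below is a minimal standalone sketch of that precedence order for illustration; FakeRegistry, FakeSite, and choose_proxy are hypothetical stand-ins, not brozzler's actual classes.

# Minimal sketch (assumed names, not brozzler's real classes) of the
# proxy-selection order introduced in BrozzlerWorker._proxy above.

class FakeRegistry:
    '''Hypothetical stand-in for the rethinkstuff service registry.'''
    def available_service(self, role):
        # pretend exactly one healthy warcprox instance is registered
        return {'host': 'localhost', 'port': 8000}

class FakeSite:
    '''Hypothetical stand-in for brozzler.Site.'''
    def __init__(self, proxy=None, enable_warcprox_features=False):
        self.proxy = proxy
        self.enable_warcprox_features = enable_warcprox_features

def choose_proxy(site, worker_proxy, registry, worker_warcprox_features=False):
    # mirrors the precedence in the patch: site proxy, then worker proxy,
    # then a warcprox instance from the service registry, else no proxy
    if site.proxy:
        return site.proxy
    elif worker_proxy:
        return worker_proxy
    elif registry and (site.enable_warcprox_features or worker_warcprox_features):
        svc = registry.available_service('warcprox')
        # the real code also persists this via self._frontier.update_site(site)
        site.proxy = '%s:%s' % (svc['host'], svc['port'])
        return site.proxy
    return None

assert choose_proxy(FakeSite(proxy='10.0.0.1:8000'), None, FakeRegistry()) == '10.0.0.1:8000'
assert choose_proxy(FakeSite(enable_warcprox_features=True), None, FakeRegistry()) == 'localhost:8000'
assert choose_proxy(FakeSite(), None, None) is None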