Use warcprox if enable_warcprox_features is true

This commit is contained in:
Mouse Reeve 2016-10-18 17:39:33 -07:00
parent a370e7b987
commit 2215aaab21
2 changed files with 79 additions and 5 deletions

View File

@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
limitations under the License.
'''
import os
import logging
import brozzler
import brozzler.browser
import threading
import time
import signal
import youtube_dl
import urllib.request
import json
import PIL.Image
import io
import socket
import datetime
import collections
import requests
import rethinkstuff
@ -114,7 +111,23 @@ class BrozzlerWorker:
self._browsing_threads = set()
def _proxy(self, site):
return site.proxy or self.__proxy
if site.proxy:
return site.proxy
elif self.__proxy:
return self.__proxy
elif self._service_registry and (
site.enable_warcprox_features or
self.__enable_warcprox_features):
warcprox_service = self._service_registry.available_service('warcprox')
site.proxy = '%s:%s' % (warcprox_service['host'],
warcprox_service['port'])
self._frontier.update_site(site)
self.logger.info(
'chose warcprox %s from service registry for site %s',
site.proxy, site)
return site.proxy
return None
def _enable_warcprox_features(self, site):
if site.enable_warcprox_features is not None:

View File

@ -121,7 +121,68 @@ def test_brozzle_site(httpd):
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port }
'http://localhost:%s/file1.txt' % httpd.server_port}
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
# no screenshots of plaintext
# check pywb
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
an instance of warcprox '''
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
# check proxy is set in rethink
start = time.time()
while not site.proxy and time.time() - start < 20:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.proxy[-5:] == ':8000'
# the site should be brozzled fairly quickly
start = time.time()
while site.status != 'FINISHED' and time.time() - start < 300:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()