Use warcprox if enable_warcprox_features is true

This commit is contained in:
Mouse Reeve 2016-10-18 17:39:33 -07:00
parent a370e7b987
commit 2215aaab21
2 changed files with 79 additions and 5 deletions

View File

@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
''' '''
import os
import logging import logging
import brozzler import brozzler
import brozzler.browser import brozzler.browser
import threading import threading
import time import time
import signal
import youtube_dl import youtube_dl
import urllib.request import urllib.request
import json import json
import PIL.Image import PIL.Image
import io import io
import socket import socket
import datetime
import collections import collections
import requests import requests
import rethinkstuff import rethinkstuff
@ -114,7 +111,23 @@ class BrozzlerWorker:
self._browsing_threads = set() self._browsing_threads = set()
def _proxy(self, site): def _proxy(self, site):
return site.proxy or self.__proxy if site.proxy:
return site.proxy
elif self.__proxy:
return self.__proxy
elif self._service_registry and (
site.enable_warcprox_features or
self.__enable_warcprox_features):
warcprox_service = self._service_registry.available_service('warcprox')
site.proxy = '%s:%s' % (warcprox_service['host'],
warcprox_service['port'])
self._frontier.update_site(site)
self.logger.info(
'chose warcprox %s from service registry for site %s',
site.proxy, site)
return site.proxy
return None
def _enable_warcprox_features(self, site): def _enable_warcprox_features(self, site):
if site.enable_warcprox_features is not None: if site.enable_warcprox_features is not None:

View File

@ -139,3 +139,64 @@ def test_brozzle_site(httpd):
expected_payload = open(os.path.join( expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read() os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
and instance of warcprox '''
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
# check proxy is set in rethink
start = time.time()
while not site.proxy and time.time() - start < 20:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.proxy[-5:] == ':8000'
# the site should be brozzled fairly quickly
start = time.time()
while site.status != 'FINISHED' and time.time() - start < 300:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
# no screenshots of plaintext
# check pywb
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload