mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Use warcprox if enable_warcprox_features is true
This commit is contained in:
parent
a370e7b987
commit
2215aaab21
@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
|
|||||||
limitations under the License.
|
limitations under the License.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
import brozzler
|
import brozzler
|
||||||
import brozzler.browser
|
import brozzler.browser
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import signal
|
|
||||||
import youtube_dl
|
import youtube_dl
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import json
|
import json
|
||||||
import PIL.Image
|
import PIL.Image
|
||||||
import io
|
import io
|
||||||
import socket
|
import socket
|
||||||
import datetime
|
|
||||||
import collections
|
import collections
|
||||||
import requests
|
import requests
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
@ -114,7 +111,23 @@ class BrozzlerWorker:
|
|||||||
self._browsing_threads = set()
|
self._browsing_threads = set()
|
||||||
|
|
||||||
def _proxy(self, site):
    '''
    Returns the proxy address to use when brozzling `site`, or None if no
    proxy applies.

    Order of precedence:

    1. the proxy already configured on the site (`site.proxy`)
    2. the proxy this worker was configured with (`self.__proxy`)
    3. if a service registry is available and warcprox features are
       enabled on the site or on this worker, an instance of warcprox
       obtained from the service registry

    In case 3 the chosen address is stored in `site.proxy` as 'host:port'
    and persisted with `self._frontier.update_site(site)`, so later calls
    for the same site return it through case 1.

    Raises:
        RuntimeError: if warcprox features are enabled but the service
            registry has no warcprox instance to offer
    '''
    if site.proxy:
        return site.proxy
    if self.__proxy:
        return self.__proxy

    if not self._service_registry:
        return None
    if not (site.enable_warcprox_features
            or self.__enable_warcprox_features):
        return None

    warcprox_service = self._service_registry.available_service('warcprox')
    if not warcprox_service:
        # NOTE(review): the registry presumably yields None when no healthy
        # warcprox instance is registered. Fail with a clear message rather
        # than an opaque TypeError from subscripting None; proceeding
        # without a proxy would mean brozzling without archiving.
        raise RuntimeError(
                'no available instances of warcprox in the service '
                'registry for site %s' % site)

    site.proxy = '%s:%s' % (
            warcprox_service['host'], warcprox_service['port'])
    # persist the choice so that the site keeps using the same warcprox
    self._frontier.update_site(site)
    self.logger.info(
            'chose warcprox %s from service registry for site %s',
            site.proxy, site)
    return site.proxy
|
||||||
|
|
||||||
|
|
||||||
def _enable_warcprox_features(self, site):
|
def _enable_warcprox_features(self, site):
|
||||||
if site.enable_warcprox_features is not None:
|
if site.enable_warcprox_features is not None:
|
||||||
|
@ -139,3 +139,64 @@ def test_brozzle_site(httpd):
|
|||||||
expected_payload = open(os.path.join(
|
expected_payload = open(os.path.join(
|
||||||
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
||||||
assert requests.get(wb_url).content == expected_payload
|
assert requests.get(wb_url).content == expected_payload
|
||||||
|
|
||||||
|
|
||||||
|
def test_warcprox_selection(httpd):
    ''' When enable_warcprox_features is true, brozzler is expected to choose
    an instance of warcprox '''

    test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    # test_id is attached to every capture so that the captures belonging to
    # this test run can be picked out of the captures table below
    site = brozzler.Site(
            seed=page1,
            enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # check proxy is set in rethink
    start = time.time()
    while not site.proxy and time.time() - start < 20:
        time.sleep(0.5)
        site = frontier.site(site.id)
    # fail with a readable message instead of a TypeError on None[-5:]
    assert site.proxy, 'no proxy chosen for the site within 20 seconds'
    # NOTE(review): presumably the test environment's warcprox listens on
    # port 8000 -- confirm against the test cluster setup
    assert site.proxy.endswith(':8000')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    payload_path = os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt')
    # close the file deterministically rather than leaking the handle
    with open(payload_path, 'rb') as f:
        expected_payload = f.read()
    assert requests.get(wb_url).content == expected_payload
|
||||||
|
Loading…
x
Reference in New Issue
Block a user