Use warcprox if enable_warcprox_features is true

2025-07-05 12:04:39 -04:00 · 2016-10-18 17:39:33 -07:00 · 2016-10-18 17:39:33 -07:00 · 2215aaab21
commit 2215aaab21
parent a370e7b987
2 changed files with 79 additions and 5 deletions
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 import os
 import logging
 import brozzler
 import brozzler.browser
 import threading
 import time
 import signal
 import youtube_dl
 import urllib.request
 import json
 import PIL.Image
 import io
 import socket
 import datetime
 import collections
 import requests
 import rethinkstuff
@ -114,7 +111,23 @@ class BrozzlerWorker:
        self._browsing_threads = set()
    def _proxy(self, site):
        '''
        Returns the proxy address ("host:port") to use for `site`, or None.

        Order of precedence:

        1. `site.proxy`, if it is set
        2. the proxy configured on this worker (`self.__proxy`)
        3. if this worker has a service registry and warcprox features are
           enabled for the site or for the worker, an instance of warcprox
           obtained from the service registry. The chosen address is
           assigned to `site.proxy` and the site is handed to
           `self._frontier.update_site()`, so subsequent calls with this
           site return the same proxy.

        Raises:
            RuntimeError: if warcprox features are enabled but the service
                registry has no available warcprox instance to offer
        '''
        if site.proxy:
            return site.proxy
        if self.__proxy:
            return self.__proxy

        if self._service_registry and (
                site.enable_warcprox_features or
                self.__enable_warcprox_features):
            warcprox_service = self._service_registry.available_service(
                    'warcprox')
            # Fail with a clear message rather than an opaque TypeError from
            # subscripting None when nothing is registered or alive.
            if not warcprox_service:
                raise RuntimeError(
                        'no available instances of warcprox in the service '
                        'registry')
            site.proxy = '%s:%s' % (warcprox_service['host'],
                                    warcprox_service['port'])
            self._frontier.update_site(site)
            self.logger.info(
                    'chose warcprox %s from service registry for site %s',
                    site.proxy, site)
            return site.proxy

        return None
    def _enable_warcprox_features(self, site):
        if site.enable_warcprox_features is not None:
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -139,3 +139,64 @@ def test_brozzle_site(httpd):
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
 def test_warcprox_selection(httpd):
    '''
    When enable_warcprox_features is true, brozzler is expected to choose an
    instance of warcprox.

    Creates a site with warcprox features enabled and no explicit proxy,
    waits for a proxy to show up on the site record, waits for the crawl to
    finish, then checks the crawled pages, the captures table, and playback
    through pywb.
    '''
    test_id = 'test_warcprox_selection-%s' % (
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    site = brozzler.Site(
            seed=page1,
            enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # check proxy is set in rethink
    start = time.time()
    while not site.proxy and time.time() - start < 20:
        time.sleep(0.5)
        site = frontier.site(site.id)
    # fail with a readable message instead of a TypeError from slicing None
    assert site.proxy, 'no proxy was set on the site within 20 seconds'
    assert site.proxy[-5:] == ':8000'

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    payload_path = os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt')
    # context manager so the file handle is closed deterministically
    with open(payload_path, 'rb') as f:
        expected_payload = f.read()
    assert requests.get(wb_url).content == expected_payload