Use warcprox if enable_warcprox_features is true

This commit is contained in:
Mouse Reeve 2016-10-18 17:39:33 -07:00
parent a370e7b987
commit 2215aaab21
2 changed files with 79 additions and 5 deletions

View File

@ -18,20 +18,17 @@ See the License for the specific language governing permissions and
limitations under the License.
'''
import os
import logging
import brozzler
import brozzler.browser
import threading
import time
import signal
import youtube_dl
import urllib.request
import json
import PIL.Image
import io
import socket
import datetime
import collections
import requests
import rethinkstuff
@ -114,7 +111,23 @@ class BrozzlerWorker:
self._browsing_threads = set()
def _proxy(self, site):
return site.proxy or self.__proxy
if site.proxy:
return site.proxy
elif self.__proxy:
return self.__proxy
elif self._service_registry and (
site.enable_warcprox_features or
self.__enable_warcprox_features):
warcprox_service = self._service_registry.available_service('warcprox')
site.proxy = '%s:%s' % (warcprox_service['host'],
warcprox_service['port'])
self._frontier.update_site(site)
self.logger.info(
'chose warcprox %s from service registry for site %s',
site.proxy, site)
return site.proxy
return None
def _enable_warcprox_features(self, site):
if site.enable_warcprox_features is not None:

View File

@ -121,7 +121,68 @@ def test_brozzle_site(httpd):
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port }
'http://localhost:%s/file1.txt' % httpd.server_port}
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
# no screenshots of plaintext
# check pywb
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
''' When enable_warcprox_features is true, brozzler is expected to choose
an instance of warcprox '''
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
assert site.id is None
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
assert site.id is not None
assert len(list(frontier.site_pages(site.id))) == 1
# check proxy is set in rethink
start = time.time()
while not site.proxy and time.time() - start < 20:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.proxy[-5:] == ':8000'
# the site should be brozzled fairly quickly
start = time.time()
while site.status != 'FINISHED' and time.time() - start < 300:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()