brozzler/tests/test_brozzling.py

#!/usr/bin/env python
'''
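test_brozzling.py - tests of brozzling pages with brozzler.Browser and
brozzler.BrozzlerWorker, using chrome and a local http server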

Copyright (C) 2016 Internet Archive

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

import argparse
import http.server
import json
import logging
import os
import threading
import urllib
import urllib.error
import urllib.request

import pytest

import brozzler

# brozzler.cli.configure_logging is handed an argparse namespace, so build one
# by hand that carries only the log level (INFO) and configure logging once,
# when this module is imported.
args = argparse.Namespace(log_level=logging.INFO)
brozzler.cli.configure_logging(args)

# Value sent as the Warcprox-Meta header of the canned 420 response (see the
# httpd fixture) and expected back on brozzler.ReachedLimit in test_420.
# The 'total', 'new' and 'revisit' counters of the single bucket are all zero.
WARCPROX_META_420 = {
    'stats': {
        'test_limits_bucket': {
            kind: {'urls': 0, 'wire_bytes': 0}
            for kind in ('total', 'new', 'revisit')
        },
    },
    'reached-limit': {'test_limits_bucket/total/urls': 0},
}
# added after the counters so that the key order matches total, new, revisit,
# bucket
WARCPROX_META_420['stats']['test_limits_bucket']['bucket'] = (
        'test_limits_bucket')

@pytest.fixture(scope='module')
def httpd(request):
    '''
    Module-scoped fixture that serves the `htdocs` directory next to this
    file from an http server listening on an ephemeral localhost port.

    GET /420 is answered with a canned "420 Reached limit" response that
    carries WARCPROX_META_420 in its Warcprox-Meta header; every other path is
    handled by SimpleHTTPRequestHandler. The server runs in a background
    thread until the finalizer shuts it down. The finalizer also restores the
    working directory that was current before the fixture changed it, so the
    chdir does not leak past this module's tests.

    Returns:
        the running http.server.HTTPServer (its port is `server_port`)
    '''
    class RequestHandler(http.server.SimpleHTTPRequestHandler):
        def do_GET(self):
            if self.path == '/420':
                self.send_response(420, 'Reached limit')
                self.send_header('Connection', 'close')
                self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))
                payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'
                self.send_header('Content-Type', 'text/plain;charset=utf-8')
                self.send_header('Content-Length', str(len(payload)))
                self.end_headers()
                self.wfile.write(payload)
            else:
                super().do_GET()

    # SimpleHTTPRequestHandler always uses CWD so we have to chdir; remember
    # the previous directory so this process-wide change can be undone
    orig_cwd = os.getcwd()
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))

    try:
        httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
        httpd_thread = threading.Thread(
                name='httpd', target=httpd.serve_forever)
        httpd_thread.start()
    except BaseException:
        # setup failed before the finalizer was registered, so undo the chdir
        # here
        os.chdir(orig_cwd)
        raise

    def fin():
        try:
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
        finally:
            os.chdir(orig_cwd)
    request.addfinalizer(fin)

    return httpd

def test_httpd(httpd):
    '''
    Tests that our http server is working as expected, and that two fetches
    of the same url return the same payload, proving it can be used to test
    deduplication. Also checks that a fetch of /420 fails with http status
    420.
    '''
    url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload1 = response.read()
        assert payload1

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload2 = response.read()
        assert payload2

    assert payload1 == payload2

    # urlopen raises HTTPError for the 420 status instead of returning a
    # response
    url = 'http://localhost:%s/420' % httpd.server_port
    with pytest.raises(urllib.error.HTTPError) as excinfo:
        urllib.request.urlopen(url)
    assert excinfo.value.getcode() == 420

def test_aw_snap_hes_dead_jim():
    '''
    Browsing to chrome://crash is expected to raise
    brozzler.BrowsingException.
    '''
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        with pytest.raises(brozzler.BrowsingException):
            browser.browse_page('chrome://crash')

def test_on_response(httpd):
    '''
    Browses site3/page.html with an on_response callback and checks that the
    first three response urls the callback saw are the page itself,
    brozzler.svg and favicon.ico, in that order.
    '''
    port = httpd.server_port
    expected_urls = (
        'http://localhost:%s/site3/page.html' % port,
        'http://localhost:%s/site3/brozzler.svg' % port,
        'http://localhost:%s/favicon.ico' % port,
    )
    seen_urls = []

    def record_url(msg):
        # each message handed to the callback carries the response url here
        seen_urls.append(msg['params']['response']['url'])

    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        browser.browse_page(expected_urls[0], on_response=record_url)

    for i, expected_url in enumerate(expected_urls):
        assert seen_urls[i] == expected_url

def test_420(httpd):
    '''
    Browsing the /420 url is expected to raise brozzler.ReachedLimit, with
    the exception's warcprox_meta equal to WARCPROX_META_420.
    '''
    limit_url = 'http://localhost:%s/420' % httpd.server_port
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        with pytest.raises(brozzler.ReachedLimit) as raised:
            browser.browse_page(limit_url)
        assert raised.value.warcprox_meta == WARCPROX_META_420

def test_js_dialogs(httpd):
    '''
    Browses the site4 alert, confirm and prompt pages one after another in a
    single browser session. The test passes if none of the browse_page calls
    raises.
    '''
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        # XXX print dialog unresolved, so print.html is not browsed here
        for page in ('alert.html', 'confirm.html', 'prompt.html'):
            browser.browse_page(
                    'http://localhost:%s/site4/%s' % (httpd.server_port, page))

def test_page_videos(httpd):
    '''
    Brozzles the site6 page with a BrozzlerWorker and checks the two entries
    recorded in page.videos: small.mp4 blamed on youtube-dl, then small.webm
    blamed on the browser.
    '''
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {
        'url':'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # NOTE(review): a 206 variant of this expectation is kept below for
        # reference -- presumably seen with other chromium versions; confirm
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
restore handling of "aw snap" or "he's dead jim" 2016-12-21 14:21:20 -08:00			`#!/usr/bin/env python`
			`'''`
			`test_brozzling.py - XXX explain`

			`Copyright (C) 2016 Internet Archive`

			`Licensed under the Apache License, Version 2.0 (the "License");`
			`you may not use this file except in compliance with the License.`
			`You may obtain a copy of the License at`

			`http://www.apache.org/licenses/LICENSE-2.0`

			`Unless required by applicable law or agreed to in writing, software`
			`distributed under the License is distributed on an "AS IS" BASIS,`
			`WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`See the License for the specific language governing permissions and`
			`limitations under the License.`
			`'''`

			`import pytest`
			`import brozzler`
restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00			`import logging`
			`import os`
			`import http.server`
			`import threading`
			`import argparse`
add import missing from test 2016-12-21 19:19:34 -08:00			`import urllib`
restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`import json`
restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00
			`args = argparse.Namespace()`
			`args.log_level = logging.INFO`
missed a spot 2017-01-20 23:59:31 -08:00			`brozzler.cli.configure_logging(args)`
restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00
restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`WARCPROX_META_420 = {`
			`'stats': {`
			`'test_limits_bucket': {`
			`'total': {'urls': 0, 'wire_bytes': 0},`
			`'new': {'urls': 0, 'wire_bytes': 0},`
			`'revisit': {'urls': 0, 'wire_bytes': 0},`
			`'bucket': 'test_limits_bucket'`
			`}`
			`},`
			`'reached-limit': {'test_limits_bucket/total/urls': 0}`
			`}`

restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00			`@pytest.fixture(scope='module')`
			`def httpd(request):`
restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`class RequestHandler(http.server.SimpleHTTPRequestHandler):`
			`def do_GET(self):`
			`if self.path == '/420':`
			`self.send_response(420, 'Reached limit')`
			`self.send_header('Connection', 'close')`
			`self.send_header('Warcprox-Meta', json.dumps(WARCPROX_META_420))`
			`payload = b'request rejected by warcprox: reached limit test_limits_bucket/total/urls=0\n'`
			`self.send_header('Content-Type', 'text/plain;charset=utf-8')`
			`self.send_header('Content-Length', len(payload))`
			`self.end_headers()`
			`self.wfile.write(payload)`
			`else:`
			`super().do_GET()`

restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00			`# SimpleHTTPRequestHandler always uses CWD so we have to chdir`
			`os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))`

restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)`
restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00			`httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)`
			`httpd_thread.start()`

			`def fin():`
			`httpd.shutdown()`
			`httpd.server_close()`
			`httpd_thread.join()`
			`request.addfinalizer(fin)`

			`return httpd`

			`def test_httpd(httpd):`
			`'''`
			`Tests that our http server is working as expected, and that two fetches`
			`of the same url return the same payload, proving it can be used to test`
			`deduplication.`
			`'''`
			`payload1 = content2 = None`
			`url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port`
			`with urllib.request.urlopen(url) as response:`
			`assert response.status == 200`
			`payload1 = response.read()`
			`assert payload1`

			`with urllib.request.urlopen(url) as response:`
			`assert response.status == 200`
			`payload2 = response.read()`
			`assert payload2`

			`assert payload1 == payload2`
restore handling of "aw snap" or "he's dead jim" 2016-12-21 14:21:20 -08:00
restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`url = 'http://localhost:%s/420' % httpd.server_port`
			`with pytest.raises(urllib.error.HTTPError) as excinfo:`
			`urllib.request.urlopen(url)`
			`assert excinfo.value.getcode() == 420`

restore handling of "aw snap" or "he's dead jim" 2016-12-21 14:21:20 -08:00			`def test_aw_snap_hes_dead_jim():`
			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`with brozzler.Browser(chrome_exe=chrome_exe) as browser:`
			`with pytest.raises(brozzler.BrowsingException):`
			`browser.browse_page('chrome://crash')`
restore support for on_response and on_request, with an automated test for on_response 2016-12-21 18:35:55 -08:00
			`def test_on_response(httpd):`
			`response_urls = []`
			`def on_response(msg):`
			`response_urls.append(msg['params']['response']['url'])`

			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`url = 'http://localhost:%s/site3/page.html' % httpd.server_port`
			`with brozzler.Browser(chrome_exe=chrome_exe) as browser:`
			`browser.browse_page(url, on_response=on_response)`
			`assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port`
			`assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port`
			`assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port`

restore handling of 420 Reached limit, with a rudimentary test 2016-12-22 13:44:09 -08:00			`def test_420(httpd):`
			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`url = 'http://localhost:%s/420' % httpd.server_port`
			`with brozzler.Browser(chrome_exe=chrome_exe) as browser:`
			`with pytest.raises(brozzler.ReachedLimit) as excinfo:`
			`browser.browse_page(url)`
			`assert excinfo.value.warcprox_meta == WARCPROX_META_420`
tests for dismissal of javascript dialogs (alert, prompt, confirm) 2017-01-13 11:46:42 -08:00
			`def test_js_dialogs(httpd):`
			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`url = 'http://localhost:%s/site4/alert.html' % httpd.server_port`
			`with brozzler.Browser(chrome_exe=chrome_exe) as browser:`
			`# before commit d2ed6b97a24 these would hang and eventually raise`
			`# brozzler.browser.BrowsingTimeout, which would cause this test to fail`
			`browser.browse_page(`
			`'http://localhost:%s/site4/alert.html' % httpd.server_port)`
			`browser.browse_page(`
			`'http://localhost:%s/site4/confirm.html' % httpd.server_port)`
			`browser.browse_page(`
			`'http://localhost:%s/site4/prompt.html' % httpd.server_port)`
			`# XXX print dialog unresolved`
			`# browser.browse_page(`
			`# 'http://localhost:%s/site4/print.html' % httpd.server_port)`

save info about embedded videos in page document in rethinkdb 2017-03-20 11:49:11 -07:00			`def test_page_videos(httpd):`
oops remove pdb call 2017-03-20 12:14:11 -07:00			`# test depends on behavior of youtube-dl and chromium, could fail and need`
			`# to be adjusted on youtube-dl or chromium updates`
save info about embedded videos in page document in rethinkdb 2017-03-20 11:49:11 -07:00			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`worker = brozzler.BrozzlerWorker(None)`
			`chrome_exe = brozzler.suggest_default_chrome_exe()`
			`site = brozzler.Site(None, {})`
			`page = brozzler.Page(None, {`
			`'url':'http://localhost:%s/site6/' % httpd.server_port})`
			`with brozzler.Browser(chrome_exe=chrome_exe) as browser:`
			`worker.brozzle_page(browser, site, page)`
			`assert page.videos`
			`assert len(page.videos) == 2`
			`assert page.videos[0] == {`
			`'blame': 'youtube-dl',`
			`'response_code': 200,`
			`'content-length': 383631,`
			`'content-type': 'video/mp4',`
			`'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,`
			`}`
			`assert page.videos[1] == {`
			`'blame': 'browser',`
			`# 'response_code': 206,`
			`# 'content-range': 'bytes 0-229454/229455',`
			`'response_code': 200,`
			`'content-length': 229455,`
			`'content-type': 'video/webm',`
			`'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,`
			`}`