mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
chore: fixing more tests
This commit is contained in:
parent
b5ee8a9ea7
commit
e7e4225bf2
@ -22,7 +22,6 @@ import logging
|
||||
import threading
|
||||
from importlib.metadata import version as _version
|
||||
|
||||
import doublethink
|
||||
import structlog
|
||||
import urlcanon
|
||||
|
||||
@ -399,7 +398,7 @@ def suggest_default_chrome_exe():
|
||||
return "chromium-browser"
|
||||
|
||||
|
||||
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=doublethink.UTC)
|
||||
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
|
||||
|
||||
from brozzler.browser import Browser, BrowserPool, BrowsingException # noqa: E402
|
||||
from brozzler.robots import is_permitted_by_robots # noqa: E402
|
||||
|
@ -222,6 +222,7 @@ class Chrome:
|
||||
chrome_args.append("--ignore-certificate-errors")
|
||||
if proxy:
|
||||
chrome_args.append("--proxy-server=%s" % proxy)
|
||||
self.logger.info(f"Chrome launched with args {chrome_args} proxy is {proxy}")
|
||||
chrome_args.append("about:blank")
|
||||
self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
|
||||
# start_new_session - new process group so we can kill the whole group
|
||||
|
@ -200,30 +200,30 @@ def test_page_videos(httpd):
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
worker.brozzle_page(browser, site, page)
|
||||
assert page.videos
|
||||
assert len(page.videos) == 4
|
||||
assert len(page.videos) == 1
|
||||
# assert page.videos[0] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "response_code": 200,
|
||||
# "content-length": 383631,
|
||||
# "content-type": "video/mp4",
|
||||
# "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
|
||||
# }
|
||||
# assert page.videos[1] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "content-length": 92728,
|
||||
# "content-type": "video/webm",
|
||||
# "response_code": 200,
|
||||
# "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
|
||||
# % httpd.server_port,
|
||||
# }
|
||||
# assert page.videos[2] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "content-length": 101114,
|
||||
# "content-type": "video/webm",
|
||||
# "response_code": 200,
|
||||
# "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
|
||||
# }
|
||||
assert page.videos[0] == {
|
||||
"blame": "youtube-dl",
|
||||
"response_code": 200,
|
||||
"content-length": 383631,
|
||||
"content-type": "video/mp4",
|
||||
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[1] == {
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 92728,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
|
||||
% httpd.server_port,
|
||||
}
|
||||
assert page.videos[2] == {
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 101114,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[3] == {
|
||||
"blame": "browser",
|
||||
# 'response_code': 206,
|
||||
# 'content-range': 'bytes 0-229454/229455',
|
||||
@ -271,6 +271,8 @@ def test_proxy_down():
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.stop() # We're manually instantiating the browser without arguments,
|
||||
# so it is running without a proxy. Stop it first.
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker.brozzle_page(browser, site, page)
|
||||
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
"""
|
||||
|
||||
import importlib.metadata
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import doublethink
|
||||
@ -26,6 +27,13 @@ import pytest
|
||||
import brozzler.cli
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
def console_scripts():
|
||||
# We do a dict comprehension here because the select filters aren't
|
||||
# available until Python 3.10's importlib.
|
||||
@ -67,14 +75,18 @@ def test_run_command(capsys, cmd):
|
||||
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
out, err = proc.communicate()
|
||||
assert err == b""
|
||||
# Remove lines from syntax warning in imported library
|
||||
filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
|
||||
"re.compile" not in line]
|
||||
assert filtered_lines == []
|
||||
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
|
||||
|
||||
|
||||
def test_rethinkdb_up():
|
||||
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # build-in db
|
||||
def test_rethinkdb_up(rethinker):
|
||||
"""Check that rethinkdb is up and running."""
|
||||
# check that rethinkdb is listening and looks sane
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
rr = rethinker
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
|
@ -40,10 +40,10 @@ brozzler.cli.configure_logging(args)
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(",")) # built-in db
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)
|
||||
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # build-in db
|
||||
def test_rethinkdb_up(rethinker):
|
||||
"""Checks that rethinkdb is listening and looks sane."""
|
||||
rr = rethinker
|
||||
@ -269,7 +269,9 @@ def test_resume_job(rethinker):
|
||||
site1 = list(frontier.job_sites(job.id))[0]
|
||||
site2 = list(frontier.job_sites(job.id))[1]
|
||||
|
||||
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
job.save()
|
||||
|
||||
# should raise a CrawlStopped
|
||||
@ -317,7 +319,9 @@ def test_resume_job(rethinker):
|
||||
assert site2.starts_and_stops[1]["stop"] is None
|
||||
|
||||
# simulate a site stop request
|
||||
site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
site1.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
site1.save()
|
||||
|
||||
# should not raise a CrawlStopped
|
||||
@ -849,7 +853,9 @@ def test_honor_stop_request(rethinker):
|
||||
frontier.honor_stop_request(site)
|
||||
|
||||
# set job.stop_requested
|
||||
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
job.save()
|
||||
with pytest.raises(brozzler.CrawlStopped):
|
||||
frontier.honor_stop_request(site)
|
||||
@ -956,6 +962,18 @@ def test_max_claimed_sites(rethinker):
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
job = brozzler.new_job(frontier, job_conf)
|
||||
claimed_sites = frontier.claim_sites(2)
|
||||
assert len(claimed_sites) == 2
|
||||
claimed_sites = frontier.claim_sites(1)
|
||||
assert len(claimed_sites) == 1
|
||||
with pytest.raises(brozzler.NothingToClaim):
|
||||
claimed_sites = frontier.claim_sites(1)
|
||||
|
||||
# clean slate for the next one
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_choose_warcprox(rethinker):
|
||||
rr = rethinker
|
||||
|
@ -262,6 +262,21 @@ blocks:
|
||||
# Some changes to the brozzler ydl interface not represented in this test
|
||||
# https://github.com/internetarchive/brozzler/issues/330
|
||||
@pytest.mark.xfail
|
||||
def test_ydl_proxy_down():
|
||||
sock = socket.socket()
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(
|
||||
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
|
||||
)
|
||||
page = brozzler.Page(None, {"url": "http://example.com/"})
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
def test_proxy_down():
|
||||
"""
|
||||
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||
@ -288,11 +303,6 @@ def test_proxy_down():
|
||||
site, "http://example.com/", proxy=not_listening_proxy
|
||||
)
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
# raw fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._fetch_url(site, page=page)
|
||||
@ -557,7 +567,7 @@ def test_limit_failures():
|
||||
site = mock.Mock()
|
||||
site.status = "ACTIVE"
|
||||
site.active_brozzling_time = 0
|
||||
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
|
||||
site.starts_and_stops = [{"start": datetime.datetime.now(datetime.timezone.utc)}]
|
||||
|
||||
rr = mock.Mock()
|
||||
rr.servers = [mock.Mock()]
|
||||
|
Loading…
x
Reference in New Issue
Block a user