chore: fixing more tests

This commit is contained in:
Adam Miller 2025-03-27 17:12:17 -07:00
parent b5ee8a9ea7
commit e7e4225bf2
6 changed files with 81 additions and 39 deletions

View File

@ -22,7 +22,6 @@ import logging
import threading
from importlib.metadata import version as _version
import doublethink
import structlog
import urlcanon
@ -399,7 +398,7 @@ def suggest_default_chrome_exe():
return "chromium-browser"
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=doublethink.UTC)
EPOCH_UTC = datetime.datetime.fromtimestamp(0.0, tz=datetime.timezone.utc)
from brozzler.browser import Browser, BrowserPool, BrowsingException # noqa: E402
from brozzler.robots import is_permitted_by_robots # noqa: E402

View File

@ -222,6 +222,7 @@ class Chrome:
chrome_args.append("--ignore-certificate-errors")
if proxy:
chrome_args.append("--proxy-server=%s" % proxy)
self.logger.info(f"Chrome launched with args {chrome_args} proxy is {proxy}")
chrome_args.append("about:blank")
self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
# start_new_session - new process group so we can kill the whole group

View File

@ -200,30 +200,30 @@ def test_page_videos(httpd):
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
worker.brozzle_page(browser, site, page)
assert page.videos
assert len(page.videos) == 4
assert len(page.videos) == 1
# assert page.videos[0] == {
# "blame": "youtube-dl",
# "response_code": 200,
# "content-length": 383631,
# "content-type": "video/mp4",
# "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
# }
# assert page.videos[1] == {
# "blame": "youtube-dl",
# "content-length": 92728,
# "content-type": "video/webm",
# "response_code": 200,
# "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
# % httpd.server_port,
# }
# assert page.videos[2] == {
# "blame": "youtube-dl",
# "content-length": 101114,
# "content-type": "video/webm",
# "response_code": 200,
# "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
# }
assert page.videos[0] == {
"blame": "youtube-dl",
"response_code": 200,
"content-length": 383631,
"content-type": "video/mp4",
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
}
assert page.videos[1] == {
"blame": "youtube-dl",
"content-length": 92728,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
% httpd.server_port,
}
assert page.videos[2] == {
"blame": "youtube-dl",
"content-length": 101114,
"content-type": "video/webm",
"response_code": 200,
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
}
assert page.videos[3] == {
"blame": "browser",
# 'response_code': 206,
# 'content-range': 'bytes 0-229454/229455',
@ -271,6 +271,8 @@ def test_proxy_down():
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.stop() # We're manually instantiating the browser without arguments,
# so it is running without a proxy. Stop it first.
with pytest.raises(brozzler.ProxyError):
worker.brozzle_page(browser, site, page)

View File

@ -18,6 +18,7 @@ limitations under the License.
"""
import importlib.metadata
import os
import subprocess
import doublethink
@ -26,6 +27,13 @@ import pytest
import brozzler.cli
@pytest.fixture(scope="module")
def rethinker(request):
    """
    Module-scoped fixture that builds a `doublethink.Rethinker`.

    The database name is taken from `request.param` when the fixture is
    parametrized (indirectly) and falls back to "ignoreme" otherwise. The
    server list comes from the comma-separated `BROZZLER_RETHINKDB_SERVERS`
    environment variable, defaulting to "localhost".
    """
    db_name = getattr(request, "param", "ignoreme")
    server_list = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost").split(",")
    return doublethink.Rethinker(db=db_name, servers=server_list)
def console_scripts():
# We do a dict comprehension here because the select filters aren't
# available until Python 3.10's importlib.
@ -67,14 +75,18 @@ def test_run_command(capsys, cmd):
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
out, err = proc.communicate()
assert err == b""
# Remove lines from syntax warning in imported library
filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
"re.compile" not in line]
assert filtered_lines == []
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
def test_rethinkdb_up():
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # built-in db
def test_rethinkdb_up(rethinker):
"""Check that rethinkdb is up and running."""
# check that rethinkdb is listening and looks sane
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
rr = rethinker
tbls = rr.table_list().run()
assert len(tbls) > 10

View File

@ -40,10 +40,10 @@ brozzler.cli.configure_logging(args)
def rethinker(request):
db = request.param if hasattr(request, "param") else "ignoreme"
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
return doublethink.Rethinker(db=db, servers=servers.split(",")) # built-in db
return doublethink.Rethinker(db=db, servers=servers.split(","))
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True)
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # built-in db
def test_rethinkdb_up(rethinker):
"""Checks that rethinkdb is listening and looks sane."""
rr = rethinker
@ -269,7 +269,9 @@ def test_resume_job(rethinker):
site1 = list(frontier.job_sites(job.id))[0]
site2 = list(frontier.job_sites(job.id))[1]
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
tzinfo=doublethink.UTC
)
job.save()
# should raise a CrawlStopped
@ -317,7 +319,9 @@ def test_resume_job(rethinker):
assert site2.starts_and_stops[1]["stop"] is None
# simulate a site stop request
site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
site1.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
tzinfo=doublethink.UTC
)
site1.save()
# should not raise a CrawlStopped
@ -849,7 +853,9 @@ def test_honor_stop_request(rethinker):
frontier.honor_stop_request(site)
# set job.stop_requested
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
tzinfo=doublethink.UTC
)
job.save()
with pytest.raises(brozzler.CrawlStopped):
frontier.honor_stop_request(site)
@ -956,6 +962,18 @@ def test_max_claimed_sites(rethinker):
rr.table("jobs").delete().run()
rr.table("sites").delete().run()
job = brozzler.new_job(frontier, job_conf)
claimed_sites = frontier.claim_sites(2)
assert len(claimed_sites) == 2
claimed_sites = frontier.claim_sites(1)
assert len(claimed_sites) == 1
with pytest.raises(brozzler.NothingToClaim):
claimed_sites = frontier.claim_sites(1)
# clean slate for the next one
rr.table("jobs").delete().run()
rr.table("sites").delete().run()
def test_choose_warcprox(rethinker):
rr = rethinker

View File

@ -262,6 +262,21 @@ blocks:
# Some changes to the brozzler ydl interface not represented in this test
# https://github.com/internetarchive/brozzler/issues/330
@pytest.mark.xfail
def test_ydl_proxy_down():
    """
    Test that the youtube-dl fetch raises `brozzler.ProxyError` when the
    configured proxy is not accepting connections.

    Two proxy addresses are tried: the fixed address 127.0.0.1:4, and a local
    port that this test binds but never calls listen() on.
    """
    # The socket is only bound, never put into listening state. Keep it open
    # for the duration of the loop so the port stays reserved, and close it
    # afterwards so the test does not leak a file descriptor.
    with socket.socket() as sock:
        sock.bind(("127.0.0.1", 0))
        bound_port = sock.getsockname()[1]

        for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % bound_port):
            worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
            site = brozzler.Site(
                None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
            )
            page = brozzler.Page(None, {"url": "http://example.com/"})

            # youtube-dl fetch
            with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
                with pytest.raises(brozzler.ProxyError):
                    brozzler.ydl.do_youtube_dl(worker, site, page)
def test_proxy_down():
"""
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
@ -288,11 +303,6 @@ def test_proxy_down():
site, "http://example.com/", proxy=not_listening_proxy
)
# youtube-dl fetch
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
with pytest.raises(brozzler.ProxyError):
brozzler.ydl.do_youtube_dl(worker, site, page)
# raw fetch
with pytest.raises(brozzler.ProxyError):
worker._fetch_url(site, page=page)
@ -557,7 +567,7 @@ def test_limit_failures():
site = mock.Mock()
site.status = "ACTIVE"
site.active_brozzling_time = 0
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
site.starts_and_stops = [{"start": datetime.datetime.now(datetime.timezone.utc)}]
rr = mock.Mock()
rr.servers = [mock.Mock()]