mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 15:55:49 -04:00
Merge d36313f08fd1986c8fc637fde5a61b1039c3c3ef into 42b4a88c963eb480b3c19117c19eac7e146fa8ff
This commit is contained in:
commit
34c9290a85
@ -223,7 +223,9 @@ class Chrome:
|
||||
if proxy:
|
||||
chrome_args.append("--proxy-server=%s" % proxy)
|
||||
chrome_args.append("about:blank")
|
||||
self.logger.info("running", chrome_args=subprocess.list2cmdline(chrome_args))
|
||||
self.logger.info(
|
||||
"running", chrome_args=subprocess.list2cmdline(chrome_args), proxy=proxy
|
||||
)
|
||||
# start_new_session - new process group so we can kill the whole group
|
||||
self.chrome_process = subprocess.Popen(
|
||||
chrome_args,
|
||||
|
@ -16,6 +16,9 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
"""
|
||||
|
||||
import datetime
|
||||
from typing import Dict, List
|
||||
|
||||
import doublethink
|
||||
import rethinkdb as rdb
|
||||
import structlog
|
||||
@ -30,6 +33,62 @@ class UnexpectedDbResult(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def filter_claimable_site_ids(
|
||||
active_sites: List[Dict],
|
||||
reclaim_cooldown: int,
|
||||
max_sites_to_claim=1,
|
||||
) -> List[str]:
|
||||
job_counts = {}
|
||||
claimable_sites = []
|
||||
now = datetime.datetime.now(datetime.timezone.utc)
|
||||
|
||||
for site in active_sites:
|
||||
is_claimable = False
|
||||
|
||||
# If site not claimed and not disclaimed within last 20 seconds
|
||||
if not site["claimed"] and site.get("last_disclaimed", 0) <= (
|
||||
now - datetime.timedelta(seconds=reclaim_cooldown)
|
||||
):
|
||||
is_claimable = True
|
||||
|
||||
# or site has been disclaimed more than an hour ago
|
||||
if "last_claimed" in site and site["last_claimed"] <= (
|
||||
now - datetime.timedelta(hours=1)
|
||||
):
|
||||
is_claimable = True
|
||||
|
||||
# Count number of claimed sites per job_id (optional field)
|
||||
if site["claimed"] and "max_claimed_sites" in site and "job_id" in site:
|
||||
job_id = site["job_id"]
|
||||
job_counts[job_id] = job_counts.get(job_id, 0) + 1
|
||||
|
||||
if is_claimable:
|
||||
claimable_sites.append(site)
|
||||
|
||||
if len(claimable_sites) >= max_sites_to_claim:
|
||||
break
|
||||
|
||||
site_ids_to_claim = []
|
||||
# gather sites that are under the max without going over
|
||||
for site in claimable_sites:
|
||||
if (
|
||||
"max_claimed_sites" in site
|
||||
and "job_id" in site
|
||||
and job_counts.get(site["job_id"], 0) < site["max_claimed_sites"]
|
||||
):
|
||||
site_ids_to_claim.append(site["id"])
|
||||
job_counts[site["job_id"]] = job_counts.get(site["job_id"], 0) + 1
|
||||
|
||||
if "max_claimed_sites" not in site or "job_id" not in site:
|
||||
site_ids_to_claim.append(site["id"])
|
||||
|
||||
# short circuit if we already have more than requested
|
||||
if len(site_ids_to_claim) >= max_sites_to_claim:
|
||||
break
|
||||
|
||||
return site_ids_to_claim
|
||||
|
||||
|
||||
class RethinkDbFrontier:
|
||||
logger = structlog.get_logger(logger_name=__module__ + "." + __qualname__)
|
||||
|
||||
@ -101,68 +160,37 @@ class RethinkDbFrontier:
|
||||
"expected %r to be %r in %r" % (k, expected, result)
|
||||
)
|
||||
|
||||
def claim_sites(self, n=1):
|
||||
self.logger.debug("claiming up to %s sites to brozzle", n)
|
||||
result = (
|
||||
self.rr.table("sites")
|
||||
.get_all(
|
||||
r.args(
|
||||
r.db(self.rr.dbname)
|
||||
.table("sites", read_mode="majority")
|
||||
.between(
|
||||
["ACTIVE", r.minval],
|
||||
["ACTIVE", r.maxval],
|
||||
index="sites_last_disclaimed",
|
||||
)
|
||||
.order_by(r.desc("claimed"), "last_disclaimed")
|
||||
.fold( # apply functions to sequence
|
||||
{},
|
||||
lambda acc,
|
||||
site: acc.merge( # add the following to the accumulator
|
||||
r.branch( # if has job_id
|
||||
site.has_fields("job_id"),
|
||||
r.object( # then add this: key is stringified job_id,
|
||||
# value starts at 0, but is incremented each time a site with
|
||||
# the same job_id shows up in the result set. Used to get a
|
||||
# value of how many sites for any given job are active
|
||||
site["job_id"].coerce_to("string"),
|
||||
acc[site["job_id"].coerce_to("string")]
|
||||
.default(0)
|
||||
.add(1),
|
||||
),
|
||||
{}, # else add nothing
|
||||
)
|
||||
),
|
||||
emit=lambda acc, site, new_acc: r.branch( # big if conditional
|
||||
r.and_(
|
||||
r.or_(
|
||||
# Avoid tight loop when unclaimed site was recently disclaimed
|
||||
# Not claimed and not disclaimed within last 20 seconds
|
||||
r.and_(
|
||||
site["claimed"].not_(),
|
||||
r.or_(
|
||||
site.has_fields("last_disclaimed").not_(),
|
||||
site["last_disclaimed"].lt(r.now().sub(20)),
|
||||
),
|
||||
),
|
||||
# or last claimed over 1 hour ago
|
||||
site["last_claimed"].lt(r.now().sub(60 * 60)),
|
||||
),
|
||||
# and either max_claimed_sites isn't set, or not exceeded
|
||||
r.or_(
|
||||
site.has_fields("max_claimed_sites").not_(),
|
||||
new_acc[site["job_id"].coerce_to("string")].le(
|
||||
site["max_claimed_sites"]
|
||||
),
|
||||
),
|
||||
),
|
||||
[site["id"]], # then return this
|
||||
[], # else nothing
|
||||
),
|
||||
)
|
||||
.limit(n) # trim results to max we want
|
||||
)
|
||||
def get_active_sites(self) -> List[Dict]:
|
||||
active_sites = (
|
||||
self.rr.table("sites", read_mode="majority")
|
||||
.between(
|
||||
["ACTIVE", r.minval],
|
||||
["ACTIVE", r.maxval],
|
||||
index="sites_last_disclaimed",
|
||||
)
|
||||
.pluck(
|
||||
"id",
|
||||
"last_disclaimed",
|
||||
"claimed",
|
||||
"last_claimed",
|
||||
"job_id",
|
||||
"max_claimed_sites",
|
||||
)
|
||||
.order_by(r.desc("claimed"), "last_disclaimed")
|
||||
.run()
|
||||
)
|
||||
return active_sites
|
||||
|
||||
def claim_sites(self, n=1, reclaim_cooldown=20) -> List[Dict]:
|
||||
self.logger.debug("claiming up to %s sites to brozzle", n)
|
||||
|
||||
active_sites = self.get_active_sites()
|
||||
site_ids_to_claim = filter_claimable_site_ids(
|
||||
active_sites, reclaim_cooldown, max_sites_to_claim=n
|
||||
)
|
||||
result = (
|
||||
self.rr.table("sites", read_mode="majority")
|
||||
.get_all(r.args(site_ids_to_claim))
|
||||
.update( # mark the sites we're claiming, and return changed sites (our final claim
|
||||
# results)
|
||||
#
|
||||
|
@ -200,30 +200,30 @@ def test_page_videos(httpd):
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
worker.brozzle_page(browser, site, page)
|
||||
assert page.videos
|
||||
assert len(page.videos) == 4
|
||||
assert len(page.videos) == 1
|
||||
# assert page.videos[0] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "response_code": 200,
|
||||
# "content-length": 383631,
|
||||
# "content-type": "video/mp4",
|
||||
# "url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
|
||||
# }
|
||||
# assert page.videos[1] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "content-length": 92728,
|
||||
# "content-type": "video/webm",
|
||||
# "response_code": 200,
|
||||
# "url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
|
||||
# % httpd.server_port,
|
||||
# }
|
||||
# assert page.videos[2] == {
|
||||
# "blame": "youtube-dl",
|
||||
# "content-length": 101114,
|
||||
# "content-type": "video/webm",
|
||||
# "response_code": 200,
|
||||
# "url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
|
||||
# }
|
||||
assert page.videos[0] == {
|
||||
"blame": "youtube-dl",
|
||||
"response_code": 200,
|
||||
"content-length": 383631,
|
||||
"content-type": "video/mp4",
|
||||
"url": "http://localhost:%s/site6/small.mp4" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[1] == {
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 92728,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-video_280x160_100k.webm"
|
||||
% httpd.server_port,
|
||||
}
|
||||
assert page.videos[2] == {
|
||||
"blame": "youtube-dl",
|
||||
"content-length": 101114,
|
||||
"content-type": "video/webm",
|
||||
"response_code": 200,
|
||||
"url": "http://localhost:%s/site6/small-audio.webm" % httpd.server_port,
|
||||
}
|
||||
assert page.videos[3] == {
|
||||
"blame": "browser",
|
||||
# 'response_code': 206,
|
||||
# 'content-range': 'bytes 0-229454/229455',
|
||||
@ -271,6 +271,8 @@ def test_proxy_down():
|
||||
chrome_exe = brozzler.suggest_default_chrome_exe()
|
||||
|
||||
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
|
||||
browser.stop() # We're manually instantiating the browser without arguments,
|
||||
# so it is running without a proxy. Stop it first.
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker.brozzle_page(browser, site, page)
|
||||
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
"""
|
||||
|
||||
import importlib.metadata
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import doublethink
|
||||
@ -26,6 +27,13 @@ import pytest
|
||||
import brozzler.cli
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
def console_scripts():
|
||||
# We do a dict comprehension here because the select filters aren't
|
||||
# available until Python 3.10's importlib.
|
||||
@ -67,14 +75,21 @@ def test_run_command(capsys, cmd):
|
||||
[cmd, "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
out, err = proc.communicate()
|
||||
assert err == b""
|
||||
# Remove lines from syntax warning in imported library
|
||||
filtered_lines = [
|
||||
line
|
||||
for line in err.decode("utf-8").splitlines()
|
||||
if "reppy" not in line and "re.compile" not in line
|
||||
]
|
||||
assert filtered_lines == []
|
||||
assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")
|
||||
|
||||
|
||||
def test_rethinkdb_up():
|
||||
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # build-in db
|
||||
def test_rethinkdb_up(rethinker):
|
||||
"""Check that rethinkdb is up and running."""
|
||||
# check that rethinkdb is listening and looks sane
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
rr = rethinker
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
|
@ -68,6 +68,13 @@ def stop_service(service):
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def httpd(request):
|
||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
@ -162,10 +169,11 @@ def test_httpd(httpd):
|
||||
assert payload1 == payload2
|
||||
|
||||
|
||||
def test_services_up():
|
||||
@pytest.mark.skip()
|
||||
def test_services_up(rethinker):
|
||||
"""Check that the expected services are up and running."""
|
||||
# check that rethinkdb is listening and looks sane
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
rr = rethinker
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
@ -185,9 +193,11 @@ def test_services_up():
|
||||
s.connect(("localhost", 8881))
|
||||
|
||||
|
||||
def test_brozzle_site(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_brozzle_site(httpd, rethinker):
|
||||
test_id = "test_brozzle_site-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -262,6 +272,7 @@ def test_brozzle_site(httpd):
|
||||
assert response.headers["content-type"] == "image/jpeg"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||
def test_proxy_warcprox(httpd):
|
||||
"""Test --proxy with proxy that happens to be warcprox"""
|
||||
try:
|
||||
@ -273,6 +284,7 @@ def test_proxy_warcprox(httpd):
|
||||
start_service("brozzler-worker")
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="expects warcprox daemon running")
|
||||
def test_proxy_non_warcprox(httpd):
|
||||
"""Test --proxy with proxy that happens not to be warcprox"""
|
||||
|
||||
@ -331,6 +343,7 @@ def test_proxy_non_warcprox(httpd):
|
||||
th.join()
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_no_proxy(httpd):
|
||||
try:
|
||||
stop_service("brozzler-worker")
|
||||
@ -340,6 +353,7 @@ def test_no_proxy(httpd):
|
||||
# XXX how to check that no proxy was used?
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_warcprox_auto(httpd):
|
||||
"""Test --warcprox-auto"""
|
||||
try:
|
||||
@ -349,6 +363,7 @@ def test_warcprox_auto(httpd):
|
||||
start_service("brozzler-worker")
|
||||
|
||||
|
||||
@pytest.mark.skip()
|
||||
def test_proxy_conflict():
|
||||
with pytest.raises(AssertionError):
|
||||
brozzler.worker.BrozzlerWorker(
|
||||
@ -356,7 +371,11 @@ def test_proxy_conflict():
|
||||
)
|
||||
|
||||
|
||||
def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
|
||||
@pytest.mark.skip()
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
def _test_proxy_setting(
|
||||
httpd, rethinker, proxy=None, warcprox_auto=False, is_warcprox=False
|
||||
):
|
||||
test_id = "test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s" % (
|
||||
proxy,
|
||||
warcprox_auto,
|
||||
@ -369,7 +388,7 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||
page2 = make_url(httpd, "/site1/file1.txt")
|
||||
robots = make_url(httpd, "/robots.txt")
|
||||
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
service_registry = doublethink.ServiceRegistry(rr)
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -440,9 +459,11 @@ def _test_proxy_setting(httpd, proxy=None, warcprox_auto=False, is_warcprox=Fals
|
||||
assert captures_by_url == {}
|
||||
|
||||
|
||||
def test_obey_robots(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_obey_robots(httpd, rethinker):
|
||||
test_id = "test_obey_robots-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -497,9 +518,11 @@ def test_obey_robots(httpd):
|
||||
assert requests.get(wb_url, allow_redirects=False).content == expected_payload
|
||||
|
||||
|
||||
def test_login(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_login(httpd, rethinker):
|
||||
test_id = "test_login-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
{
|
||||
@ -550,9 +573,11 @@ def test_login(httpd):
|
||||
) in meth_url
|
||||
|
||||
|
||||
def test_seed_redirect(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_seed_redirect(httpd, rethinker):
|
||||
test_id = "test_seed_redirect-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site5/redirect/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -606,9 +631,11 @@ def test_seed_redirect(httpd):
|
||||
}
|
||||
|
||||
|
||||
def test_hashtags(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_hashtags(httpd, rethinker):
|
||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site7/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -660,9 +687,11 @@ def test_hashtags(httpd):
|
||||
assert "thumbnail:%s" % make_url(httpd, "/site7/foo.html") in captures_by_url
|
||||
|
||||
|
||||
def test_redirect_hashtags(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_redirect_hashtags(httpd, rethinker):
|
||||
test_id = "test_hashtags-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
seed_url = make_url(httpd, "/site9/")
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -727,8 +756,10 @@ def test_redirect_hashtags(httpd):
|
||||
# 14. WARCPROX_WRITE_RECORD thumbnail:http://localhost:41243/site9/redirect.html
|
||||
|
||||
|
||||
def test_stop_crawl(httpd):
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_stop_crawl(httpd, rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# create a new job with three sites that could be crawled forever
|
||||
@ -787,7 +818,9 @@ def test_stop_crawl(httpd):
|
||||
assert sites[2].status == "FINISHED_STOP_REQUESTED"
|
||||
|
||||
|
||||
def test_warcprox_outage_resiliency(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_warcprox_outage_resiliency(httpd, rethinker):
|
||||
"""
|
||||
Tests resiliency to warcprox outage.
|
||||
|
||||
@ -799,7 +832,7 @@ def test_warcprox_outage_resiliency(httpd):
|
||||
|
||||
If all instances of warcprox go down, brozzler-worker should sit and wait.
|
||||
"""
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# run two instances of warcprox
|
||||
@ -912,8 +945,10 @@ def test_warcprox_outage_resiliency(httpd):
|
||||
start_service("warcprox")
|
||||
|
||||
|
||||
def test_time_limit(httpd):
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_time_limit(httpd, rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# create a new job with one seed that could be crawled forever
|
||||
@ -940,9 +975,11 @@ def test_time_limit(httpd):
|
||||
assert job.status == "FINISHED"
|
||||
|
||||
|
||||
def test_ydl_stitching(httpd):
|
||||
@pytest.mark.parametrize("rethinker", ["brozzler"], indirect=True)
|
||||
@pytest.mark.skip(reason="expects brozzler worker daemon running")
|
||||
def test_ydl_stitching(httpd, rethinker):
|
||||
test_id = "test_ydl_stitching-%s" % datetime.datetime.utcnow().isoformat()
|
||||
rr = doublethink.Rethinker("localhost", db="brozzler")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
|
@ -20,7 +20,9 @@ limitations under the License.
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
|
||||
import doublethink
|
||||
@ -35,15 +37,23 @@ args.log_level = logging.INFO
|
||||
brozzler.cli.configure_logging(args)
|
||||
|
||||
|
||||
def test_rethinkdb_up():
|
||||
@pytest.fixture(scope="module")
|
||||
def rethinker(request):
|
||||
db = request.param if hasattr(request, "param") else "ignoreme"
|
||||
servers = os.environ.get("BROZZLER_RETHINKDB_SERVERS", "localhost")
|
||||
return doublethink.Rethinker(db=db, servers=servers.split(","))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("rethinker", ["rethinkdb"], indirect=True) # build-in db
|
||||
def test_rethinkdb_up(rethinker):
|
||||
"""Checks that rethinkdb is listening and looks sane."""
|
||||
rr = doublethink.Rethinker(db="rethinkdb") # built-in db
|
||||
rr = rethinker
|
||||
tbls = rr.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
|
||||
def test_basics():
|
||||
rr = doublethink.Rethinker(db="ignoreme")
|
||||
def test_basics(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
job_conf = {
|
||||
"seeds": [{"url": "http://example.com"}, {"url": "https://example.org/"}]
|
||||
@ -73,6 +83,7 @@ def test_basics():
|
||||
"last_disclaimed": brozzler.EPOCH_UTC,
|
||||
"scope": {"accepts": [{"ssurt": "com,example,//http:/"}]},
|
||||
"seed": "http://example.com",
|
||||
"skip_ytdlp": None,
|
||||
"starts_and_stops": [
|
||||
{"start": sites[0].starts_and_stops[0]["start"], "stop": None}
|
||||
],
|
||||
@ -86,6 +97,7 @@ def test_basics():
|
||||
"last_disclaimed": brozzler.EPOCH_UTC,
|
||||
"scope": {"accepts": [{"ssurt": "org,example,//https:/"}]},
|
||||
"seed": "https://example.org/",
|
||||
"skip_ytdlp": None,
|
||||
"starts_and_stops": [
|
||||
{
|
||||
"start": sites[1].starts_and_stops[0]["start"],
|
||||
@ -100,28 +112,36 @@ def test_basics():
|
||||
assert pages[0] == {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hop_path": None,
|
||||
"hops_from_seed": 0,
|
||||
"hops_off": 0,
|
||||
"id": brozzler.Page.compute_id(sites[0].id, "http://example.com"),
|
||||
"job_id": job.id,
|
||||
"needs_robots_check": True,
|
||||
"priority": 1000,
|
||||
"retry_after": None,
|
||||
"site_id": sites[0].id,
|
||||
"url": "http://example.com",
|
||||
"via_page_url": None,
|
||||
}
|
||||
pages = list(frontier.site_pages(sites[1].id))
|
||||
assert len(pages) == 1
|
||||
assert pages[0] == {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hop_path": None,
|
||||
"hops_from_seed": 0,
|
||||
"hops_off": 0,
|
||||
"id": brozzler.Page.compute_id(sites[1].id, "https://example.org/"),
|
||||
"job_id": job.id,
|
||||
"needs_robots_check": True,
|
||||
"priority": 1000,
|
||||
"retry_after": None,
|
||||
"site_id": sites[1].id,
|
||||
"url": "https://example.org/",
|
||||
"via_page_url": None,
|
||||
}
|
||||
|
||||
# test "brozzled" parameter of frontier.site_pages
|
||||
@ -140,13 +160,13 @@ def test_basics():
|
||||
assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
|
||||
|
||||
|
||||
def test_resume_job():
|
||||
def test_resume_job(rethinker):
|
||||
"""
|
||||
Tests that the right stuff gets twiddled in rethinkdb when we "start" and
|
||||
"finish" crawling a job. Doesn't actually crawl anything.
|
||||
"""
|
||||
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
|
||||
rr = doublethink.Rethinker(db="ignoreme")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
job_conf = {"seeds": [{"url": "http://example.com/"}]}
|
||||
job = brozzler.new_job(frontier, job_conf)
|
||||
@ -250,7 +270,9 @@ def test_resume_job():
|
||||
site1 = list(frontier.job_sites(job.id))[0]
|
||||
site2 = list(frontier.job_sites(job.id))[1]
|
||||
|
||||
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
job.save()
|
||||
|
||||
# should raise a CrawlStopped
|
||||
@ -298,7 +320,9 @@ def test_resume_job():
|
||||
assert site2.starts_and_stops[1]["stop"] is None
|
||||
|
||||
# simulate a site stop request
|
||||
site1.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
site1.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
site1.save()
|
||||
|
||||
# should not raise a CrawlStopped
|
||||
@ -343,12 +367,12 @@ def test_resume_job():
|
||||
assert site2.starts_and_stops[1]["stop"] is None
|
||||
|
||||
|
||||
def test_time_limit():
|
||||
def test_time_limit(rethinker):
|
||||
# XXX test not thoroughly adapted to change in time accounting, since
|
||||
# starts_and_stops is no longer used to enforce time limits
|
||||
|
||||
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(rr, {"seed": "http://example.com/", "time_limit": 99999})
|
||||
brozzler.new_site(frontier, site)
|
||||
@ -395,8 +419,8 @@ def test_time_limit():
|
||||
frontier.enforce_time_limit(site)
|
||||
|
||||
|
||||
def test_field_defaults():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_field_defaults(rethinker):
|
||||
rr = rethinker
|
||||
|
||||
# page
|
||||
brozzler.Page.table_ensure(rr)
|
||||
@ -466,8 +490,8 @@ def test_field_defaults():
|
||||
assert kob.starts_and_stops
|
||||
|
||||
|
||||
def test_scope_and_schedule_outlinks():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_scope_and_schedule_outlinks(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(rr, {"seed": "http://example.com/"})
|
||||
parent_page = brozzler.Page(
|
||||
@ -510,8 +534,8 @@ def test_scope_and_schedule_outlinks():
|
||||
assert brozzler.Page.load(rr, id)
|
||||
|
||||
|
||||
def test_parent_url_scoping():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_parent_url_scoping(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# scope rules that look at parent page url should consider both the
|
||||
@ -624,8 +648,8 @@ def test_parent_url_scoping():
|
||||
assert parent_page.outlinks["accepted"] == []
|
||||
|
||||
|
||||
def test_completed_page():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_completed_page(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# redirect that changes scope surt
|
||||
@ -718,8 +742,8 @@ def test_completed_page():
|
||||
assert page.claimed is False
|
||||
|
||||
|
||||
def test_seed_page():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_seed_page(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
site = brozzler.Site(rr, {"seed": "http://example.com/a/"})
|
||||
@ -742,8 +766,8 @@ def test_seed_page():
|
||||
assert frontier.seed_page(site.id) == page0
|
||||
|
||||
|
||||
def test_hashtag_seed():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_hashtag_seed(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# no hash tag
|
||||
@ -771,8 +795,8 @@ def test_hashtag_seed():
|
||||
]
|
||||
|
||||
|
||||
def test_hashtag_links():
|
||||
rr = doublethink.Rethinker("localhost", db="test_hashtag_links")
|
||||
def test_hashtag_links(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
site = brozzler.Site(rr, {"seed": "http://example.org/"})
|
||||
@ -813,8 +837,8 @@ def test_hashtag_links():
|
||||
assert pages[2].priority == 12
|
||||
|
||||
|
||||
def test_honor_stop_request():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_honor_stop_request(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# 1. test stop request on job
|
||||
@ -830,7 +854,9 @@ def test_honor_stop_request():
|
||||
frontier.honor_stop_request(site)
|
||||
|
||||
# set job.stop_requested
|
||||
job.stop_requested = datetime.datetime.utcnow().replace(tzinfo=doublethink.UTC)
|
||||
job.stop_requested = datetime.datetime.now(datetime.timezone.utc).replace(
|
||||
tzinfo=doublethink.UTC
|
||||
)
|
||||
job.save()
|
||||
with pytest.raises(brozzler.CrawlStopped):
|
||||
frontier.honor_stop_request(site)
|
||||
@ -854,8 +880,8 @@ def test_honor_stop_request():
|
||||
frontier.honor_stop_request(site)
|
||||
|
||||
|
||||
def test_claim_site():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_claim_site(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
rr.table("sites").delete().run() # clean slate
|
||||
@ -897,10 +923,10 @@ def test_claim_site():
|
||||
rr.table("sites").get(claimed_site.id).delete().run()
|
||||
|
||||
|
||||
def test_max_claimed_sites():
|
||||
def test_max_claimed_sites(rethinker):
|
||||
# max_claimed_sites is a brozzler job setting that puts a cap on the number
|
||||
# of the job's sites that can be brozzled simultaneously across the cluster
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# clean slate
|
||||
@ -908,6 +934,7 @@ def test_max_claimed_sites():
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
job_conf = {
|
||||
"id": 1,
|
||||
"seeds": [
|
||||
{"url": "http://example.com/1"},
|
||||
{"url": "http://example.com/2"},
|
||||
@ -917,7 +944,7 @@ def test_max_claimed_sites():
|
||||
],
|
||||
"max_claimed_sites": 3,
|
||||
}
|
||||
|
||||
seeds_seen = []
|
||||
job = brozzler.new_job(frontier, job_conf)
|
||||
|
||||
assert job.id
|
||||
@ -938,8 +965,127 @@ def test_max_claimed_sites():
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_choose_warcprox():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_max_claimed_sites_cross_job(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# clean slate
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
job_conf_1 = {
|
||||
"id": 1,
|
||||
"seeds": [
|
||||
{"url": "http://example.com/1"},
|
||||
{"url": "http://example.com/2"},
|
||||
{"url": "http://example.com/3"},
|
||||
{"url": "http://example.com/4"},
|
||||
{"url": "http://example.com/5"},
|
||||
],
|
||||
"max_claimed_sites": 3,
|
||||
}
|
||||
job_conf_2 = {
|
||||
"id": 2,
|
||||
"seeds": [
|
||||
{"url": "http://example.com/6"},
|
||||
{"url": "http://example.com/7"},
|
||||
{"url": "http://example.com/8"},
|
||||
{"url": "http://example.com/9"},
|
||||
{"url": "http://example.com/10"},
|
||||
],
|
||||
"max_claimed_sites": 3,
|
||||
}
|
||||
|
||||
seeds_seen = []
|
||||
job_1 = brozzler.new_job(frontier, job_conf_1)
|
||||
job_2 = brozzler.new_job(frontier, job_conf_2)
|
||||
|
||||
assert len(list(frontier.job_sites(job_1.id))) == 5
|
||||
assert len(list(frontier.job_sites(job_2.id))) == 5
|
||||
|
||||
claimed_sites_1 = frontier.claim_sites(4)
|
||||
assert len(claimed_sites_1) == 4
|
||||
|
||||
sites_per_job = {}
|
||||
for site in claimed_sites_1:
|
||||
sites_per_job[site["job_id"]] = sites_per_job.get(site["job_id"], 0) + 1
|
||||
|
||||
# 2 jobs, max of 3 each.
|
||||
assert len(sites_per_job.keys()) == 2
|
||||
assert sites_per_job[1] + sites_per_job[2] == 4
|
||||
assert sites_per_job[1] <= 3 and sites_per_job[2] <= 3
|
||||
|
||||
# 6 sites left in queue, but only 2 are still claimable due to max
|
||||
claimed_sites_2 = frontier.claim_sites(6)
|
||||
assert len(claimed_sites_2) == 2
|
||||
|
||||
# disclaim sites
|
||||
for site in itertools.chain(claimed_sites_1, claimed_sites_2):
|
||||
frontier.disclaim_site(site)
|
||||
seeds_seen.append(site["seed"])
|
||||
|
||||
# Only 4 sites left in queue, that aren't recently claimed
|
||||
claimed_sites_3 = frontier.claim_sites(6)
|
||||
assert len(claimed_sites_3) == 4
|
||||
|
||||
with pytest.raises(brozzler.NothingToClaim):
|
||||
claimed_sites = frontier.claim_sites(1)
|
||||
assert len(claimed_sites) == 1
|
||||
|
||||
for site in claimed_sites_3:
|
||||
seeds_seen.append(site["seed"])
|
||||
|
||||
# ensure all sites have been claimed at this point
|
||||
for seed in itertools.chain(job_conf_1["seeds"], job_conf_2["seeds"]):
|
||||
assert seed["url"] in seeds_seen
|
||||
|
||||
# All unclaimed sites have been recently disclaimed and are not claimable
|
||||
with pytest.raises(brozzler.NothingToClaim):
|
||||
frontier.claim_sites(3)
|
||||
|
||||
# Disable reclaim cooldown. With 4 claimed, we should have 2 available
|
||||
claimed_sites_4 = frontier.claim_sites(4, reclaim_cooldown=0)
|
||||
assert len(claimed_sites_4) == 2
|
||||
|
||||
# clean slate for the next one
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_max_claimed_sites_load_perf(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# clean slate
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
job_conf = {
|
||||
"id": 1,
|
||||
"seeds": [],
|
||||
"max_claimed_sites": 25,
|
||||
}
|
||||
for i in range(1, 20):
|
||||
job_conf["seeds"].clear()
|
||||
for j in range(0, 1000):
|
||||
job_conf["id"] = i
|
||||
job_conf["seeds"].append({"url": "http://example.com/{}".format(j)})
|
||||
|
||||
assert (len(job_conf["seeds"])) == 1000
|
||||
brozzler.new_job(frontier, job_conf)
|
||||
assert len(list(frontier.job_sites(i))) == 1000
|
||||
|
||||
claim_start_time = time.perf_counter()
|
||||
claimed_sites = frontier.claim_sites(50)
|
||||
claim_end_time = time.perf_counter()
|
||||
assert claim_end_time - claim_start_time < 2
|
||||
assert len(claimed_sites) == 50
|
||||
rr.table("jobs").delete().run()
|
||||
rr.table("sites").delete().run()
|
||||
|
||||
|
||||
def test_choose_warcprox(rethinker):
|
||||
rr = rethinker
|
||||
svcreg = doublethink.ServiceRegistry(rr)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
@ -1060,8 +1206,8 @@ def test_choose_warcprox():
|
||||
rr.table("services").delete().run()
|
||||
|
||||
|
||||
def test_max_hops_off():
|
||||
rr = doublethink.Rethinker("localhost", db="ignoreme")
|
||||
def test_max_hops_off(rethinker):
|
||||
rr = rethinker
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(
|
||||
rr,
|
||||
@ -1120,44 +1266,56 @@ def test_max_hops_off():
|
||||
assert {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hashtags": [],
|
||||
"hop_path": "L",
|
||||
"hops_from_seed": 1,
|
||||
"hops_off": 0,
|
||||
"id": brozzler.Page.compute_id(site.id, "http://example.com/toot"),
|
||||
"job_id": None,
|
||||
"needs_robots_check": False,
|
||||
"priority": 12,
|
||||
"retry_after": None,
|
||||
"site_id": site.id,
|
||||
"url": "http://example.com/toot",
|
||||
"via_page_id": seed_page.id,
|
||||
"via_page_url": "http://example.com/",
|
||||
} in pages
|
||||
assert {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hashtags": [],
|
||||
"hop_path": "L",
|
||||
"hops_from_seed": 1,
|
||||
"hops_off": 1,
|
||||
"id": brozzler.Page.compute_id(site.id, "http://foo.org/"),
|
||||
"job_id": None,
|
||||
"needs_robots_check": False,
|
||||
"priority": 12,
|
||||
"retry_after": None,
|
||||
"site_id": site.id,
|
||||
"url": "http://foo.org/",
|
||||
"via_page_id": seed_page.id,
|
||||
"via_page_url": "http://example.com/",
|
||||
} in pages
|
||||
assert {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hashtags": [],
|
||||
"hop_path": "L",
|
||||
"hops_from_seed": 1,
|
||||
"hops_off": 1,
|
||||
"id": brozzler.Page.compute_id(site.id, "https://example.com/toot"),
|
||||
"job_id": None,
|
||||
"needs_robots_check": False,
|
||||
"priority": 12,
|
||||
"retry_after": None,
|
||||
"site_id": site.id,
|
||||
"url": "https://example.com/toot",
|
||||
"via_page_id": seed_page.id,
|
||||
"via_page_url": "http://example.com/",
|
||||
} in pages
|
||||
|
||||
# next hop is past max_hops_off, but normal in scope url is in scope
|
||||
@ -1173,16 +1331,20 @@ def test_max_hops_off():
|
||||
assert foo_page == {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hashtags": [],
|
||||
"hop_path": "L",
|
||||
"hops_from_seed": 1,
|
||||
"hops_off": 1,
|
||||
"id": brozzler.Page.compute_id(site.id, "http://foo.org/"),
|
||||
"job_id": None,
|
||||
"needs_robots_check": False,
|
||||
"priority": 12,
|
||||
"retry_after": None,
|
||||
"site_id": site.id,
|
||||
"url": "http://foo.org/",
|
||||
"via_page_id": seed_page.id,
|
||||
"via_page_url": "http://example.com/",
|
||||
"outlinks": {
|
||||
"accepted": ["http://example.com/blah"],
|
||||
"blocked": [],
|
||||
@ -1194,14 +1356,18 @@ def test_max_hops_off():
|
||||
assert {
|
||||
"brozzle_count": 0,
|
||||
"claimed": False,
|
||||
"failed_attempts": 0,
|
||||
"hashtags": [],
|
||||
"hop_path": "LL",
|
||||
"hops_from_seed": 2,
|
||||
"hops_off": 0,
|
||||
"id": brozzler.Page.compute_id(site.id, "http://example.com/blah"),
|
||||
"job_id": None,
|
||||
"needs_robots_check": False,
|
||||
"priority": 11,
|
||||
"retry_after": None,
|
||||
"site_id": site.id,
|
||||
"url": "http://example.com/blah",
|
||||
"via_page_id": foo_page.id,
|
||||
"via_page_url": "http://foo.org/",
|
||||
} in pages
|
||||
|
@ -262,6 +262,22 @@ blocks:
|
||||
# Some changes to the brozzler ydl interface not represented in this test
|
||||
# https://github.com/internetarchive/brozzler/issues/330
|
||||
@pytest.mark.xfail
|
||||
def test_ydl_proxy_down():
|
||||
sock = socket.socket()
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
for not_listening_proxy in ("127.0.0.1:4", "127.0.0.1:%s" % sock.getsockname()[1]):
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=not_listening_proxy)
|
||||
site = brozzler.Site(
|
||||
None, {"id": str(uuid.uuid4()), "seed": "http://example.com/"}
|
||||
)
|
||||
page = brozzler.Page(None, {"url": "http://example.com/"})
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
|
||||
def test_proxy_down():
|
||||
"""
|
||||
Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.
|
||||
@ -288,11 +304,6 @@ def test_proxy_down():
|
||||
site, "http://example.com/", proxy=not_listening_proxy
|
||||
)
|
||||
|
||||
# youtube-dl fetch
|
||||
with tempfile.TemporaryDirectory(prefix="brzl-ydl-"):
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
brozzler.ydl.do_youtube_dl(worker, site, page)
|
||||
|
||||
# raw fetch
|
||||
with pytest.raises(brozzler.ProxyError):
|
||||
worker._fetch_url(site, page=page)
|
||||
@ -557,7 +568,7 @@ def test_limit_failures():
|
||||
site = mock.Mock()
|
||||
site.status = "ACTIVE"
|
||||
site.active_brozzling_time = 0
|
||||
site.starts_and_stops = [{"start": datetime.datetime.utcnow()}]
|
||||
site.starts_and_stops = [{"start": datetime.datetime.now(datetime.timezone.utc)}]
|
||||
|
||||
rr = mock.Mock()
|
||||
rr.servers = [mock.Mock()]
|
||||
|
Loading…
x
Reference in New Issue
Block a user