chore: Additional frontier testing and reformat

parent e7e4225bf2
commit addf73f865
@@ -17,7 +17,7 @@ limitations under the License.
 """

 import datetime
-from typing import List, Dict
+from typing import Dict, List

 import doublethink
 import rethinkdb as rdb
@@ -34,7 +34,9 @@ class UnexpectedDbResult(Exception):


 def filter_claimable_site_ids(
-    active_sites: List[Dict], max_sites_to_claim=1
+    active_sites: List[Dict],
+    reclaim_cooldown: int,
+    max_sites_to_claim=1,
 ) -> List[str]:
     job_counts = {}
     claimable_sites = []
@@ -45,7 +47,7 @@ def filter_claimable_site_ids(

         # If site not claimed and not disclaimed within last 20 seconds
         if not site["claimed"] and site.get("last_disclaimed", 0) <= (
-            now - datetime.timedelta(seconds=20)
+            now - datetime.timedelta(seconds=reclaim_cooldown)
         ):
             is_claimable = True

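For orientation, the eligibility check this hunk parameterizes can be sketched standalone. The helper below is hypothetical: the field names come from the diff, but the epoch-seconds arithmetic is an assumption (the real frontier compares datetime objects).

    import time

    def is_past_cooldown(site, reclaim_cooldown):
        # A site may be (re)claimed only if it is unclaimed and was last
        # disclaimed more than `reclaim_cooldown` seconds ago.
        last_disclaimed = site.get("last_disclaimed", 0)
        return not site["claimed"] and last_disclaimed <= time.time() - reclaim_cooldown

    # e.g. a site disclaimed 30 seconds ago clears a 20-second cooldown
    # but not a 60-second one:
    site = {"claimed": False, "last_disclaimed": time.time() - 30}
    assert is_past_cooldown(site, reclaim_cooldown=20)
    assert not is_past_cooldown(site, reclaim_cooldown=60)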
@@ -176,11 +178,13 @@ class RethinkDbFrontier:
         )
         return active_sites

-    def claim_sites(self, n=1) -> List[Dict]:
+    def claim_sites(self, n=1, reclaim_cooldown=20) -> List[Dict]:
         self.logger.debug("claiming up to %s sites to brozzle", n)

         active_sites = self.get_active_sites()
-        site_ids_to_claim = filter_claimable_site_ids(active_sites, n)
+        site_ids_to_claim = filter_claimable_site_ids(
+            active_sites, reclaim_cooldown, max_sites_to_claim=n
+        )
         result = (
             self.rr.table("sites", read_mode="majority")
             .get_all(r.args(site_ids_to_claim))
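With the new keyword argument, callers can tune or disable the reclaim cooldown. A hedged usage sketch; the connection setup is an assumption (the tests obtain `rr` from a fixture):

    import doublethink
    import brozzler

    # Assumed connection details.
    rr = doublethink.Rethinker(["localhost"], db="brozzler")
    frontier = brozzler.RethinkDbFrontier(rr)

    # Default behavior: sites disclaimed within the last 20 seconds are skipped.
    sites = frontier.claim_sites(n=3)

    # Cooldown disabled: freshly disclaimed sites are immediately reclaimable.
    sites = frontier.claim_sites(n=3, reclaim_cooldown=0)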
@@ -271,7 +271,7 @@ def test_proxy_down():
     chrome_exe = brozzler.suggest_default_chrome_exe()

     with brozzler.Browser(chrome_exe=chrome_exe) as browser:
-        browser.stop() # We're manually instantiating the browser without arguments,
+        browser.stop()  # We're manually instantiating the browser without arguments,
         # so it is running without a proxy. Stop it first.
         with pytest.raises(brozzler.ProxyError):
             worker.brozzle_page(browser, site, page)
@@ -76,8 +76,11 @@ def test_run_command(capsys, cmd):
     )
     out, err = proc.communicate()
     # Remove lines from syntax warning in imported library
-    filtered_lines = [line for line in err.decode("utf-8").splitlines() if "reppy" not in line and
-                      "re.compile" not in line]
+    filtered_lines = [
+        line
+        for line in err.decode("utf-8").splitlines()
+        if "reppy" not in line and "re.compile" not in line
+    ]
     assert filtered_lines == []
     assert out == ("brozzler %s - %s\n" % (brozzler.__version__, cmd)).encode("ascii")

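The reformatted comprehension screens out `SyntaxWarning` noise that the `reppy` dependency emits on import, so the empty-stderr assertion stays meaningful. A self-contained sketch of the same pattern; the subprocess command is illustrative, not the real brozzler CLI invocation:

    import subprocess
    import sys

    # Stand-in command that writes a known warning line to stderr.
    proc = subprocess.Popen(
        [sys.executable, "-c", "import sys; sys.stderr.write('noise from reppy\\n')"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    filtered_lines = [
        line
        for line in err.decode("utf-8").splitlines()
        if "reppy" not in line and "re.compile" not in line
    ]
    assert filtered_lines == []  # only the known warning lines were present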
@@ -20,6 +20,7 @@ limitations under the License.

 import argparse
 import datetime
+import itertools
 import logging
 import os
 import time
@@ -933,6 +934,7 @@ def test_max_claimed_sites(rethinker):
     rr.table("sites").delete().run()
+
     job_conf = {
         "id": 1,
         "seeds": [
             {"url": "http://example.com/1"},
             {"url": "http://example.com/2"},
@@ -942,7 +944,7 @@ def test_max_claimed_sites(rethinker):
         ],
         "max_claimed_sites": 3,
     }
-
+    seeds_seen = []
     job = brozzler.new_job(frontier, job_conf)

     assert job.id
@@ -962,13 +964,88 @@ def test_max_claimed_sites(rethinker):
     rr.table("jobs").delete().run()
     rr.table("sites").delete().run()

     job = brozzler.new_job(frontier, job_conf)
     claimed_sites = frontier.claim_sites(2)
     assert len(claimed_sites) == 2
     claimed_sites = frontier.claim_sites(1)
     assert len(claimed_sites) == 1

+
+def test_max_claimed_sites_cross_job(rethinker):
+    rr = rethinker
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # clean slate
+    rr.table("jobs").delete().run()
+    rr.table("sites").delete().run()
+
+    job_conf_1 = {
+        "id": 1,
+        "seeds": [
+            {"url": "http://example.com/1"},
+            {"url": "http://example.com/2"},
+            {"url": "http://example.com/3"},
+            {"url": "http://example.com/4"},
+            {"url": "http://example.com/5"},
+        ],
+        "max_claimed_sites": 3,
+    }
+    job_conf_2 = {
+        "id": 2,
+        "seeds": [
+            {"url": "http://example.com/6"},
+            {"url": "http://example.com/7"},
+            {"url": "http://example.com/8"},
+            {"url": "http://example.com/9"},
+            {"url": "http://example.com/10"},
+        ],
+        "max_claimed_sites": 3,
+    }
+
+    seeds_seen = []
+    job_1 = brozzler.new_job(frontier, job_conf_1)
+    job_2 = brozzler.new_job(frontier, job_conf_2)
+
+    assert len(list(frontier.job_sites(job_1.id))) == 5
+    assert len(list(frontier.job_sites(job_2.id))) == 5
+
+    claimed_sites_1 = frontier.claim_sites(4)
+    assert len(claimed_sites_1) == 4
+
+    sites_per_job = {}
+    for site in claimed_sites_1:
+        sites_per_job[site["job_id"]] = sites_per_job.get(site["job_id"], 0) + 1
+
+    # 2 jobs, max of 3 each.
+    assert len(sites_per_job.keys()) == 2
+    assert sites_per_job[1] + sites_per_job[2] == 4
+    assert sites_per_job[1] <= 3 and sites_per_job[2] <= 3
+
+    # 6 sites left in queue, but only 2 are still claimable due to max
+    claimed_sites_2 = frontier.claim_sites(6)
+    assert len(claimed_sites_2) == 2
+
+    # disclaim sites
+    for site in itertools.chain(claimed_sites_1, claimed_sites_2):
+        frontier.disclaim_site(site)
+        seeds_seen.append(site["seed"])
+
+    # Only 4 sites left in queue, that aren't recently claimed
+    claimed_sites_3 = frontier.claim_sites(6)
+    assert len(claimed_sites_3) == 4
+
+    with pytest.raises(brozzler.NothingToClaim):
+        claimed_sites = frontier.claim_sites(1)
+        assert len(claimed_sites) == 1
+
+    for site in claimed_sites_3:
+        seeds_seen.append(site["seed"])
+
+    # ensure all sites have been claimed at this point
+    for seed in itertools.chain(job_conf_1["seeds"], job_conf_2["seeds"]):
+        assert seed["url"] in seeds_seen
+
+    # All unclaimed sites have been recently disclaimed and are not claimable
+    with pytest.raises(brozzler.NothingToClaim):
+        frontier.claim_sites(3)
+
+    # Disable reclaim cooldown. With 4 claimed, we should have 2 available
+    claimed_sites_4 = frontier.claim_sites(4, reclaim_cooldown=0)
+    assert len(claimed_sites_4) == 2
+
     # clean slate for the next one
     rr.table("jobs").delete().run()
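The cross-job assertions above depend on the per-job budget inside `filter_claimable_site_ids`. A minimal sketch of that counting logic, ignoring the cooldown handling; only the field names come from the diff, the helper name and the rest are assumed:

    from typing import Dict, List

    def claimable_ids_sketch(active_sites: List[Dict], max_sites_to_claim: int = 1) -> List[str]:
        job_counts: Dict[int, int] = {}
        claimable: List[str] = []
        for site in active_sites:
            job_id = site["job_id"]
            if site["claimed"]:
                # Already-claimed sites count against their job's budget.
                job_counts[job_id] = job_counts.get(job_id, 0) + 1
                continue
            # Claim an unclaimed site only while its job is under max_claimed_sites.
            if job_counts.get(job_id, 0) < site["max_claimed_sites"]:
                job_counts[job_id] = job_counts.get(job_id, 0) + 1
                claimable.append(site["id"])
            if len(claimable) >= max_sites_to_claim:
                break
        return claimable

    sites = [
        {"id": "a", "job_id": 1, "claimed": True, "max_claimed_sites": 3},
        {"id": "b", "job_id": 1, "claimed": False, "max_claimed_sites": 3},
        {"id": "c", "job_id": 2, "claimed": False, "max_claimed_sites": 3},
    ]
    assert claimable_ids_sketch(sites, max_sites_to_claim=2) == ["b", "c"]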
@@ -277,6 +277,7 @@ def test_ydl_proxy_down():
     with pytest.raises(brozzler.ProxyError):
         brozzler.ydl.do_youtube_dl(worker, site, page)

+
 def test_proxy_down():
     """
     Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.