Merge 51b2474b3cf7488ead37402a5ae5a08c4ee5472d into 8afe9b50143b6f619e568471372362d9c9a3a5d9

2025-04-20 23:56:34 -04:00 · 2023-12-22 08:13:40 -07:00 · 2023-12-22 08:13:40 -07:00 · 0b07b76130
commit 0b07b76130
parent 8afe9b5014 51b2474b3c
3 changed files with 11 additions and 5 deletions
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -347,6 +347,10 @@ def brozzler_worker(argv=None):
            help=(
                'when needed, choose an available instance of warcprox from '
                'the rethinkdb service registry'))
+    arg_parser.add_argument(
+                '--claimed-limit', metavar='int', type=int, default=60, choices=range(15, 61), dest='claimed_limit',
+                help=('Minutes after a worker crash before a site can be reclaimed; '
+                'an integer in the range 15..60 (default: 60)'))
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
@ -393,6 +397,7 @@ def brozzler_worker(argv=None):
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe, proxy=args.proxy,
            warcprox_auto=args.warcprox_auto,
+            claimed_limit=args.claimed_limit,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl,
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@ -95,7 +95,7 @@ class RethinkDbFrontier:
                    raise UnexpectedDbResult("expected %r to be %r in %r" % (
                        k, expected, result))

-    def claim_sites(self, n=1):
+    def claim_sites(self, n=1, claimed_limit=60):
        self.logger.trace('claiming up to %s sites to brozzle', n)
        result = (
            self.rr.table('sites').get_all(r.args(
@ -116,7 +116,7 @@ class RethinkDbFrontier:
                        r.and_(
                            r.or_(
                                site['claimed'].not_(),
-                                site['last_claimed'].lt(r.now().sub(60*60))),
+                                site['last_claimed'].lt(r.now().sub(claimed_limit*60))),
                            r.or_(
                                site.has_fields('max_claimed_sites').not_(),
                                new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
@ -129,7 +129,7 @@ class RethinkDbFrontier:
                r.branch(
                    r.or_(
                      r.row['claimed'].not_(),
-                      r.row['last_claimed'].lt(r.now().sub(60*60))),
+                      r.row['last_claimed'].lt(r.now().sub(claimed_limit*60))),
                    {'claimed': True, 'last_claimed': r.now()},
                    {}),
                return_changes=True)).run()
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -51,7 +51,7 @@ class BrozzlerWorker:

    def __init__(
            self, frontier, service_registry=None, max_browsers=1,
-            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
+            chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, claimed_limit=60,
            skip_extract_outlinks=False, skip_visit_hashtags=False,
            skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
            page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
@ -65,6 +65,7 @@ class BrozzlerWorker:
        self._proxy = proxy
        assert not (warcprox_auto and proxy)
        self._proxy_is_warcprox = None
+        self._claimed_limit = claimed_limit
        self._skip_extract_outlinks = skip_extract_outlinks
        self._skip_visit_hashtags = skip_visit_hashtags
        self._skip_youtube_dl = skip_youtube_dl
@ -499,7 +500,7 @@ class BrozzlerWorker:
        browsers = self._browser_pool.acquire_multi(
                (self._browser_pool.num_available() + 1) // 2)
        try:
-            sites = self._frontier.claim_sites(len(browsers))
+            sites = self._frontier.claim_sites(len(browsers), self._claimed_limit)
        except:
            self._browser_pool.release_all(browsers)
            raise