diff --git a/bin/brozzler-new-job b/bin/brozzler-new-job index 15fc969..b785a73 100755 --- a/bin/brozzler-new-job +++ b/bin/brozzler-new-job @@ -62,6 +62,7 @@ for seed_conf in seeds: extra_headers = {"Warcprox-Meta":warcprox_meta} site = brozzler.Site(seed=merged_conf["url"], scope=merged_conf.get("scope"), + time_limit=merged_conf.get("time_limit"), proxy=merged_conf.get("proxy"), ignore_robots=merged_conf.get("ignore_robots"), enable_warcprox_features=merged_conf.get("enable_warcprox_features"), diff --git a/brozzler/site.py b/brozzler/site.py index 7c5d2dd..080ba3d 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -9,7 +9,7 @@ class Site: logger = logging.getLogger(__module__ + "." + __qualname__) def __init__(self, seed, id=None, scope=None, proxy=None, - ignore_robots=False, extra_headers=None, + ignore_robots=False, time_limit=None, extra_headers=None, enable_warcprox_features=False, reached_limit=None): self.seed = seed self.id = id @@ -17,6 +17,7 @@ class Site: self.ignore_robots = ignore_robots self.enable_warcprox_features = bool(enable_warcprox_features) self.extra_headers = extra_headers + self.time_limit = time_limit self.reached_limit = reached_limit self.scope = scope or {} diff --git a/brozzler/worker.py b/brozzler/worker.py index 203b220..a2e1efb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -168,7 +168,7 @@ class BrozzlerWorker: self.logger.info("brozzling site {}".format(site)) ydl = self._youtube_dl(site) th = threading.Thread(target=lambda: self._brozzle_site(browser, ydl, site), - name="BrowsingThread-{}".format(site.scope_surt)) + name="BrowsingThread-{}".format(site.seed)) th.start() except: self._browser_pool.release(browser) @@ -196,7 +196,7 @@ class BrozzlerWorker: if q_empty: if latest_state != "no-unclaimed-sites": - self.logger.info("no unclaimed sites to browse") + # self.logger.info("no unclaimed sites to browse") latest_state = "no-unclaimed-sites" time.sleep(0.5) except OSError as e: