From 8addaf31d5c7b6364f90d5ea86dd3420beba0fe7 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Sun, 4 Oct 2020 15:39:30 +0000 Subject: [PATCH] Add option extract_outlinks_timeout `Browser.extract_outlinks` has a default `timeout=60` parameter that cannot be changed in any way. (It is always invoked as `extract_outlinks()`, with no arguments.) We add the parameter `extract_outlinks_timeout=60` to `BrozzlerWorker` and `Browser.browse_page` to allow that. --- brozzler/browser.py | 7 +++++-- brozzler/worker.py | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/brozzler/browser.py b/brozzler/browser.py index e0f1f06..1960b9e 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -423,7 +423,8 @@ class Browser: username=None, password=None, hashtags=None, screenshot_full_page=False, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, - page_timeout=300, behavior_timeout=900, download_throughput=-1): + page_timeout=300, behavior_timeout=900, + extract_outlinks_timeout=60, download_throughput=-1): ''' Browses page in browser. 
@@ -520,7 +521,9 @@ class Browser: if not run_behaviors or skip_extract_outlinks: outlinks = [] else: - outlinks = self.extract_outlinks() + outlinks = self.extract_outlinks( + timeout=extract_outlinks_timeout + ) if run_behaviors and not skip_visit_hashtags: self.visit_hashtags(final_page_url, hashtags, outlinks) return final_page_url, outlinks diff --git a/brozzler/worker.py b/brozzler/worker.py index 6c88275..d88893b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -52,7 +52,8 @@ class BrozzlerWorker: chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, - page_timeout=300, behavior_timeout=900, download_throughput=-1): + page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, + download_throughput=-1): self._frontier = frontier self._service_registry = service_registry self._max_browsers = max_browsers @@ -68,6 +69,7 @@ class BrozzlerWorker: self._screenshot_full_page = screenshot_full_page self._page_timeout = page_timeout self._behavior_timeout = behavior_timeout + self._extract_outlinks_timeout = extract_outlinks_timeout self._download_throughput = download_throughput self._browser_pool = brozzler.browser.BrowserPool( @@ -308,6 +310,7 @@ class BrozzlerWorker: screenshot_full_page=self._screenshot_full_page, page_timeout=self._page_timeout, behavior_timeout=self._behavior_timeout, + extract_outlinks_timeout=self._extract_outlinks_timeout, download_throughput=self._download_throughput) if final_page_url != page.url: page.note_redirect(final_page_url)