From 8addaf31d5c7b6364f90d5ea86dd3420beba0fe7 Mon Sep 17 00:00:00 2001
From: Vangelis Banos <vangelis@archive.org>
Date: Sun, 4 Oct 2020 15:39:30 +0000
Subject: [PATCH] Add option extract_outlinks_timeout

`Browser.extract_outlinks` has a default `timeout=60` parameter that cannot
be changed in any way, because it is always invoked as `extract_outlinks()`.

We add the parameter `extract_outlinks_timeout=60` to `BrozzlerWorker` and
`Browser.browse_page` so that callers can override that timeout.
---
 brozzler/browser.py | 7 +++++--
 brozzler/worker.py  | 5 ++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/brozzler/browser.py b/brozzler/browser.py
index e0f1f06..1960b9e 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -423,7 +423,8 @@ class Browser:
             username=None, password=None, hashtags=None,
             screenshot_full_page=False, skip_extract_outlinks=False,
             skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
-            page_timeout=300, behavior_timeout=900, download_throughput=-1):
+            page_timeout=300, behavior_timeout=900,
+            extract_outlinks_timeout=60, download_throughput=-1):
         '''
         Browses page in browser.
 
@@ -520,7 +521,9 @@ class Browser:
                 if not run_behaviors or skip_extract_outlinks:
                     outlinks = []
                 else:
-                    outlinks = self.extract_outlinks()
+                    outlinks = self.extract_outlinks(
+                        timeout=extract_outlinks_timeout
+                        )
                 if run_behaviors and not skip_visit_hashtags:
                     self.visit_hashtags(final_page_url, hashtags, outlinks)
                 return final_page_url, outlinks
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 6c88275..d88893b 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -52,7 +52,8 @@ class BrozzlerWorker:
             chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
             skip_extract_outlinks=False, skip_visit_hashtags=False,
             skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
-            page_timeout=300, behavior_timeout=900, download_throughput=-1):
+            page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
+            download_throughput=-1):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers
@@ -68,6 +69,7 @@ class BrozzlerWorker:
         self._screenshot_full_page = screenshot_full_page
         self._page_timeout = page_timeout
         self._behavior_timeout = behavior_timeout
+        self._extract_outlinks_timeout = extract_outlinks_timeout
         self._download_throughput = download_throughput
 
         self._browser_pool = brozzler.browser.BrowserPool(
@@ -308,6 +310,7 @@ class BrozzlerWorker:
                 screenshot_full_page=self._screenshot_full_page,
                 page_timeout=self._page_timeout,
                 behavior_timeout=self._behavior_timeout,
+                extract_outlinks_timeout=self._extract_outlinks_timeout,
                 download_throughput=self._download_throughput)
         if final_page_url != page.url:
             page.note_redirect(final_page_url)