mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
commit
bb7594a14d
@ -423,7 +423,8 @@ class Browser:
|
||||
username=None, password=None, hashtags=None,
|
||||
screenshot_full_page=False, skip_extract_outlinks=False,
|
||||
skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False,
|
||||
page_timeout=300, behavior_timeout=900, download_throughput=-1):
|
||||
page_timeout=300, behavior_timeout=900,
|
||||
extract_outlinks_timeout=60, download_throughput=-1):
|
||||
'''
|
||||
Browses page in browser.
|
||||
|
||||
@ -520,7 +521,9 @@ class Browser:
|
||||
if not run_behaviors or skip_extract_outlinks:
|
||||
outlinks = []
|
||||
else:
|
||||
outlinks = self.extract_outlinks()
|
||||
outlinks = self.extract_outlinks(
|
||||
timeout=extract_outlinks_timeout
|
||||
)
|
||||
if run_behaviors and not skip_visit_hashtags:
|
||||
self.visit_hashtags(final_page_url, hashtags, outlinks)
|
||||
return final_page_url, outlinks
|
||||
|
@ -52,7 +52,8 @@ class BrozzlerWorker:
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||
page_timeout=300, behavior_timeout=900, download_throughput=-1):
|
||||
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
|
||||
download_throughput=-1):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -68,6 +69,7 @@ class BrozzlerWorker:
|
||||
self._screenshot_full_page = screenshot_full_page
|
||||
self._page_timeout = page_timeout
|
||||
self._behavior_timeout = behavior_timeout
|
||||
self._extract_outlinks_timeout = extract_outlinks_timeout
|
||||
self._download_throughput = download_throughput
|
||||
|
||||
self._browser_pool = brozzler.browser.BrowserPool(
|
||||
@ -308,6 +310,7 @@ class BrozzlerWorker:
|
||||
screenshot_full_page=self._screenshot_full_page,
|
||||
page_timeout=self._page_timeout,
|
||||
behavior_timeout=self._behavior_timeout,
|
||||
extract_outlinks_timeout=self._extract_outlinks_timeout,
|
||||
download_throughput=self._download_throughput)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
|
Loading…
x
Reference in New Issue
Block a user