From e406e42312c706ce97b91ec331c1602843886121 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 11 Oct 2018 15:43:05 -0700 Subject: [PATCH 01/11] trace-level logging for all the chrome output because the important/unimportant messages are always shifting and we're not even trying to keep up with it and mostly it's just noise --- brozzler/chrome.py | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 450f352..af6bf5a 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -244,33 +244,15 @@ class Chrome: while not self._shutdown.is_set(): buf = readline_nonblock(self.chrome_process.stdout) if buf: - if re.search( - b'Xlib: extension|' - b'CERT_PKIXVerifyCert for [^ ]* failed|' - b'^ALSA lib|ERROR:gl_surface_glx.cc|' - b'ERROR:gpu_child_thread.cc', buf): - self.logger.trace( - 'chrome pid %s STDOUT %s', - self.chrome_process.pid, buf) - else: - self.logger.debug( - 'chrome pid %s STDOUT %s', - self.chrome_process.pid, buf) + self.logger.trace( + 'chrome pid %s STDOUT %s', + self.chrome_process.pid, buf) buf = readline_nonblock(self.chrome_process.stderr) if buf: - if re.search( - b'Xlib: extension|' - b'CERT_PKIXVerifyCert for [^ ]* failed|' - b'^ALSA lib|ERROR:gl_surface_glx.cc|' - b'ERROR:gpu_child_thread.cc', buf): - self.logger.trace( - 'chrome pid %s STDOUT %s', - self.chrome_process.pid, buf) - else: - self.logger.debug( - 'chrome pid %s STDERR %s', - self.chrome_process.pid, buf) + self.logger.trace( + 'chrome pid %s STDERR %s', + self.chrome_process.pid, buf) except: self.logger.error('unexpected exception', exc_info=True) From 82cf5c6dbb942cc1f23b168eece85b0fe0702b38 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 11 Oct 2018 15:46:30 -0700 Subject: [PATCH 02/11] skip downloading videos from youtube playlists because we expect to capture videos from individual watch pages, and often processing thousands of videos with youtube-dl before the page is ever opened in the browser is not desired behavior and is a crawling problem --- brozzler/ydl.py | 19 +++++++++++++++++++ setup.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e2e995b..d67856c 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -127,12 +127,31 @@ def _build_youtube_dl(worker, destdir, site): class _YoutubeDL(youtube_dl.YoutubeDL): logger = logging.getLogger(__module__ + "." + __qualname__) + def urlopen(self, req): + try: + url = req.full_url + except AttributeError: + url = req + self.logger.debug('fetching %r', url) + return super().urlopen(req) + + # def _match_entry(self, info_dict, incomplete): + # return super()._match_entry(info_dict, incomplete) + def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) if ie_result.get('_type') == 'playlist': self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) + if ie.IE_NAME == 'youtube:playlist': + self.logger.info( + 'setting skip_download because this is a youtube ' + 'playlist and we expect to capture videos from ' + 'individual watch pages') + # XXX good enuf? still fetches metadata for each video + # if we want to not do that, implement self._match_entry() + self.params['skip_download'] = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) diff --git a/setup.py b/setup.py index bb405cd..9752d7c 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev308', + version='1.5.dev309', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From e5536182dc8055c6b6352b184a15b508f32e1388 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 11 Oct 2018 23:28:34 -0700 Subject: [PATCH 03/11] use a thread-local callback in monkey-patched finish_frag_download, instead of locking around monkey-patching, to allow different threads to youtube-dl concurrently, but still not interfere with each other --- brozzler/ydl.py | 50 ++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index d67856c..4c16e17 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -30,7 +30,21 @@ import doublethink import datetime import threading -global_ydl_lock = threading.Lock() +thread_local = threading.local() +_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download +def _finish_frag_download(ffd_self, ctx): + ''' + We monkey-patch this youtube-dl internal method `_finish_frag_download()` + because it gets called after downloading the last segment of a segmented + video, which is a good time to upload the stitched-up video that youtube-dl + creates for us to warcprox. We have it call a thread-local callback + since different threads may be youtube-dl'ing at the same time. + ''' + result = _orig__finish_frag_download(ffd_self, ctx) + if hasattr(thread_local, 'finish_frag_download_callback'): + thread_local.finish_frag_download_callback(ffd_self, ctx) + return result +youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content def _webpage_read_content(self, *args, **kwargs): @@ -111,9 +125,10 @@ def _build_youtube_dl(worker, destdir, site): - keeps track of urls fetched using a `YoutubeDLSpy` - periodically updates `site.last_claimed` in rethinkdb - - if brozzling through warcprox and downloading fragmented (DASH) videos, - pushes the stitched together video to warcprox using a - WARCPROX_WRITE_RECORD request + - if brozzling through warcprox and downloading segmented videos (e.g. + HLS), pushes the stitched-up video created by youtube-dl to warcprox + using a WARCPROX_WRITE_RECORD request + - adds some logging Args: worker (brozzler.BrozzlerWorker): the calling brozzler worker @@ -199,21 +214,18 @@ def _build_youtube_dl(worker, destdir, site): }) def process_info(self, info_dict): - # lock this section to prevent race condition between threads that - # want to monkey patch _finish_frag_download() at the same time - with global_ydl_lock: - _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download - - def _finish_frag_download(ffd_self, ctx): - _orig__finish_frag_download(ffd_self, ctx) - if worker._using_warcprox(site): - self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) - - try: - youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download - return super().process_info(info_dict) - finally: - youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download + ''' + See comment above on `_finish_frag_download()` + ''' + def ffd_callback(ffd_self, ctx): + logging.info('%s') + if worker._using_warcprox(site): + self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) + try: + thread_local.finish_frag_download_callback = ffd_callback + return super().process_info(info_dict) + finally: + delattr(thread_local, 'finish_frag_download_callback') def maybe_heartbeat_site_last_claimed(*args, **kwargs): # in case youtube-dl takes a long time, heartbeat site.last_claimed From 9211fb45ec90c49489e51c0a64e6f9ac1177e9b0 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:39:37 -0700 Subject: [PATCH 04/11] silence youtube-dl's logging, use only our own because youtube-dl's can be annoyingly verbose, confusing, doesn't tell us the things we're interested in, and doesn't tell us where the messages originate --- brozzler/ydl.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4c16e17..e0f5a86 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -244,19 +244,23 @@ def _build_youtube_dl(worker, destdir, site): ydl_opts = { "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir), - "verbose": False, "retries": 1, - "logger": logging.getLogger("youtube_dl"), "nocheckcertificate": True, "hls_prefer_native": True, "noprogress": True, "nopart": True, "no_color": True, "progress_hooks": [maybe_heartbeat_site_last_claimed], + # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection # "best: Select the best quality format represented by a single # file with video and audio." "format": "best/bestvideo+bestaudio", + + ### we do our own logging + # "logger": logging.getLogger("youtube_dl"), + "verbose": False, + "quiet": True, } if worker._proxy_for(site): ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site)) From 8f9077fbf34c90d60c545fe97bbb9b5014f39364 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:41:16 -0700 Subject: [PATCH 05/11] watch pages as outlinks from youtube-dl playlists and bypass downloading metadata about individual videos as well as the videos themselves (for youtube playlists), because even just the metadata can take many minutes or hours in case of thousands of videos --- brozzler/worker.py | 15 ++++++++++----- brozzler/ydl.py | 39 +++++++++++++++++++++++++++++---------- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index a37e3f0..cf36210 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,9 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None + ydl_outlinks = [] if enable_youtube_dl: try: - ydl_fetches = ydl.do_youtube_dl(self, site, page) + ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -207,18 +208,22 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) + browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) - outlinks = self._browse_page(browser, site, page, on_screenshot, - on_request) - return outlinks + browser_outlinks = self._browse_page( + browser, site, page, on_screenshot, on_request) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) self._fetch_url(site, page) else: self.logger.info('already fetched: %s', page) - return [] + + outlinks = set() + outlinks.update(ydl_outlinks) + outlinks.update(browser_outlinks) + return list(outlinks) def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e0f5a86..cfdf43e 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -151,7 +151,15 @@ def _build_youtube_dl(worker, destdir, site): return super().urlopen(req) # def _match_entry(self, info_dict, incomplete): - # return super()._match_entry(info_dict, incomplete) + # if self.dl_disabled: + # return 'Downloading disabled (probably youtube playlist)' + + # def extract_info(self, *args, **kwargs): + # self.dl_disabled = False + # try: + # return super().extract_info(*args, **kwargs) + # finally: + # self.dl_disabled = False def add_default_extra_info(self, ie_result, ie, url): # hook in some logging @@ -160,13 +168,19 @@ def _build_youtube_dl(worker, destdir, site): self.logger.info( 'extractor %r found playlist in %s', ie.IE_NAME, url) if ie.IE_NAME == 'youtube:playlist': + # At this point ie_result['entries'] is an iterator that + # will fetch more metadata from youtube to list all the + # videos. We unroll that iterator here partly because + # otherwise `process_ie_result()` will clobber it, and we + # use it later to extract the watch pages as outlinks. + ie_result['entries_no_dl'] = list(ie_result['entries']) + ie_result['entries'] = [] self.logger.info( 'setting skip_download because this is a youtube ' - 'playlist and we expect to capture videos from ' - 'individual watch pages') - # XXX good enuf? still fetches metadata for each video - # if we want to not do that, implement self._match_entry() - self.params['skip_download'] = True + 'playlist (%s entries) and we expect to capture ' + 'videos from individual watch pages', + len(ie_result['entries_no_dl'])) + # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -334,11 +348,12 @@ def _try_youtube_dl(worker, ydl, site, page): content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), extra_headers=site.extra_headers()) + return ie_result except brozzler.ShutdownRequested as e: raise - except BaseException as e: + except Exception as e: if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError: - pass + return None elif (hasattr(e, "exc_info") and e.exc_info[0] == urllib.error.HTTPError and hasattr(e.exc_info[1], "code") @@ -376,5 +391,9 @@ def do_youtube_dl(worker, site, page): ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) - _try_youtube_dl(worker, ydl, site, page) - return ydl.fetch_spy.fetches + ie_result = _try_youtube_dl(worker, ydl, site, page) + outlinks = [] + if ie_result['extractor'] == 'youtube:playlist': + outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])] + return ydl.fetch_spy.fetches, outlinks From 054ba6d7a038f05d0854cc2d216baed3e9888ff5 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 00:48:38 -0700 Subject: [PATCH 06/11] tidy up some comments and docs --- brozzler/ydl.py | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index cfdf43e..f1b57cb 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -1,8 +1,6 @@ ''' brozzler/ydl.py - youtube-dl support for brozzler -This code was extracted from worker.py and - Copyright (C) 2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); @@ -128,7 +126,7 @@ def _build_youtube_dl(worker, destdir, site): - if brozzling through warcprox and downloading segmented videos (e.g. HLS), pushes the stitched-up video created by youtube-dl to warcprox using a WARCPROX_WRITE_RECORD request - - adds some logging + - some logging Args: worker (brozzler.BrozzlerWorker): the calling brozzler worker @@ -150,17 +148,6 @@ def _build_youtube_dl(worker, destdir, site): self.logger.debug('fetching %r', url) return super().urlopen(req) - # def _match_entry(self, info_dict, incomplete): - # if self.dl_disabled: - # return 'Downloading disabled (probably youtube playlist)' - - # def extract_info(self, *args, **kwargs): - # self.dl_disabled = False - # try: - # return super().extract_info(*args, **kwargs) - # finally: - # self.dl_disabled = False - def add_default_extra_info(self, ie_result, ie, url): # hook in some logging super().add_default_extra_info(ie_result, ie, url) @@ -176,11 +163,10 @@ def _build_youtube_dl(worker, destdir, site): ie_result['entries_no_dl'] = list(ie_result['entries']) ie_result['entries'] = [] self.logger.info( - 'setting skip_download because this is a youtube ' - 'playlist (%s entries) and we expect to capture ' - 'videos from individual watch pages', + 'not downoading %s videos from this youtube ' + 'playlist because we expect to capture them from ' + 'individual watch pages', len(ie_result['entries_no_dl'])) - # self.dl_disabled = True else: self.logger.info( 'extractor %r found a video in %s', ie.IE_NAME, url) @@ -380,20 +366,23 @@ def do_youtube_dl(worker, site, page): page (brozzler.Page): the page we are brozzling Returns: - `list` of `dict`: with info about urls fetched: - - [{ - 'url': ..., - 'method': ..., - 'response_code': ..., - 'response_headers': ..., - }, ...] + tuple with two entries: + `list` of `dict`: with info about urls fetched: + [{ + 'url': ..., + 'method': ..., + 'response_code': ..., + 'response_headers': ..., + }, ...] + `list` of `str`: outlink urls ''' with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) outlinks = [] if ie_result['extractor'] == 'youtube:playlist': + # youtube watch pages as outlinks outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] for e in ie_result.get('entries_no_dl', [])] + # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks From 7497b7e5ac8b1f2ca159e7fd4ddaaed64464c857 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 11:03:54 -0700 Subject: [PATCH 07/11] tests expect outlinks to be a set --- brozzler/worker.py | 11 ++++------- brozzler/ydl.py | 8 ++++---- tests/test_brozzling.py | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index cf36210..4aad2a3 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -186,10 +186,10 @@ class BrozzlerWorker: on_request=None, enable_youtube_dl=True): self.logger.info("brozzling {}".format(page)) ydl_fetches = None - ydl_outlinks = [] + outlinks = set() if enable_youtube_dl: try: - ydl_fetches, ydl_outlinks = ydl.do_youtube_dl(self, site, page) + ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page) except brozzler.ReachedLimit as e: raise except brozzler.ShutdownRequested: @@ -208,11 +208,11 @@ class BrozzlerWorker: 'youtube_dl raised exception on %s', page, exc_info=True) - browser_outlinks = [] if self._needs_browsing(page, ydl_fetches): self.logger.info('needs browsing: %s', page) browser_outlinks = self._browse_page( browser, site, page, on_screenshot, on_request) + outlinks.update(browser_outlinks) else: if not self._already_fetched(page, ydl_fetches): self.logger.info('needs fetch: %s', page) @@ -220,10 +220,7 @@ class BrozzlerWorker: else: self.logger.info('already fetched: %s', page) - outlinks = set() - outlinks.update(ydl_outlinks) - outlinks.update(browser_outlinks) - return list(outlinks) + return outlinks def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None): def _on_screenshot(screenshot_png): diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f1b57cb..4f9169c 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -379,10 +379,10 @@ def do_youtube_dl(worker, site, page): with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir: ydl = _build_youtube_dl(worker, tempdir, site) ie_result = _try_youtube_dl(worker, ydl, site, page) - outlinks = [] - if ie_result['extractor'] == 'youtube:playlist': + outlinks = set() + if ie_result and ie_result.get('extractor') == 'youtube:playlist': # youtube watch pages as outlinks - outlinks = ['https://www.youtube.com/watch?v=%s' % e['id'] - for e in ie_result.get('entries_no_dl', [])] + outlinks = {'https://www.youtube.com/watch?v=%s' % e['id'] + for e in ie_result.get('entries_no_dl', [])} # any outlinks for other cases? return ydl.fetch_spy.fetches, outlinks diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 8c2c9c8..686c8f4 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -2,7 +2,7 @@ ''' test_brozzling.py - XXX explain -Copyright (C) 2016-2017 Internet Archive +Copyright (C) 2016-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 65fad5e8bf32e2d8080097e2c78a64f579a173a1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 11:35:47 -0700 Subject: [PATCH 08/11] remove stray bad logging line --- brozzler/ydl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4f9169c..57550e5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -218,7 +218,6 @@ def _build_youtube_dl(worker, destdir, site): See comment above on `_finish_frag_download()` ''' def ffd_callback(ffd_self, ctx): - logging.info('%s') if worker._using_warcprox(site): self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx) try: From 20996fa501c98a7f884b3a062b2ee0cf3f34264e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 12 Oct 2018 12:46:09 -0700 Subject: [PATCH 09/11] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9752d7c..17cab60 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev309', + version='1.5.dev310', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From af85f289089cc941b0e6beebe903e95497843387 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 22 Oct 2018 14:28:31 -0700 Subject: [PATCH 10/11] fix reported chromium crash by removing argument --single-process https://github.com/internetarchive/brozzler/issues/128 --- brozzler/chrome.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index af6bf5a..28ea507 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -166,7 +166,7 @@ class Chrome: '--disable-background-networking', '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', '--disable-cache', '--single-process', + '--disable-web-sockets', '--disable-cache', '--window-size=1100,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', diff --git a/setup.py b/setup.py index 17cab60..d0f8cff 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev310', + version='1.5.dev311', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 1073431f76f7d427c1af7b4acaa3b214d28e7f82 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 29 Oct 2018 17:42:25 -0700 Subject: [PATCH 11/11] handle exceptions extracting links like this one: Uncaught DOMException: Blocked a frame with origin "https://www.youtube.com" from accessing a cross-origin frame. at __brzl_compileOutlinks (:4:24) at __brzl_compileOutlinks (:10:29) at :16:1 __brzl_compileOutlinks @ VM194:4 __brzl_compileOutlinks @ VM194:10 not sure exactly why this happens but we just have to handle it --- brozzler/js-templates/extract-outlinks.js | 18 ++++++++++++------ setup.py | 2 +- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js index e9e8a47..e3a04ca 100644 --- a/brozzler/js-templates/extract-outlinks.js +++ b/brozzler/js-templates/extract-outlinks.js @@ -3,16 +3,22 @@ var __brzl_framesDone = new Set(); var __brzl_compileOutlinks = function(frame) { __brzl_framesDone.add(frame); - if (frame && frame.document) { - var outlinks = Array.prototype.slice.call( + var outlinks = []; + try { + if (frame && frame.document) { + outlinks = Array.prototype.slice.call( frame.document.querySelectorAll('a[href], area[href]')); - for (var i = 0; i < frame.frames.length; i++) { - if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { - outlinks = outlinks.concat( - __brzl_compileOutlinks(frame.frames[i])); + for (var i = 0; i < frame.frames.length; i++) { + if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { + outlinks = outlinks.concat( + __brzl_compileOutlinks(frame.frames[i])); + } } } + } catch (e) { + console.log("exception looking at frame" + frame + ": " + e); } + return outlinks; } __brzl_compileOutlinks(window).join('\n'); diff --git a/setup.py b/setup.py index d0f8cff..948bb4f 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.dev311', + version='1.5.dev312', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',