Merge branch 'master' into qa

* master:
  handle exceptions extracting links
  fix reported chromium crash by removing argument
  bump version after merge
  remove stray bad logging line
  tests expect outlinks to be a set
  tidy up some comments and docs
  watch pages as outlinks from youtube-dl playlists
  silence youtube-dl's logging, use only our own
  use a thread-local callback in monkey-patched
  skip downloading videos from youtube playlists
  trace-level logging for all the chrome output
2025-08-07 22:12:15 -04:00 · 2018-10-29 17:45:09 -07:00 · 2018-10-29 17:45:09 -07:00 · 27ba877932
commit 27ba877932
parent f3f9505657 1073431f76
6 changed files with 105 additions and 73 deletions
--- a/brozzler/chrome.py
+++ b/brozzler/chrome.py
@ -166,7 +166,7 @@ class Chrome:
                '--disable-background-networking',
                '--disable-renderer-backgrounding', '--disable-hang-monitor',
                '--disable-background-timer-throttling', '--mute-audio',
-                '--disable-web-sockets', '--disable-cache', '--single-process',
+                '--disable-web-sockets', '--disable-cache',
                '--window-size=1100,900', '--no-default-browser-check',
                '--disable-first-run-ui', '--no-first-run',
                '--homepage=about:blank', '--disable-direct-npapi-requests',
@ -244,33 +244,15 @@ class Chrome:
            while not self._shutdown.is_set():
                buf = readline_nonblock(self.chrome_process.stdout)
                if buf:
-                    if re.search(
-                            b'Xlib:  extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDOUT %s',
+                            self.chrome_process.pid, buf)

                buf = readline_nonblock(self.chrome_process.stderr)
                if buf:
-                    if re.search(
-                            b'Xlib:  extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                                'chrome pid %s STDERR %s',
-                                self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDERR %s',
+                            self.chrome_process.pid, buf)
        except:
            self.logger.error('unexpected exception', exc_info=True)

--- a/brozzler/js-templates/extract-outlinks.js
+++ b/brozzler/js-templates/extract-outlinks.js
@ -3,16 +3,22 @@
 var __brzl_framesDone = new Set();
 var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
-    if (frame && frame.document) {
-        var outlinks = Array.prototype.slice.call(
+    var outlinks = [];
+    try {
+        if (frame && frame.document) {
+            outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href], area[href]'));
-        for (var i = 0; i < frame.frames.length; i++) {
-            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
-                outlinks = outlinks.concat(
-                            __brzl_compileOutlinks(frame.frames[i]));
+            for (var i = 0; i < frame.frames.length; i++) {
+                if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
+                    outlinks = outlinks.concat(
+                        __brzl_compileOutlinks(frame.frames[i]));
+                }
            }
        }
+    } catch (e) {
+        console.log("exception looking at frame" + frame + ": " + e);
    }
+
    return outlinks;
 }
 __brzl_compileOutlinks(window).join('\n');
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -186,9 +186,10 @@ class BrozzlerWorker:
                     on_request=None, enable_youtube_dl=True):
        self.logger.info("brozzling {}".format(page))
        ydl_fetches = None
+        outlinks = set()
        if enable_youtube_dl:
            try:
-                ydl_fetches = ydl.do_youtube_dl(self, site, page)
+                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
            except brozzler.ReachedLimit as e:
                raise
            except brozzler.ShutdownRequested:
@ -210,10 +211,10 @@ class BrozzlerWorker:
        if self._needs_browsing(page, ydl_fetches):
            self.logger.info('needs browsing: %s', page)
            try:
-                outlinks = self._browse_page(browser, site, page, on_screenshot,
-                                            on_request)
+                browser_outlinks = self._browse_page(
+                        browser, site, page, on_screenshot, on_request)
+                outlinks.update(browser_outlinks)
            except brozzler.PageInterstitialShown:
-                outlinks = []
                self.logger.info('page interstitial shown (http auth): %s', page)
            return outlinks
        else:
@ -222,7 +223,8 @@ class BrozzlerWorker:
                self._fetch_url(site, page)
            else:
                self.logger.info('already fetched: %s', page)
-            return []
+
+        return outlinks

    def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
        def _on_screenshot(screenshot_png):
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -1,8 +1,6 @@
 '''
 brozzler/ydl.py - youtube-dl support for brozzler

-This code was extracted from worker.py and 
-
 Copyright (C) 2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
@ -30,7 +28,21 @@ import doublethink
 import datetime
 import threading

-global_ydl_lock = threading.Lock()
+thread_local = threading.local()
+_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
+def _finish_frag_download(ffd_self, ctx):
+    '''
+    We monkey-patch this youtube-dl internal method `_finish_frag_download()`
+    because it gets called after downloading the last segment of a segmented
+    video, which is a good time to upload the stitched-up video that youtube-dl
+    creates for us to warcprox. We have it call a thread-local callback
+    since different threads may be youtube-dl'ing at the same time.
+    '''
+    result = _orig__finish_frag_download(ffd_self, ctx)
+    if hasattr(thread_local, 'finish_frag_download_callback'):
+        thread_local.finish_frag_download_callback(ffd_self, ctx)
+    return result
+youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download

 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
@ -111,9 +123,10 @@ def _build_youtube_dl(worker, destdir, site):

    - keeps track of urls fetched using a `YoutubeDLSpy`
    - periodically updates `site.last_claimed` in rethinkdb
-    - if brozzling through warcprox and downloading fragmented (DASH) videos,
-      pushes the stitched together video to warcprox using a
-      WARCPROX_WRITE_RECORD request
+    - if brozzling through warcprox and downloading segmented videos (e.g.
+      HLS), pushes the stitched-up video created by youtube-dl to warcprox
+      using a WARCPROX_WRITE_RECORD request
+    - some logging

    Args:
        worker (brozzler.BrozzlerWorker): the calling brozzler worker
@ -127,12 +140,33 @@ def _build_youtube_dl(worker, destdir, site):
    class _YoutubeDL(youtube_dl.YoutubeDL):
        logger = logging.getLogger(__module__ + "." + __qualname__)

+        def urlopen(self, req):
+            try:
+                url = req.full_url
+            except AttributeError:
+                url = req
+            self.logger.debug('fetching %r', url)
+            return super().urlopen(req)
+
        def add_default_extra_info(self, ie_result, ie, url):
            # hook in some logging
            super().add_default_extra_info(ie_result, ie, url)
            if ie_result.get('_type') == 'playlist':
                self.logger.info(
                        'extractor %r found playlist in %s', ie.IE_NAME, url)
+                if ie.IE_NAME == 'youtube:playlist':
+                    # At this point ie_result['entries'] is an iterator that
+                    # will fetch more metadata from youtube to list all the
+                    # videos. We unroll that iterator here partly because
+                    # otherwise `process_ie_result()` will clobber it, and we
+                    # use it later to extract the watch pages as outlinks.
+                    ie_result['entries_no_dl'] = list(ie_result['entries'])
+                    ie_result['entries'] = []
+                    self.logger.info(
+                            'not downoading %s videos from this youtube '
+                            'playlist because we expect to capture them from '
+                            'individual watch pages',
+                            len(ie_result['entries_no_dl']))
            else:
                self.logger.info(
                        'extractor %r found a video in %s', ie.IE_NAME, url)
@ -180,21 +214,17 @@ def _build_youtube_dl(worker, destdir, site):
                })

        def process_info(self, info_dict):
-            # lock this section to prevent race condition between threads that
-            # want to monkey patch _finish_frag_download() at the same time
-            with global_ydl_lock:
-                _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
-
-                def _finish_frag_download(ffd_self, ctx):
-                    _orig__finish_frag_download(ffd_self, ctx)
-                    if worker._using_warcprox(site):
-                        self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
-
-                try:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
-                    return super().process_info(info_dict)
-                finally:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download
+            '''
+            See comment above on `_finish_frag_download()`
+            '''
+            def ffd_callback(ffd_self, ctx):
+                if worker._using_warcprox(site):
+                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
+            try:
+                thread_local.finish_frag_download_callback = ffd_callback
+                return super().process_info(info_dict)
+            finally:
+                delattr(thread_local, 'finish_frag_download_callback')

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
@ -213,20 +243,24 @@ def _build_youtube_dl(worker, destdir, site):

    ydl_opts = {
        "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
-        "verbose": False,
        "retries": 1,
-        "logger": logging.getLogger("youtube_dl"),
        "nocheckcertificate": True,
        "hls_prefer_native": True,
        "noprogress": True,
        "nopart": True,
        "no_color": True,
        "progress_hooks": [maybe_heartbeat_site_last_claimed],
+
         # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
         # "best: Select the best quality format represented by a single
         # file with video and audio."
        "format": "best/bestvideo+bestaudio",
        "youtube_include_dash_manifest": False,
+
+        ### we do our own logging
+        # "logger": logging.getLogger("youtube_dl"),
+        "verbose": False,
+        "quiet": True,
    }
    if worker._proxy_for(site):
        ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
@ -300,11 +334,12 @@ def _try_youtube_dl(worker, ydl, site, page):
                    content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                    payload=info_json.encode("utf-8"),
                    extra_headers=site.extra_headers())
+        return ie_result
    except brozzler.ShutdownRequested as e:
        raise
-    except BaseException as e:
+    except Exception as e:
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
-            pass
+            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
@ -331,16 +366,23 @@ def do_youtube_dl(worker, site, page):
        page (brozzler.Page): the page we are brozzling

    Returns:
-        `list` of `dict`: with info about urls fetched:
-
-            [{
-                'url': ...,
-                'method': ...,
-                'response_code': ...,
-                'response_headers': ...,
-            }, ...]
+        tuple with two entries:
+            `list` of `dict`: with info about urls fetched:
+                [{
+                    'url': ...,
+                    'method': ...,
+                    'response_code': ...,
+                    'response_headers': ...,
+                }, ...]
+            `list` of `str`: outlink urls
    '''
    with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
        ydl = _build_youtube_dl(worker, tempdir, site)
-        _try_youtube_dl(worker, ydl, site, page)
-        return ydl.fetch_spy.fetches
+        ie_result = _try_youtube_dl(worker, ydl, site, page)
+        outlinks = set()
+        if ie_result and ie_result.get('extractor') == 'youtube:playlist':
+            # youtube watch pages as outlinks
+            outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
+                        for e in ie_result.get('entries_no_dl', [])}
+        # any outlinks for other cases?
+        return ydl.fetch_spy.fetches, outlinks
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.5.dev308',
+        version='1.5.dev312',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_brozzling.py
+++ b/tests/test_brozzling.py
@ -2,7 +2,7 @@
 '''
 test_brozzling.py - XXX explain

-Copyright (C) 2016-2017 Internet Archive
+Copyright (C) 2016-2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.