Merge branch 'pageInterstitialShown' into qa

2025-09-23 06:04:47 -04:00 · 2018-09-25 10:30:02 -07:00 · 2018-09-25 10:30:02 -07:00 · 60cfd684b2
commit 60cfd684b2
parent 3c7fdeae2c d9f7997a40
7 changed files with 67 additions and 13 deletions
--- a/brozzler/init.py
+++ b/brozzler/init.py
@ -58,6 +58,28 @@ class ReachedLimit(Exception):
    def __str__(self):
        return self.__repr__()

+class PageInterstitialShown(Exception):
+    '''Raised to signal an interstitial page; may carry warcprox details.'''
+    def __init__(self, http_error=None, warcprox_meta=None, http_payload=None):
+        # always set both attributes, so a bare PageInterstitialShown() is
+        # safe to inspect; a truthy `http_error` (needs `.headers` and
+        # `.read()`) overrides the explicitly passed values
+        import json
+        self.warcprox_meta = warcprox_meta
+        self.http_payload = http_payload
+        if http_error:
+            headers = http_error.headers
+            self.warcprox_meta = (json.loads(headers["warcprox-meta"])
+                    if "warcprox-meta" in headers else None)
+            self.http_payload = http_error.read()
+
+    def __repr__(self):
+        return "PageInterstitialShown(warcprox_meta=%r,http_payload=%r)" % (
+                self.warcprox_meta, self.http_payload)
+
+    def __str__(self):
+        return self.__repr__()
+
 # monkey-patch log levels TRACE and NOTICE
 logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
 def _logger_trace(self, msg, *args, **kwargs):
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@ -241,10 +241,12 @@ class WebsockReceiverThread(threading.Thread):
                if self.on_request:
                    self.on_request(message)
            elif message['method'] == 'Page.interstitialShown':
-                # for AITFIVE-1529: handle http auth
-                # for now, we should consider killing the browser when we receive Page.interstitialShown and
-                # consider the page finished—-first we should figure out when else that event might happen
-                self.logger.info('Page.interstitialShown received')
+                # AITFIVE-1529: handle http auth
+                # we should kill the browser when we receive Page.interstitialShown and
+                # consider the page finished, until this is fixed:
+                # https://bugs.chromium.org/p/chromium/issues/detail?id=764505
+                self.logger.info('Page.interstialShown (likely unsupported http auth request)')
+                brozzler.thread_raise(self.calling_thread, brozzler.PageInterstitialShown)
            elif message['method'] == 'Inspector.targetCrashed':
                self.logger.error(
                        '''chrome tab went "aw snap" or "he's dead jim"!''')
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@ -199,6 +199,8 @@ def brozzle_page(argv=None):
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
+    except brozzler.PageInterstitialShown as e:
+        logging.error('page interstitial shown %s', e)
    finally:
        browser.stop()

--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@ -209,9 +209,13 @@ class BrozzlerWorker:

        if self._needs_browsing(page, ydl_fetches):
            self.logger.info('needs browsing: %s', page)
-            outlinks = self._browse_page(browser, site, page, on_screenshot,
-                                         on_request)
-            return outlinks
+            try:
+                outlinks = self._browse_page(browser, site, page, on_screenshot,
+                                            on_request)
+                return outlinks
+            except brozzler.PageInterstitialShown:
+                self.logger.info('page interstitial shown (http auth): %s', page)
+                return []
        else:
            if not self._already_fetched(page, ydl_fetches):
                self.logger.info('needs fetch: %s', page)
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@ -28,6 +28,9 @@ import os
 import json
 import doublethink
 import datetime
+import threading
+
+global_ydl_lock = threading.Lock()

 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
@ -162,10 +165,12 @@ def _build_youtube_dl(worker, destdir, site):
            with open(ctx['filename'], 'rb') as f:
                # include content-length header to avoid chunked
                # transfer, which warcprox currently rejects
+                extra_headers = dict(site.extra_headers())
+                extra_headers['content-length'] = size
                request, response = worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site), url=url,
                        warc_type='resource', content_type=mimetype, payload=f,
-                        extra_headers={'content-length': size})
+                        extra_headers=extra_headers)
                # consulted by _remember_videos()
                self.stitch_ups.append({
                    'url': url,
@ -182,8 +187,14 @@ def _build_youtube_dl(worker, destdir, site):
                if worker._using_warcprox(site):
                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)

-            youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
-            return super().process_info(info_dict)
+            # lock this section to prevent race condition between threads that
+            # want to monkey patch _finish_frag_download() at the same time
+            with global_ydl_lock:
+                try:
+                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
+                    return super().process_info(info_dict)
+                finally:
+                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download

    def maybe_heartbeat_site_last_claimed(*args, **kwargs):
        # in case youtube-dl takes a long time, heartbeat site.last_claimed
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.5.dev303',
+        version='1.5.dev304',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -801,7 +801,10 @@ def test_ydl_stitching(httpd):
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
-        'seed': 'http://localhost:%s/site10/' % httpd.server_port})
+        'seed': 'http://localhost:%s/site10/' % httpd.server_port,
+        'warcprox_meta':  {
+            'warc-prefix': 'test_ydl_stitching',
+            'captures-table-extra-fields': {'test_id':test_id}}})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
@ -816,11 +819,21 @@ def test_ydl_stitching(httpd):
    assert len(pages) == 1
    page = pages[0]
    assert len(page.videos) == 6
+    stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
-        'url': 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port,
+        'url': stitched_url,
    } in page.videos

+    time.sleep(2)   # in case warcprox hasn't finished processing urls
+    # take a look at the captures table
+    captures = list(rr.table('captures').filter({'test_id':test_id}).run())
+    l = [c for c in captures if c['url'] == stitched_url]
+    assert len(l) == 1
+    c = l[0]
+    assert c['filename'].startswith('test_ydl_stitching')
+    assert c['content_type'] == 'video/mp4'
+    assert c['http_method'] == 'WARCPROX_WRITE_RECORD'