Merge branch 'master' into qa

* master:
  handle exceptions extracting links
  fix reported chromium crash by removing argument
  bump version after merge
  remove stray bad logging line
  tests expect outlinks to be a set
  tidy up some comments and docs
  watch pages as outlinks from youtube-dl playlists
  silence youtube-dl's logging, use only our own
  use a thread-local callback in monkey-patched
  skip downloading videos from youtube playlists
  trace-level logging for all the chrome output
commit 27ba877932
Noah Levitt, 2018-10-29 17:45:09 -07:00
6 changed files with 105 additions and 73 deletions

brozzler/chrome.py

@@ -166,7 +166,7 @@ class Chrome:
                 '--disable-background-networking',
                 '--disable-renderer-backgrounding', '--disable-hang-monitor',
                 '--disable-background-timer-throttling', '--mute-audio',
-                '--disable-web-sockets', '--disable-cache', '--single-process',
+                '--disable-web-sockets', '--disable-cache',
                 '--window-size=1100,900', '--no-default-browser-check',
                 '--disable-first-run-ui', '--no-first-run',
                 '--homepage=about:blank', '--disable-direct-npapi-requests',
@@ -244,33 +244,15 @@ class Chrome:
             while not self._shutdown.is_set():
                 buf = readline_nonblock(self.chrome_process.stdout)
                 if buf:
-                    if re.search(
-                            b'Xlib: extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDOUT %s',
+                            self.chrome_process.pid, buf)
                 buf = readline_nonblock(self.chrome_process.stderr)
                 if buf:
-                    if re.search(
-                            b'Xlib: extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                                'chrome pid %s STDOUT %s',
-                                self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                                'chrome pid %s STDERR %s',
-                                self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDERR %s',
+                            self.chrome_process.pid, buf)
         except:
             self.logger.error('unexpected exception', exc_info=True)
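
A note on `self.logger.trace(...)` above: the stdlib `logging` module has no `trace()` method or TRACE level; brozzler registers its own level below DEBUG so that the chrome chatter is filtered out unless explicitly enabled. That wiring is not part of this diff, so here is a minimal sketch of how such a level can be registered -- the level number 5 and the helper name `_trace` are illustrative assumptions, not brozzler's actual code:

    import logging

    TRACE = 5  # assumption: any value below logging.DEBUG (10) would do
    logging.addLevelName(TRACE, 'TRACE')

    def _trace(self, msg, *args, **kwargs):
        # same isEnabledFor() guard that Logger.debug() uses
        if self.isEnabledFor(TRACE):
            self._log(TRACE, msg, args, **kwargs)

    logging.Logger.trace = _trace

    logging.basicConfig(level=TRACE)
    logging.getLogger('chrome').trace('chrome pid %s STDOUT %s', 1234, b'...')

With the root logger left at its default WARNING level, these trace records are simply dropped, which is the point of demoting all of the chrome output.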

brozzler/js-templates/extract-outlinks.js

@@ -3,16 +3,22 @@
 var __brzl_framesDone = new Set();
 var __brzl_compileOutlinks = function(frame) {
     __brzl_framesDone.add(frame);
-    if (frame && frame.document) {
-        var outlinks = Array.prototype.slice.call(
-                frame.document.querySelectorAll('a[href], area[href]'));
-        for (var i = 0; i < frame.frames.length; i++) {
-            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
-                outlinks = outlinks.concat(
-                        __brzl_compileOutlinks(frame.frames[i]));
+    var outlinks = [];
+    try {
+        if (frame && frame.document) {
+            outlinks = Array.prototype.slice.call(
+                    frame.document.querySelectorAll('a[href], area[href]'));
+            for (var i = 0; i < frame.frames.length; i++) {
+                if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
+                    outlinks = outlinks.concat(
+                            __brzl_compileOutlinks(frame.frames[i]));
+                }
             }
         }
+    } catch (e) {
+        console.log("exception looking at frame" + frame + ": " + e);
     }
     return outlinks;
 }
 __brzl_compileOutlinks(window).join('\n');
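
Two things happen in this change (the "handle exceptions extracting links" item above). Initializing `outlinks` before the `if` means the function can no longer return an unassigned variable: under `var` hoisting, the old code returned `undefined` whenever `frame.document` was absent, which at best injected `undefined` entries into the caller's accumulated list and, at the top level, made `.join('\n')` throw a TypeError. And the new `try/catch` most likely guards against cross-origin iframes, where merely touching `frame.document` throws a SecurityError that previously aborted outlink extraction for the whole page.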

brozzler/worker.py

@@ -186,9 +186,10 @@ class BrozzlerWorker:
             on_request=None, enable_youtube_dl=True):
         self.logger.info("brozzling {}".format(page))
         ydl_fetches = None
+        outlinks = set()
         if enable_youtube_dl:
             try:
-                ydl_fetches = ydl.do_youtube_dl(self, site, page)
+                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:
@@ -210,10 +211,10 @@ class BrozzlerWorker:
         if self._needs_browsing(page, ydl_fetches):
             self.logger.info('needs browsing: %s', page)
             try:
-                outlinks = self._browse_page(browser, site, page, on_screenshot,
-                        on_request)
+                browser_outlinks = self._browse_page(
+                        browser, site, page, on_screenshot, on_request)
+                outlinks.update(browser_outlinks)
             except brozzler.PageInterstitialShown:
-                outlinks = []
                 self.logger.info('page interstitial shown (http auth): %s', page)
             return outlinks
         else:
@@ -222,7 +223,8 @@ class BrozzlerWorker:
                 self._fetch_url(site, page)
             else:
                 self.logger.info('already fetched: %s', page)
-            return []
+            return outlinks

     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_png):
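
Initializing `outlinks = set()` at the top of `brozzle_page()` gives every exit path one consistent return type (previously the method variously returned `_browse_page()`'s result, `[]` after an interstitial, and `[]` for already-fetched pages), deduplicates repeated urls for free, and lets the youtube-dl playlist outlinks and the browser-extracted outlinks merge with a plain `update()`. The "tests expect outlinks to be a set" item in the merge message refers to this change.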

brozzler/ydl.py

@@ -1,8 +1,6 @@
 '''
 brozzler/ydl.py - youtube-dl support for brozzler
-This code was extracted from worker.py and

 Copyright (C) 2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,7 +28,21 @@ import doublethink
 import datetime
 import threading

-global_ydl_lock = threading.Lock()
+thread_local = threading.local()
+
+_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
+def _finish_frag_download(ffd_self, ctx):
+    '''
+    We monkey-patch this youtube-dl internal method `_finish_frag_download()`
+    because it gets called after downloading the last segment of a segmented
+    video, which is a good time to upload the stitched-up video that youtube-dl
+    creates for us to warcprox. We have it call a thread-local callback
+    since different threads may be youtube-dl'ing at the same time.
+    '''
+    result = _orig__finish_frag_download(ffd_self, ctx)
+    if hasattr(thread_local, 'finish_frag_download_callback'):
+        thread_local.finish_frag_download_callback(ffd_self, ctx)
+    return result
+youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download

 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
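
The pattern introduced here -- patch the third-party internal exactly once at import time, and let each thread opt in through a `threading.local()` -- replaces the old approach of re-monkey-patching under a global lock on every call. A self-contained sketch of the same idea; `Downloader`, `finish()`, and `worker()` are hypothetical stand-ins for `FragmentFD._finish_frag_download` and brozzler's per-site youtube-dl session:

    import threading

    thread_local = threading.local()

    class Downloader:
        # stand-in for youtube_dl.downloader.fragment.FragmentFD
        def finish(self, ctx):
            return 'stitched-up video for %s' % ctx

    _orig_finish = Downloader.finish
    def _patched_finish(self, ctx):
        result = _orig_finish(self, ctx)
        # only the thread that registered a callback gets notified
        if hasattr(thread_local, 'callback'):
            thread_local.callback(result)
        return result
    Downloader.finish = _patched_finish  # patch once, at import time

    def worker(ctx, results):
        try:
            thread_local.callback = lambda r: results.append((ctx, r))
            Downloader().finish(ctx)
        finally:
            delattr(thread_local, 'callback')

    results = []
    threads = [threading.Thread(target=worker, args=(c, results))
               for c in ('video-a', 'video-b')]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(results)  # each thread saw only its own download finish

Because the patch is installed once and the per-thread state lives in `thread_local`, concurrent youtube-dl sessions no longer race to swap the method in and out, which is what `global_ydl_lock` existed to prevent.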
@@ -111,9 +123,10 @@ def _build_youtube_dl(worker, destdir, site):
     - keeps track of urls fetched using a `YoutubeDLSpy`
     - periodically updates `site.last_claimed` in rethinkdb
-    - if brozzling through warcprox and downloading fragmented (DASH) videos,
-      pushes the stitched together video to warcprox using a
-      WARCPROX_WRITE_RECORD request
+    - if brozzling through warcprox and downloading segmented videos (e.g.
+      HLS), pushes the stitched-up video created by youtube-dl to warcprox
+      using a WARCPROX_WRITE_RECORD request
+    - some logging

     Args:
         worker (brozzler.BrozzlerWorker): the calling brozzler worker
@@ -127,12 +140,33 @@ def _build_youtube_dl(worker, destdir, site):
     class _YoutubeDL(youtube_dl.YoutubeDL):
         logger = logging.getLogger(__module__ + "." + __qualname__)

+        def urlopen(self, req):
+            try:
+                url = req.full_url
+            except AttributeError:
+                url = req
+            self.logger.debug('fetching %r', url)
+            return super().urlopen(req)
+
         def add_default_extra_info(self, ie_result, ie, url):
             # hook in some logging
             super().add_default_extra_info(ie_result, ie, url)
             if ie_result.get('_type') == 'playlist':
                 self.logger.info(
                         'extractor %r found playlist in %s', ie.IE_NAME, url)
+                if ie.IE_NAME == 'youtube:playlist':
+                    # At this point ie_result['entries'] is an iterator that
+                    # will fetch more metadata from youtube to list all the
+                    # videos. We unroll that iterator here partly because
+                    # otherwise `process_ie_result()` will clobber it, and we
+                    # use it later to extract the watch pages as outlinks.
+                    ie_result['entries_no_dl'] = list(ie_result['entries'])
+                    ie_result['entries'] = []
+                    self.logger.info(
+                            'not downoading %s videos from this youtube '
+                            'playlist because we expect to capture them from '
+                            'individual watch pages',
+                            len(ie_result['entries_no_dl']))
             else:
                 self.logger.info(
                         'extractor %r found a video in %s', ie.IE_NAME, url)
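
The unroll into `entries_no_dl` matters because youtube-dl supplies `ie_result['entries']` as a one-shot iterator: once `process_ie_result()` consumes or replaces it, the video list is gone. Copying it first, and emptying `entries` so nothing gets downloaded, preserves exactly the information needed later to emit the playlist's watch pages as outlinks.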
@@ -180,21 +214,17 @@ def _build_youtube_dl(worker, destdir, site):
                     })

         def process_info(self, info_dict):
-            # lock this section to prevent race condition between threads that
-            # want to monkey patch _finish_frag_download() at the same time
-            with global_ydl_lock:
-                _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
-                def _finish_frag_download(ffd_self, ctx):
-                    _orig__finish_frag_download(ffd_self, ctx)
-                    if worker._using_warcprox(site):
-                        self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
-                try:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
-                    return super().process_info(info_dict)
-                finally:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download
+            '''
+            See comment above on `_finish_frag_download()`
+            '''
+            def ffd_callback(ffd_self, ctx):
+                if worker._using_warcprox(site):
+                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
+            try:
+                thread_local.finish_frag_download_callback = ffd_callback
+                return super().process_info(info_dict)
+            finally:
+                delattr(thread_local, 'finish_frag_download_callback')

     def maybe_heartbeat_site_last_claimed(*args, **kwargs):
         # in case youtube-dl takes a long time, heartbeat site.last_claimed
@@ -213,20 +243,24 @@ def _build_youtube_dl(worker, destdir, site):
     ydl_opts = {
         "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
-        "verbose": False,
         "retries": 1,
-        "logger": logging.getLogger("youtube_dl"),
         "nocheckcertificate": True,
         "hls_prefer_native": True,
         "noprogress": True,
         "nopart": True,
         "no_color": True,
         "progress_hooks": [maybe_heartbeat_site_last_claimed],
         # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
         # "best: Select the best quality format represented by a single
         # file with video and audio."
         "format": "best/bestvideo+bestaudio",
         "youtube_include_dash_manifest": False,
+
+        ### we do our own logging
+        # "logger": logging.getLogger("youtube_dl"),
+        "verbose": False,
+        "quiet": True,
     }
     if worker._proxy_for(site):
         ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
@@ -300,11 +334,12 @@ def _try_youtube_dl(worker, ydl, site, page):
                     content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                     payload=info_json.encode("utf-8"),
                     extra_headers=site.extra_headers())
+        return ie_result
     except brozzler.ShutdownRequested as e:
         raise
-    except BaseException as e:
+    except Exception as e:
         if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
-            pass
+            return None
         elif (hasattr(e, "exc_info")
                 and e.exc_info[0] == urllib.error.HTTPError
                 and hasattr(e.exc_info[1], "code")
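
Two small but meaningful fixes here: `except BaseException` would also have caught `KeyboardInterrupt` and `SystemExit`, so narrowing it to `Exception` lets those propagate, and the explicit `return ie_result` / `return None` lets the caller distinguish a successful extraction (whose `ie_result` may describe a playlist) from an unsupported url.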
@@ -331,16 +366,23 @@ def do_youtube_dl(worker, site, page):
         page (brozzler.Page): the page we are brozzling

     Returns:
-        `list` of `dict`: with info about urls fetched:
-            [{
-                'url': ...,
-                'method': ...,
-                'response_code': ...,
-                'response_headers': ...,
-            }, ...]
+        tuple with two entries:
+            `list` of `dict`: with info about urls fetched:
+                [{
+                    'url': ...,
+                    'method': ...,
+                    'response_code': ...,
+                    'response_headers': ...,
+                }, ...]
+            `list` of `str`: outlink urls
     '''
     with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site)
-        _try_youtube_dl(worker, ydl, site, page)
-        return ydl.fetch_spy.fetches
+        ie_result = _try_youtube_dl(worker, ydl, site, page)
+        outlinks = set()
+        if ie_result and ie_result.get('extractor') == 'youtube:playlist':
+            # youtube watch pages as outlinks
+            outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
+                    for e in ie_result.get('entries_no_dl', [])}
+        # any outlinks for other cases?
+        return ydl.fetch_spy.fetches, outlinks

setup.py

@@ -32,7 +32,7 @@ def find_package_data(package):
 setuptools.setup(
         name='brozzler',
-        version='1.5.dev308',
+        version='1.5.dev312',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',

tests/test_brozzling.py

@@ -2,7 +2,7 @@
 '''
 test_brozzling.py - XXX explain

-Copyright (C) 2016-2017 Internet Archive
+Copyright (C) 2016-2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.