Merge branch 'master' into qa

* master:
  handle exceptions extracting links
  fix reported chromium crash by removing argument
  bump version after merge
  remove stray bad logging line
  tests expect outlinks to be a set
  tidy up some comments and docs
  watch pages as outlinks from youtube-dl playlists
  silence youtube-dl's logging, use only our own
  use a thread-local callback in monkey-patched _finish_frag_download()
  skip downloading videos from youtube playlists
  trace-level logging for all the chrome output

commit 27ba877932

6 changed files with 105 additions and 73 deletions
brozzler/chrome.py

@@ -166,7 +166,7 @@ class Chrome:
                 '--disable-background-networking',
                 '--disable-renderer-backgrounding', '--disable-hang-monitor',
                 '--disable-background-timer-throttling', '--mute-audio',
-                '--disable-web-sockets', '--disable-cache', '--single-process',
+                '--disable-web-sockets', '--disable-cache',
                 '--window-size=1100,900', '--no-default-browser-check',
                 '--disable-first-run-ui', '--no-first-run',
                 '--homepage=about:blank', '--disable-direct-npapi-requests',
@@ -244,33 +244,15 @@ class Chrome:
             while not self._shutdown.is_set():
                 buf = readline_nonblock(self.chrome_process.stdout)
                 if buf:
-                    if re.search(
-                            b'Xlib: extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                            'chrome pid %s STDOUT %s',
-                            self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                            'chrome pid %s STDOUT %s',
-                            self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDOUT %s',
+                            self.chrome_process.pid, buf)

                 buf = readline_nonblock(self.chrome_process.stderr)
                 if buf:
-                    if re.search(
-                            b'Xlib: extension|'
-                            b'CERT_PKIXVerifyCert for [^ ]* failed|'
-                            b'^ALSA lib|ERROR:gl_surface_glx.cc|'
-                            b'ERROR:gpu_child_thread.cc', buf):
-                        self.logger.trace(
-                            'chrome pid %s STDOUT %s',
-                            self.chrome_process.pid, buf)
-                    else:
-                        self.logger.debug(
-                            'chrome pid %s STDERR %s',
-                            self.chrome_process.pid, buf)
+                    self.logger.trace(
+                            'chrome pid %s STDERR %s',
+                            self.chrome_process.pid, buf)
         except:
             self.logger.error('unexpected exception', exc_info=True)
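A note on the hunk above: every line of chrome output now goes through `self.logger.trace(...)`. TRACE is not a built-in Python logging level; brozzler registers its own level below DEBUG elsewhere in the codebase (the registration is not part of this diff). A minimal sketch of how such a level can be wired up, with illustrative names and numeric value rather than brozzler's actual code:

    import logging

    TRACE = 5  # below DEBUG (10); illustrative value
    logging.addLevelName(TRACE, 'TRACE')

    def _trace(self, msg, *args, **kwargs):
        # mirror the stdlib convention used by Logger.debug() etc.
        if self.isEnabledFor(TRACE):
            self._log(TRACE, msg, args, **kwargs)

    logging.Logger.trace = _trace

    logging.basicConfig(level=TRACE)
    logging.getLogger('demo').trace('chrome pid %s STDOUT %s', 1234, b'...')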
brozzler/js-templates/extract-outlinks.js

@@ -3,16 +3,22 @@
 var __brzl_framesDone = new Set();
 var __brzl_compileOutlinks = function(frame) {
     __brzl_framesDone.add(frame);
-    if (frame && frame.document) {
-        var outlinks = Array.prototype.slice.call(
-                frame.document.querySelectorAll('a[href], area[href]'));
-        for (var i = 0; i < frame.frames.length; i++) {
-            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
-                outlinks = outlinks.concat(
-                        __brzl_compileOutlinks(frame.frames[i]));
+    var outlinks = [];
+    try {
+        if (frame && frame.document) {
+            outlinks = Array.prototype.slice.call(
+                    frame.document.querySelectorAll('a[href], area[href]'));
+            for (var i = 0; i < frame.frames.length; i++) {
+                if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
+                    outlinks = outlinks.concat(
+                            __brzl_compileOutlinks(frame.frames[i]));
+                }
             }
         }
+    } catch (e) {
+        console.log("exception looking at frame" + frame + ": " + e);
     }
+
     return outlinks;
 }
 __brzl_compileOutlinks(window).join('\n');
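The script's final expression, `__brzl_compileOutlinks(window).join('\n')`, is what the browser evaluates and hands back to the Python side as a single newline-joined string. Per the commit message, the tests now expect outlinks to end up in a set. A hedged sketch of that conversion (the helper name is hypothetical, not brozzler's actual function):

    def outlinks_str_to_set(result):
        # one url per line; drop empties, dedupe via set
        return {line.strip() for line in (result or '').split('\n') if line.strip()}

    assert outlinks_str_to_set('http://a/\nhttp://b/\nhttp://a/') == {
            'http://a/', 'http://b/'}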
brozzler/worker.py

@@ -186,9 +186,10 @@ class BrozzlerWorker:
             on_request=None, enable_youtube_dl=True):
         self.logger.info("brozzling {}".format(page))
         ydl_fetches = None
+        outlinks = set()
         if enable_youtube_dl:
             try:
-                ydl_fetches = ydl.do_youtube_dl(self, site, page)
+                ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
                 raise
             except brozzler.ShutdownRequested:

@@ -210,10 +211,10 @@
         if self._needs_browsing(page, ydl_fetches):
             self.logger.info('needs browsing: %s', page)
             try:
-                outlinks = self._browse_page(browser, site, page, on_screenshot,
-                        on_request)
+                browser_outlinks = self._browse_page(
+                        browser, site, page, on_screenshot, on_request)
+                outlinks.update(browser_outlinks)
             except brozzler.PageInterstitialShown:
-                outlinks = []
                 self.logger.info('page interstitial shown (http auth): %s', page)
             return outlinks
         else:

@@ -222,7 +223,8 @@
                 self._fetch_url(site, page)
             else:
                 self.logger.info('already fetched: %s', page)
-        return []
+
+        return outlinks

     def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
         def _on_screenshot(screenshot_png):
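Taken together, the three worker.py hunks change `brozzle_page()` from returning only browser outlinks to accumulating one set from both sources: the outlinks youtube-dl derived (playlist watch pages) plus whatever the browser extracts. A stripped-down sketch of just that control flow, with stand-in callables rather than the real worker methods:

    def brozzle_page_outline(do_youtube_dl, needs_browsing, browse_page):
        # stand-ins for ydl.do_youtube_dl(), self._needs_browsing(),
        # self._browse_page(); not brozzler's actual signatures
        ydl_fetches, outlinks = do_youtube_dl()
        if needs_browsing(ydl_fetches):
            outlinks.update(browse_page())
        return outlinks

    got = brozzle_page_outline(
            lambda: ([], {'https://www.youtube.com/watch?v=abc'}),
            lambda fetches: True,
            lambda: {'https://example.com/'})
    assert got == {'https://www.youtube.com/watch?v=abc', 'https://example.com/'}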
brozzler/ydl.py (112 changes)
@@ -1,8 +1,6 @@
 '''
 brozzler/ydl.py - youtube-dl support for brozzler

-This code was extracted from worker.py and
-
 Copyright (C) 2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
@@ -30,7 +28,21 @@ import doublethink
 import datetime
 import threading

-global_ydl_lock = threading.Lock()
+thread_local = threading.local()
+
+_orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
+def _finish_frag_download(ffd_self, ctx):
+    '''
+    We monkey-patch this youtube-dl internal method `_finish_frag_download()`
+    because it gets called after downloading the last segment of a segmented
+    video, which is a good time to upload the stitched-up video that youtube-dl
+    creates for us to warcprox. We have it call a thread-local callback
+    since different threads may be youtube-dl'ing at the same time.
+    '''
+    result = _orig__finish_frag_download(ffd_self, ctx)
+    if hasattr(thread_local, 'finish_frag_download_callback'):
+        thread_local.finish_frag_download_callback(ffd_self, ctx)
+    return result
+youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download

 _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
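The idea in this hunk: patch `FragmentFD._finish_frag_download` once, at import time, and let each thread opt in through `thread_local`; that is what lets the old `global_ydl_lock` go away (see the `process_info()` hunk further down). A self-contained demonstration of the same pattern on a toy class, nothing youtube-dl specific:

    import threading

    thread_local = threading.local()

    class Downloader:
        def finish(self):
            return 'finished'

    # patch once, module-wide; per-thread behavior comes from thread_local
    _orig_finish = Downloader.finish
    def _finish(self):
        result = _orig_finish(self)
        if hasattr(thread_local, 'finish_callback'):
            thread_local.finish_callback(self)
        return result
    Downloader.finish = _finish

    def work(name, log):
        thread_local.finish_callback = lambda dl: log.append(name)
        try:
            Downloader().finish()
        finally:
            del thread_local.finish_callback

    log = []
    threads = [threading.Thread(target=work, args=('t%d' % i, log))
               for i in range(3)]
    for t in threads: t.start()
    for t in threads: t.join()
    assert sorted(log) == ['t0', 't1', 't2']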
@@ -111,9 +123,10 @@ def _build_youtube_dl(worker, destdir, site):

     - keeps track of urls fetched using a `YoutubeDLSpy`
     - periodically updates `site.last_claimed` in rethinkdb
-    - if brozzling through warcprox and downloading fragmented (DASH) videos,
-      pushes the stitched together video to warcprox using a
-      WARCPROX_WRITE_RECORD request
+    - if brozzling through warcprox and downloading segmented videos (e.g.
+      HLS), pushes the stitched-up video created by youtube-dl to warcprox
+      using a WARCPROX_WRITE_RECORD request
+    - some logging

     Args:
         worker (brozzler.BrozzlerWorker): the calling brozzler worker
@@ -127,12 +140,33 @@ def _build_youtube_dl(worker, destdir, site):
     class _YoutubeDL(youtube_dl.YoutubeDL):
         logger = logging.getLogger(__module__ + "." + __qualname__)

+        def urlopen(self, req):
+            try:
+                url = req.full_url
+            except AttributeError:
+                url = req
+            self.logger.debug('fetching %r', url)
+            return super().urlopen(req)
+
         def add_default_extra_info(self, ie_result, ie, url):
             # hook in some logging
             super().add_default_extra_info(ie_result, ie, url)
             if ie_result.get('_type') == 'playlist':
                 self.logger.info(
                         'extractor %r found playlist in %s', ie.IE_NAME, url)
+                if ie.IE_NAME == 'youtube:playlist':
+                    # At this point ie_result['entries'] is an iterator that
+                    # will fetch more metadata from youtube to list all the
+                    # videos. We unroll that iterator here partly because
+                    # otherwise `process_ie_result()` will clobber it, and we
+                    # use it later to extract the watch pages as outlinks.
+                    ie_result['entries_no_dl'] = list(ie_result['entries'])
+                    ie_result['entries'] = []
+                    self.logger.info(
+                            'not downloading %s videos from this youtube '
+                            'playlist because we expect to capture them from '
+                            'individual watch pages',
+                            len(ie_result['entries_no_dl']))
             else:
                 self.logger.info(
                         'extractor %r found a video in %s', ie.IE_NAME, url)
@@ -180,21 +214,17 @@ def _build_youtube_dl(worker, destdir, site):
             })

         def process_info(self, info_dict):
-            # lock this section to prevent race condition between threads that
-            # want to monkey patch _finish_frag_download() at the same time
-            with global_ydl_lock:
-                _orig__finish_frag_download = youtube_dl.downloader.fragment.FragmentFD._finish_frag_download
-
-                def _finish_frag_download(ffd_self, ctx):
-                    _orig__finish_frag_download(ffd_self, ctx)
-                    if worker._using_warcprox(site):
-                        self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
-
-                try:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _finish_frag_download
-                    return super().process_info(info_dict)
-                finally:
-                    youtube_dl.downloader.fragment.FragmentFD._finish_frag_download = _orig__finish_frag_download
+            '''
+            See comment above on `_finish_frag_download()`
+            '''
+            def ffd_callback(ffd_self, ctx):
+                if worker._using_warcprox(site):
+                    self._push_stitched_up_vid_to_warcprox(site, info_dict, ctx)
+            try:
+                thread_local.finish_frag_download_callback = ffd_callback
+                return super().process_info(info_dict)
+            finally:
+                delattr(thread_local, 'finish_frag_download_callback')

         def maybe_heartbeat_site_last_claimed(*args, **kwargs):
             # in case youtube-dl takes a long time, heartbeat site.last_claimed
@@ -213,20 +243,24 @@ def _build_youtube_dl(worker, destdir, site):
+
     ydl_opts = {
         "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
-        "verbose": False,
         "retries": 1,
-        "logger": logging.getLogger("youtube_dl"),
         "nocheckcertificate": True,
         "hls_prefer_native": True,
         "noprogress": True,
         "nopart": True,
         "no_color": True,
         "progress_hooks": [maybe_heartbeat_site_last_claimed],

         # https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
         # "best: Select the best quality format represented by a single
         # file with video and audio."
         "format": "best/bestvideo+bestaudio",
         "youtube_include_dash_manifest": False,
+
+        ### we do our own logging
+        # "logger": logging.getLogger("youtube_dl"),
+        "verbose": False,
+        "quiet": True,
     }
     if worker._proxy_for(site):
         ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
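With `"quiet": True`, `"verbose": False`, and no `"logger"` entry, youtube-dl's own chatter stays out of brozzler's logs, and the hooks above (`urlopen()`, `add_default_extra_info()`) do the logging instead. A minimal sketch of constructing a quiet YoutubeDL with just these options (assumes the youtube_dl package is installed; the real ydl_opts above carries more keys):

    import youtube_dl

    ydl = youtube_dl.YoutubeDL({
        'quiet': True,       # suppress youtube-dl's normal stdout messages
        'verbose': False,
        'no_color': True,
        # deliberately no 'logger' key: we do our own logging
    })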
@@ -300,11 +334,12 @@ def _try_youtube_dl(worker, ydl, site, page):
                 content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                 payload=info_json.encode("utf-8"),
                 extra_headers=site.extra_headers())
+        return ie_result
     except brozzler.ShutdownRequested as e:
         raise
-    except BaseException as e:
+    except Exception as e:
         if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
-            pass
+            return None
         elif (hasattr(e, "exc_info")
                 and e.exc_info[0] == urllib.error.HTTPError
                 and hasattr(e.exc_info[1], "code")
@@ -331,16 +366,23 @@ def do_youtube_dl(worker, site, page):
         page (brozzler.Page): the page we are brozzling

     Returns:
-        `list` of `dict`: with info about urls fetched:
+        tuple with two entries:
+            `list` of `dict`: with info about urls fetched:
                 [{
                     'url': ...,
                     'method': ...,
                     'response_code': ...,
                     'response_headers': ...,
                 }, ...]
+            `list` of `str`: outlink urls
     '''
     with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
         ydl = _build_youtube_dl(worker, tempdir, site)
-        _try_youtube_dl(worker, ydl, site, page)
-        return ydl.fetch_spy.fetches
+        ie_result = _try_youtube_dl(worker, ydl, site, page)
+        outlinks = set()
+        if ie_result and ie_result.get('extractor') == 'youtube:playlist':
+            # youtube watch pages as outlinks
+            outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
+                    for e in ie_result.get('entries_no_dl', [])}
+        # any outlinks for other cases?
+        return ydl.fetch_spy.fetches, outlinks
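The playlist branch is the only case that currently yields outlinks: each entry unrolled into `entries_no_dl` by `add_default_extra_info()` becomes a watch-page url. Worked through with made-up entry ids:

    # illustrative ie_result after the playlist unroll; field values invented
    ie_result = {
        'extractor': 'youtube:playlist',
        'entries_no_dl': [{'id': 'abc123'}, {'id': 'def456'}],
    }
    outlinks = {'https://www.youtube.com/watch?v=%s' % e['id']
            for e in ie_result.get('entries_no_dl', [])}
    assert outlinks == {'https://www.youtube.com/watch?v=abc123',
                        'https://www.youtube.com/watch?v=def456'}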
setup.py (2 changes)

@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
     name='brozzler',
-    version='1.5.dev308',
+    version='1.5.dev312',
     description='Distributed web crawling with browsers',
     url='https://github.com/internetarchive/brozzler',
     author='Noah Levitt',
tests/test_brozzling.py

@@ -2,7 +2,7 @@
 '''
 test_brozzling.py - XXX explain

-Copyright (C) 2016-2017 Internet Archive
+Copyright (C) 2016-2018 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.