mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-24 06:34:54 -04:00
tidy up some comments and docs
This commit is contained in:
parent
8f9077fbf3
commit
054ba6d7a0
1 changed file with 15 additions and 26 deletions
|
@ -1,8 +1,6 @@
|
||||||
'''
|
'''
|
||||||
brozzler/ydl.py - youtube-dl support for brozzler
|
brozzler/ydl.py - youtube-dl support for brozzler
|
||||||
|
|
||||||
This code was extracted from worker.py and
|
|
||||||
|
|
||||||
Copyright (C) 2018 Internet Archive
|
Copyright (C) 2018 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
@ -128,7 +126,7 @@ def _build_youtube_dl(worker, destdir, site):
|
||||||
- if brozzling through warcprox and downloading segmented videos (e.g.
|
- if brozzling through warcprox and downloading segmented videos (e.g.
|
||||||
HLS), pushes the stitched-up video created by youtube-dl to warcprox
|
HLS), pushes the stitched-up video created by youtube-dl to warcprox
|
||||||
using a WARCPROX_WRITE_RECORD request
|
using a WARCPROX_WRITE_RECORD request
|
||||||
- adds some logging
|
- some logging
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
worker (brozzler.BrozzlerWorker): the calling brozzler worker
|
worker (brozzler.BrozzlerWorker): the calling brozzler worker
|
||||||
|
@ -150,17 +148,6 @@ def _build_youtube_dl(worker, destdir, site):
|
||||||
self.logger.debug('fetching %r', url)
|
self.logger.debug('fetching %r', url)
|
||||||
return super().urlopen(req)
|
return super().urlopen(req)
|
||||||
|
|
||||||
# def _match_entry(self, info_dict, incomplete):
|
|
||||||
# if self.dl_disabled:
|
|
||||||
# return 'Downloading disabled (probably youtube playlist)'
|
|
||||||
|
|
||||||
# def extract_info(self, *args, **kwargs):
|
|
||||||
# self.dl_disabled = False
|
|
||||||
# try:
|
|
||||||
# return super().extract_info(*args, **kwargs)
|
|
||||||
# finally:
|
|
||||||
# self.dl_disabled = False
|
|
||||||
|
|
||||||
def add_default_extra_info(self, ie_result, ie, url):
|
def add_default_extra_info(self, ie_result, ie, url):
|
||||||
# hook in some logging
|
# hook in some logging
|
||||||
super().add_default_extra_info(ie_result, ie, url)
|
super().add_default_extra_info(ie_result, ie, url)
|
||||||
|
@ -176,11 +163,10 @@ def _build_youtube_dl(worker, destdir, site):
|
||||||
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
ie_result['entries_no_dl'] = list(ie_result['entries'])
|
||||||
ie_result['entries'] = []
|
ie_result['entries'] = []
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'setting skip_download because this is a youtube '
|
'not downoading %s videos from this youtube '
|
||||||
'playlist (%s entries) and we expect to capture '
|
'playlist because we expect to capture them from '
|
||||||
'videos from individual watch pages',
|
'individual watch pages',
|
||||||
len(ie_result['entries_no_dl']))
|
len(ie_result['entries_no_dl']))
|
||||||
# self.dl_disabled = True
|
|
||||||
else:
|
else:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'extractor %r found a video in %s', ie.IE_NAME, url)
|
'extractor %r found a video in %s', ie.IE_NAME, url)
|
||||||
|
@ -380,20 +366,23 @@ def do_youtube_dl(worker, site, page):
|
||||||
page (brozzler.Page): the page we are brozzling
|
page (brozzler.Page): the page we are brozzling
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
`list` of `dict`: with info about urls fetched:
|
tuple with two entries:
|
||||||
|
`list` of `dict`: with info about urls fetched:
|
||||||
[{
|
[{
|
||||||
'url': ...,
|
'url': ...,
|
||||||
'method': ...,
|
'method': ...,
|
||||||
'response_code': ...,
|
'response_code': ...,
|
||||||
'response_headers': ...,
|
'response_headers': ...,
|
||||||
}, ...]
|
}, ...]
|
||||||
|
`list` of `str`: outlink urls
|
||||||
'''
|
'''
|
||||||
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
ydl = _build_youtube_dl(worker, tempdir, site)
|
ydl = _build_youtube_dl(worker, tempdir, site)
|
||||||
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
ie_result = _try_youtube_dl(worker, ydl, site, page)
|
||||||
outlinks = []
|
outlinks = []
|
||||||
if ie_result['extractor'] == 'youtube:playlist':
|
if ie_result['extractor'] == 'youtube:playlist':
|
||||||
|
# youtube watch pages as outlinks
|
||||||
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
|
outlinks = ['https://www.youtube.com/watch?v=%s' % e['id']
|
||||||
for e in ie_result.get('entries_no_dl', [])]
|
for e in ie_result.get('entries_no_dl', [])]
|
||||||
|
# any outlinks for other cases?
|
||||||
return ydl.fetch_spy.fetches, outlinks
|
return ydl.fetch_spy.fetches, outlinks
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue