Merge pull request #263 from galgeek/yt-dlp-vimeo

yt-dlp: capture postprocessor "Merger" videos
This commit is contained in:
Barbara Miller 2023-10-19 10:23:29 -07:00 committed by GitHub
commit 5b5d4cb062
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 7 additions and 52 deletions

View File

@ -1,7 +1,7 @@
#
# brozzler/behaviors.yaml - behavior configuration
#
# Copyright (C) 2014-2020 Internet Archive
# Copyright (C) 2014-2023 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -63,10 +63,6 @@
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js_template: marquette_edu.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
behavior_js_template: vimeo.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js_template: psu24.js

View File

@ -1,41 +0,0 @@
/*
* brozzler/behaviors.d/vimeo.js - behavior for vimeo.com, clicks to play/crawl
* videos
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Shared behavior state. idleSince holds the Date.now() timestamp (ms) of the
// last action this script performed, or null if it has not acted yet.
var umbraState = {'idleSince': null};

// Ask every <video> element currently in the document to start playing.
var umbraVideoElements = document.getElementsByTagName('video');
var i = 0;
while (i < umbraVideoElements.length) {
    umbraVideoElements[i].play();
    i++;
}

// Starting playback is the only action taken here, so the idle clock starts now.
umbraState.idleSince = Date.now();

// Number of seconds without any further action (scrolling, clicking, etc)
// after which this behavior reports itself as finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Called from outside of this script.
// Returns true once more than UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC seconds have
// elapsed since umbraState.idleSince; returns false otherwise, including when
// idleSince has not been set.
var umbraBehaviorFinished = function() {
    if (umbraState.idleSince == null) {
        return false;
    }
    var elapsedSec = (Date.now() - umbraState.idleSince) / 1000;
    return elapsedSec > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};

View File

@ -1,6 +1,6 @@
'''
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
it runs yt-dlp on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
Copyright (C) 2014-2023 Internet Archive

View File

@ -171,15 +171,15 @@ def _build_youtube_dl(worker, destdir, site, page):
# youtube watch page postprocessor is MoveFiles
if postprocessor == 'FixupM3u8':
if postprocessor == 'FixupM3u8' or postprocessor == 'Merger':
url = 'youtube-dl:%05d:%s' % (
info_dict.get('playlist_index') or 1,
info_dict['webpage_url'])
else:
url = info_dict.get('url')
url = info_dict.get('url', '')
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u*
if url.endswith('.m3u8'):
# skip urls ending .m3u8, to avoid duplicates handled by FixupM3u8
if url.endswith('.m3u8') or url == '':
return
size = os.path.getsize(info_dict['filepath'])
@ -347,7 +347,7 @@ def _try_youtube_dl(worker, ydl, site, page):
except brozzler.ShutdownRequested as e:
raise
except Exception as e:
if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
if hasattr(e, "exc_info") and e.exc_info[0] == yt_dlp.utils.UnsupportedError:
return None
elif (hasattr(e, "exc_info")
and e.exc_info[0] == urllib.error.HTTPError