diff --git a/config/config.example.yml b/config/config.example.yml index e925a5e3..b44fcc0e 100644 --- a/config/config.example.yml +++ b/config/config.example.yml @@ -161,6 +161,19 @@ https_only: false #force_resolve: +## +## Use Innertube's transcripts API instead of timedtext for closed captions +## +## Useful for larger instances as InnerTube is **not ratelimited**. See https://github.com/iv-org/invidious/issues/2567 +## +## Subtitle experience may differ slightly on Invidious. +## +## Accepted values: true, false +## Default: false +## +# use_innertube_for_captions: false + + # ----------------------------- # Logging # ----------------------------- diff --git a/src/invidious/config.cr b/src/invidious/config.cr index cee33ce1..429d9246 100644 --- a/src/invidious/config.cr +++ b/src/invidious/config.cr @@ -127,6 +127,9 @@ class Config # Pool size for HTTP requests to youtube.com and ytimg.com (each domain has a separate pool of `pool_size`) property pool_size : Int32 = 100 + # Use Innertube's transcripts API instead of timedtext for closed captions + property use_innertube_for_captions : Bool = false + # Saved cookies in "name1=value1; name2=value2..." format @[YAML::Field(converter: Preferences::StringToCookies)] property cookies : HTTP::Cookies = HTTP::Cookies.new diff --git a/src/invidious/frontend/watch_page.cr b/src/invidious/frontend/watch_page.cr index e3214469..5fd81168 100644 --- a/src/invidious/frontend/watch_page.cr +++ b/src/invidious/frontend/watch_page.cr @@ -7,7 +7,7 @@ module Invidious::Frontend::WatchPage getter full_videos : Array(Hash(String, JSON::Any)) getter video_streams : Array(Hash(String, JSON::Any)) getter audio_streams : Array(Hash(String, JSON::Any)) - getter captions : Array(Invidious::Videos::Caption) + getter captions : Array(Invidious::Videos::Captions::Metadata) def initialize( @full_videos, diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr index af4fc806..25e766d2 100644 --- a/src/invidious/routes/api/v1/videos.cr +++ b/src/invidious/routes/api/v1/videos.cr @@ -87,70 +87,78 @@ module Invidious::Routes::API::V1::Videos caption = caption[0] end - url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target + if CONFIG.use_innertube_for_captions + params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated) + initial_data = YoutubeAPI.get_transcript(params) - # Auto-generated captions often have cues that aren't aligned properly with the video, - # as well as some other markup that makes it cumbersome, so we try to fix that here - if caption.name.includes? "auto-generated" - caption_xml = YT_POOL.client &.get(url).body + webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code) + else + # Timedtext API handling + url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target - if caption_xml.starts_with?(" i + 1 - end_time = caption_nodes[i + 1]["start"].to_f.seconds - else - end_time = start_time + duration + if caption_nodes.size > i + 1 + end_time = caption_nodes[i + 1]["start"].to_f.seconds + else + end_time = start_time + duration + end + + start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" + end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" + + text = HTML.unescape(node.content) + text = text.gsub(//, "") + text = text.gsub(/<\/font>/, "") + if md = text.match(/(?.*) : (?.*)/) + text = "#{md["text"]}" + end + + str << <<-END_CUE + #{start_time} --> #{end_time} + #{text} + + + END_CUE end - - start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" - end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" - - text = HTML.unescape(node.content) - text = text.gsub(//, "") - text = text.gsub(/<\/font>/, "") - if md = text.match(/(?.*) : (?.*)/) - text = "#{md["text"]}" - end - - str << <<-END_CUE - #{start_time} --> #{end_time} - #{text} - - - END_CUE end end - end - else - # Some captions have "align:[start/end]" and "position:[num]%" - # attributes. Those are causing issues with VideoJS, which is unable - # to properly align the captions on the video, so we remove them. - # - # See: https://github.com/iv-org/invidious/issues/2391 - webvtt = YT_POOL.client &.get("#{url}&format=vtt").body - if webvtt.starts_with?(" [0-9:.]{12}).+/, "\\1") + if webvtt.starts_with?(" [0-9:.]{12}).+/, "\\1") + end end end diff --git a/src/invidious/videos.cr b/src/invidious/videos.cr index f38b33e5..9fbd1374 100644 --- a/src/invidious/videos.cr +++ b/src/invidious/videos.cr @@ -24,7 +24,7 @@ struct Video property updated : Time @[DB::Field(ignore: true)] - @captions = [] of Invidious::Videos::Caption + @captions = [] of Invidious::Videos::Captions::Metadata @[DB::Field(ignore: true)] property adaptive_fmts : Array(Hash(String, JSON::Any))? @@ -215,9 +215,9 @@ struct Video keywords.includes? "YouTube Red" end - def captions : Array(Invidious::Videos::Caption) + def captions : Array(Invidious::Videos::Captions::Metadata) if @captions.empty? && @info.has_key?("captions") - @captions = Invidious::Videos::Caption.from_yt_json(info["captions"]) + @captions = Invidious::Videos::Captions::Metadata.from_yt_json(info["captions"]) end return @captions diff --git a/src/invidious/videos/caption.cr b/src/invidious/videos/caption.cr index 13f81a31..256dfcc0 100644 --- a/src/invidious/videos/caption.cr +++ b/src/invidious/videos/caption.cr @@ -1,100 +1,106 @@ require "json" module Invidious::Videos - struct Caption - property name : String - property language_code : String - property base_url : String + module Captions + struct Metadata + property name : String + property language_code : String + property base_url : String - def initialize(@name, @language_code, @base_url) - end + property auto_generated : Bool - # Parse the JSON structure from Youtube - def self.from_yt_json(container : JSON::Any) : Array(Caption) - caption_tracks = container - .dig?("playerCaptionsTracklistRenderer", "captionTracks") - .try &.as_a - - captions_list = [] of Caption - return captions_list if caption_tracks.nil? - - caption_tracks.each do |caption| - name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"] - name = name.to_s.split(" - ")[0] - - language_code = caption["languageCode"].to_s - base_url = caption["baseUrl"].to_s - - captions_list << Caption.new(name, language_code, base_url) + def initialize(@name, @language_code, @base_url, @auto_generated) end - return captions_list - end + # Parse the JSON structure from Youtube + def self.from_yt_json(container : JSON::Any) : Array(Captions::Metadata) + caption_tracks = container + .dig?("playerCaptionsTracklistRenderer", "captionTracks") + .try &.as_a - def timedtext_to_vtt(timedtext : String, tlang = nil) : String - # In the future, we could just directly work with the url. This is more of a POC - cues = [] of XML::Node - tree = XML.parse(timedtext) - tree = tree.children.first + captions_list = [] of Captions::Metadata + return captions_list if caption_tracks.nil? - tree.children.each do |item| - if item.name == "body" - item.children.each do |cue| - if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n") - cues << cue + caption_tracks.each do |caption| + name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"] + name = name.to_s.split(" - ")[0] + + language_code = caption["languageCode"].to_s + base_url = caption["baseUrl"].to_s + + auto_generated = (caption["kind"]? == "asr") + + captions_list << Captions::Metadata.new(name, language_code, base_url, auto_generated) + end + + return captions_list + end + + def timedtext_to_vtt(timedtext : String, tlang = nil) : String + # In the future, we could just directly work with the url. This is more of a POC + cues = [] of XML::Node + tree = XML.parse(timedtext) + tree = tree.children.first + + tree.children.each do |item| + if item.name == "body" + item.children.each do |cue| + if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n") + cues << cue + end end + break end - break end - end - result = String.build do |result| - result << <<-END_VTT - WEBVTT - Kind: captions - Language: #{tlang || @language_code} + result = String.build do |result| + result << <<-END_VTT + WEBVTT + Kind: captions + Language: #{tlang || @language_code} - END_VTT + END_VTT - result << "\n\n" + result << "\n\n" - cues.each_with_index do |node, i| - start_time = node["t"].to_f.milliseconds + cues.each_with_index do |node, i| + start_time = node["t"].to_f.milliseconds - duration = node["d"]?.try &.to_f.milliseconds + duration = node["d"]?.try &.to_f.milliseconds - duration ||= start_time + duration ||= start_time - if cues.size > i + 1 - end_time = cues[i + 1]["t"].to_f.milliseconds - else - end_time = start_time + duration + if cues.size > i + 1 + end_time = cues[i + 1]["t"].to_f.milliseconds + else + end_time = start_time + duration + end + + # start_time + result << start_time.hours.to_s.rjust(2, '0') + result << ':' << start_time.minutes.to_s.rjust(2, '0') + result << ':' << start_time.seconds.to_s.rjust(2, '0') + result << '.' << start_time.milliseconds.to_s.rjust(3, '0') + + result << " --> " + + # end_time + result << end_time.hours.to_s.rjust(2, '0') + result << ':' << end_time.minutes.to_s.rjust(2, '0') + result << ':' << end_time.seconds.to_s.rjust(2, '0') + result << '.' << end_time.milliseconds.to_s.rjust(3, '0') + + result << "\n" + + node.children.each do |s| + result << s.content + end + result << "\n" + result << "\n" end - - # start_time - result << start_time.hours.to_s.rjust(2, '0') - result << ':' << start_time.minutes.to_s.rjust(2, '0') - result << ':' << start_time.seconds.to_s.rjust(2, '0') - result << '.' << start_time.milliseconds.to_s.rjust(3, '0') - - result << " --> " - - # end_time - result << end_time.hours.to_s.rjust(2, '0') - result << ':' << end_time.minutes.to_s.rjust(2, '0') - result << ':' << end_time.seconds.to_s.rjust(2, '0') - result << '.' << end_time.milliseconds.to_s.rjust(3, '0') - - result << "\n" - - node.children.each do |s| - result << s.content - end - result << "\n" - result << "\n" end + return result end - return result end # List of all caption languages available on Youtube. diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr new file mode 100644 index 00000000..f3360a52 --- /dev/null +++ b/src/invidious/videos/transcript.cr @@ -0,0 +1,103 @@ +module Invidious::Videos + # Namespace for methods primarily relating to Transcripts + module Transcript + record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String + + def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String + kind = auto_generated ? "asr" : "" + + object = { + "1:0:string" => video_id, + + "2:base64" => { + "1:string" => kind, + "2:string" => language_code, + "3:string" => "", + }, + + "3:varint" => 1_i64, + "5:string" => "engagement-panel-searchable-transcript-search-panel", + "6:varint" => 1_i64, + "7:varint" => 1_i64, + "8:varint" => 1_i64, + } + + params = object.try { |i| Protodec::Any.cast_json(i) } + .try { |i| Protodec::Any.from_json(i) } + .try { |i| Base64.urlsafe_encode(i) } + .try { |i| URI.encode_www_form(i) } + + return params + end + + def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String + # Convert into array of TranscriptLine + lines = self.parse(initial_data) + + # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt() + vtt = String.build do |vtt| + vtt << <<-END_VTT + WEBVTT + Kind: captions + Language: #{target_language} + + + END_VTT + + vtt << "\n\n" + + lines.each do |line| + start_time = line.start_ms + end_time = line.end_ms + + # start_time + vtt << start_time.hours.to_s.rjust(2, '0') + vtt << ':' << start_time.minutes.to_s.rjust(2, '0') + vtt << ':' << start_time.seconds.to_s.rjust(2, '0') + vtt << '.' << start_time.milliseconds.to_s.rjust(3, '0') + + vtt << " --> " + + # end_time + vtt << end_time.hours.to_s.rjust(2, '0') + vtt << ':' << end_time.minutes.to_s.rjust(2, '0') + vtt << ':' << end_time.seconds.to_s.rjust(2, '0') + vtt << '.' << end_time.milliseconds.to_s.rjust(3, '0') + + vtt << "\n" + vtt << line.line + + vtt << "\n" + vtt << "\n" + end + end + + return vtt + end + + private def self.parse(initial_data : Hash(String, JSON::Any)) + body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", + "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", + "initialSegments").as_a + + lines = [] of TranscriptLine + body.each do |line| + # Transcript section headers. They are not apart of the captions and as such we can safely skip them. + if line.as_h.has_key?("transcriptSectionHeaderRenderer") + next + end + + line = line["transcriptSegmentRenderer"] + + start_ms = line["startMs"].as_s.to_i.millisecond + end_ms = line["endMs"].as_s.to_i.millisecond + + text = extract_text(line["snippet"]) || "" + + lines << TranscriptLine.new(start_ms, end_ms, text) + end + + return lines + end + end +end diff --git a/src/invidious/views/user/preferences.ecr b/src/invidious/views/user/preferences.ecr index dfda1434..55349c5a 100644 --- a/src/invidious/views/user/preferences.ecr +++ b/src/invidious/views/user/preferences.ecr @@ -89,7 +89,7 @@ <% preferences.captions.each_with_index do |caption, index| %> diff --git a/src/invidious/yt_backend/youtube_api.cr b/src/invidious/yt_backend/youtube_api.cr index aef9ddd9..a5e621f2 100644 --- a/src/invidious/yt_backend/youtube_api.cr +++ b/src/invidious/yt_backend/youtube_api.cr @@ -557,6 +557,30 @@ module YoutubeAPI return self._post_json("/youtubei/v1/search", data, client_config) end + #################################################################### + # get_transcript(params, client_config?) + # + # Requests the youtubei/v1/get_transcript endpoint with the required headers + # and POST data in order to get a JSON reply. + # + # The requested data is a specially encoded protobuf string that denotes the specific language requested. + # + # An optional ClientConfig parameter can be passed, too (see + # `struct ClientConfig` above for more details). + # + + def get_transcript( + params : String, + client_config : ClientConfig | Nil = nil + ) : Hash(String, JSON::Any) + data = { + "context" => self.make_context(client_config), + "params" => params, + } + + return self._post_json("/youtubei/v1/get_transcript", data, client_config) + end + #################################################################### # _post_json(endpoint, data, client_config?) #