Merge branch 'iv-org:master' into main

2025-04-20 15:45:46 -04:00 · 2023-09-22 02:13:57 +00:00 · 2023-09-22 02:13:57 +00:00 · cd3256fcff
commit cd3256fcff
parent 75cc1899cf bb14f79496
14 changed files with 305 additions and 140 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -38,11 +38,10 @@ jobs:
      matrix:
        stable: [true]
        crystal:
-          - 1.4.1
-          - 1.5.1
          - 1.6.2
          - 1.7.3
          - 1.8.2
+          - 1.9.2
        include:
          - crystal: nightly
            stable: false
@ -53,7 +52,7 @@ jobs:
          submodules: true

      - name: Install Crystal
-        uses: crystal-lang/install-crystal@v1.7.0
+        uses: crystal-lang/install-crystal@v1.8.0
        with:
          crystal: ${{ matrix.crystal }}

--- a/.github/workflows/container-release.yml
+++ b/.github/workflows/container-release.yml
@ -25,9 +25,9 @@ jobs:
        uses: actions/checkout@v3

      - name: Install Crystal
-        uses: crystal-lang/install-crystal@v1.6.0
+        uses: crystal-lang/install-crystal@v1.8.0
        with:
-          crystal: 1.5.0
+          crystal: 1.9.2

      - name: Run lint
        run: |
@ -77,4 +77,3 @@ jobs:
          tags: quay.io/invidious/invidious:${{ github.sha }}-arm64,quay.io/invidious/invidious:latest-arm64
          build-args: |
            "release=1"
-        
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -14,7 +14,7 @@ jobs:
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        days-before-stale: 365
-        days-before-pr-stale: 45 # PRs should be active. Anything that hasn't had activity in more than 45 days should be considered abandoned.
+        days-before-pr-stale: 90 
        days-before-close: 30
        exempt-pr-labels: blocked
        stale-issue-message: 'This issue has been automatically marked as stale and will be closed in 30 days because it has not had recent activity and is much likely outdated. If you think this issue is still relevant and applicable, you just have to post a comment and it will be unmarked.'
--- a/config/config.example.yml
+++ b/config/config.example.yml
@ -161,6 +161,19 @@ https_only: false
 #force_resolve:


+##
+## Use InnerTube's transcripts API instead of timedtext for closed captions
+##
+## Useful for larger instances, as InnerTube is **not rate-limited**. See https://github.com/iv-org/invidious/issues/2567
+##
+## The subtitle experience on Invidious may differ slightly when this is enabled.
+##
+## Accepted values: true, false
+## Default: false
+##
+# use_innertube_for_captions: false
+
+
 # -----------------------------
 #  Logging
 # -----------------------------
--- a/src/invidious/config.cr
+++ b/src/invidious/config.cr
@ -127,6 +127,9 @@ class Config
  # Pool size for HTTP requests to youtube.com and ytimg.com (each domain has a separate pool of `pool_size`)
  property pool_size : Int32 = 100

+  # Use Innertube's transcripts API instead of timedtext for closed captions
+  property use_innertube_for_captions : Bool = false
+
  # Saved cookies in "name1=value1; name2=value2..." format
  @[YAML::Field(converter: Preferences::StringToCookies)]
  property cookies : HTTP::Cookies = HTTP::Cookies.new
--- a/src/invidious/frontend/watch_page.cr
+++ b/src/invidious/frontend/watch_page.cr
@ -7,7 +7,7 @@ module Invidious::Frontend::WatchPage
    getter full_videos : Array(Hash(String, JSON::Any))
    getter video_streams : Array(Hash(String, JSON::Any))
    getter audio_streams : Array(Hash(String, JSON::Any))
-    getter captions : Array(Invidious::Videos::Caption)
+    getter captions : Array(Invidious::Videos::Captions::Metadata)

    def initialize(
      @full_videos,
--- a/src/invidious/playlists.cr
+++ b/src/invidious/playlists.cr
@ -89,6 +89,7 @@ struct Playlist
  property views : Int64
  property updated : Time
  property thumbnail : String?
+  property subtitle : String?

  def to_json(offset, json : JSON::Builder, video_id : String? = nil)
    json.object do
@ -100,6 +101,7 @@ struct Playlist
      json.field "author", self.author
      json.field "authorId", self.ucid
      json.field "authorUrl", "/channel/#{self.ucid}"
+      json.field "subtitle", self.subtitle

      json.field "authorThumbnails" do
        json.array do
@ -356,6 +358,8 @@ def fetch_playlist(plid : String)
  updated = Time.utc
  video_count = 0

+  subtitle = extract_text(initial_data.dig?("header", "playlistHeaderRenderer", "subtitle"))
+
  playlist_info["stats"]?.try &.as_a.each do |stat|
    text = stat["runs"]?.try &.as_a.map(&.["text"].as_s).join("") || stat["simpleText"]?.try &.as_s
    next if !text
@ -397,6 +401,7 @@ def fetch_playlist(plid : String)
    views:            views,
    updated:          updated,
    thumbnail:        thumbnail,
+    subtitle:         subtitle,
  })
 end

--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@ -87,70 +87,78 @@ module Invidious::Routes::API::V1::Videos
      caption = caption[0]
    end

-    url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
+    if CONFIG.use_innertube_for_captions
+      params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
+      initial_data = YoutubeAPI.get_transcript(params)

-    # Auto-generated captions often have cues that aren't aligned properly with the video,
-    # as well as some other markup that makes it cumbersome, so we try to fix that here
-    if caption.name.includes? "auto-generated"
-      caption_xml = YT_POOL.client &.get(url).body
+      webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+    else
+      # Timedtext API handling
+      url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target

-      if caption_xml.starts_with?("<?xml")
-        webvtt = caption.timedtext_to_vtt(caption_xml, tlang)
-      else
-        caption_xml = XML.parse(caption_xml)
+      # Auto-generated captions often have cues that aren't aligned properly with the video,
+      # as well as some other markup that makes it cumbersome, so we try to fix that here
+      if caption.name.includes? "auto-generated"
+        caption_xml = YT_POOL.client &.get(url).body

-        webvtt = String.build do |str|
-          str << <<-END_VTT
-          WEBVTT
-          Kind: captions
-          Language: #{tlang || caption.language_code}
+        if caption_xml.starts_with?("<?xml")
+          webvtt = caption.timedtext_to_vtt(caption_xml, tlang)
+        else
+          caption_xml = XML.parse(caption_xml)
+
+          webvtt = String.build do |str|
+            str << <<-END_VTT
+            WEBVTT
+            Kind: captions
+            Language: #{tlang || caption.language_code}


-          END_VTT
+            END_VTT

-          caption_nodes = caption_xml.xpath_nodes("//transcript/text")
-          caption_nodes.each_with_index do |node, i|
-            start_time = node["start"].to_f.seconds
-            duration = node["dur"]?.try &.to_f.seconds
-            duration ||= start_time
+            caption_nodes = caption_xml.xpath_nodes("//transcript/text")
+            caption_nodes.each_with_index do |node, i|
+              start_time = node["start"].to_f.seconds
+              duration = node["dur"]?.try &.to_f.seconds
+              duration ||= start_time

-            if caption_nodes.size > i + 1
-              end_time = caption_nodes[i + 1]["start"].to_f.seconds
-            else
-              end_time = start_time + duration
+              if caption_nodes.size > i + 1
+                end_time = caption_nodes[i + 1]["start"].to_f.seconds
+              else
+                end_time = start_time + duration
+              end
+
+              start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}"
+              end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}"
+
+              text = HTML.unescape(node.content)
+              text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "")
+              text = text.gsub(/<\/font>/, "")
+              if md = text.match(/(?<name>.*) : (?<text>.*)/)
+                text = "<v #{md["name"]}>#{md["text"]}</v>"
+              end
+
+              str << <<-END_CUE
+              #{start_time} --> #{end_time}
+              #{text}
+
+
+              END_CUE
            end
-
-            start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}"
-            end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}"
-
-            text = HTML.unescape(node.content)
-            text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "")
-            text = text.gsub(/<\/font>/, "")
-            if md = text.match(/(?<name>.*) : (?<text>.*)/)
-              text = "<v #{md["name"]}>#{md["text"]}</v>"
-            end
-
-            str << <<-END_CUE
-            #{start_time} --> #{end_time}
-            #{text}
-
-
-            END_CUE
          end
        end
-      end
-    else
-      # Some captions have "align:[start/end]" and "position:[num]%"
-      # attributes. Those are causing issues with VideoJS, which is unable
-      # to properly align the captions on the video, so we remove them.
-      #
-      # See: https://github.com/iv-org/invidious/issues/2391
-      webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
-      if webvtt.starts_with?("<?xml")
-        webvtt = caption.timedtext_to_vtt(webvtt)
      else
+        # Some captions have "align:[start/end]" and "position:[num]%"
+        # attributes. Those are causing issues with VideoJS, which is unable
+        # to properly align the captions on the video, so we remove them.
+        #
+        # See: https://github.com/iv-org/invidious/issues/2391
        webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
-          .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1")
+        if webvtt.starts_with?("<?xml")
+          webvtt = caption.timedtext_to_vtt(webvtt)
+        else
+          webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
+            .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1")
+        end
      end
    end

--- a/src/invidious/videos.cr
+++ b/src/invidious/videos.cr
@ -24,7 +24,7 @@ struct Video
  property updated : Time

  @[DB::Field(ignore: true)]
-  @captions = [] of Invidious::Videos::Caption
+  @captions = [] of Invidious::Videos::Captions::Metadata

  @[DB::Field(ignore: true)]
  property adaptive_fmts : Array(Hash(String, JSON::Any))?
@ -215,9 +215,9 @@ struct Video
    keywords.includes? "YouTube Red"
  end

-  def captions : Array(Invidious::Videos::Caption)
+  def captions : Array(Invidious::Videos::Captions::Metadata)
    if @captions.empty? && @info.has_key?("captions")
-      @captions = Invidious::Videos::Caption.from_yt_json(info["captions"])
+      @captions = Invidious::Videos::Captions::Metadata.from_yt_json(info["captions"])
    end

    return @captions
--- a/src/invidious/videos/caption.cr
+++ b/src/invidious/videos/caption.cr
@ -1,100 +1,106 @@
 require "json"

 module Invidious::Videos
-  struct Caption
-    property name : String
-    property language_code : String
-    property base_url : String
+  module Captions
+    struct Metadata
+      property name : String
+      property language_code : String
+      property base_url : String

-    def initialize(@name, @language_code, @base_url)
-    end
+      property auto_generated : Bool

-    # Parse the JSON structure from Youtube
-    def self.from_yt_json(container : JSON::Any) : Array(Caption)
-      caption_tracks = container
-        .dig?("playerCaptionsTracklistRenderer", "captionTracks")
-        .try &.as_a
-
-      captions_list = [] of Caption
-      return captions_list if caption_tracks.nil?
-
-      caption_tracks.each do |caption|
-        name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"]
-        name = name.to_s.split(" - ")[0]
-
-        language_code = caption["languageCode"].to_s
-        base_url = caption["baseUrl"].to_s
-
-        captions_list << Caption.new(name, language_code, base_url)
+      def initialize(@name, @language_code, @base_url, @auto_generated)
      end

-      return captions_list
-    end
+      # Parse the JSON structure from Youtube
+      def self.from_yt_json(container : JSON::Any) : Array(Captions::Metadata)
+        caption_tracks = container
+          .dig?("playerCaptionsTracklistRenderer", "captionTracks")
+          .try &.as_a

-    def timedtext_to_vtt(timedtext : String, tlang = nil) : String
-      # In the future, we could just directly work with the url. This is more of a POC
-      cues = [] of XML::Node
-      tree = XML.parse(timedtext)
-      tree = tree.children.first
+        captions_list = [] of Captions::Metadata
+        return captions_list if caption_tracks.nil?

-      tree.children.each do |item|
-        if item.name == "body"
-          item.children.each do |cue|
-            if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n")
-              cues << cue
+        caption_tracks.each do |caption|
+          name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"]
+          name = name.to_s.split(" - ")[0]
+
+          language_code = caption["languageCode"].to_s
+          base_url = caption["baseUrl"].to_s
+
+          auto_generated = (caption["kind"]? == "asr")
+
+          captions_list << Captions::Metadata.new(name, language_code, base_url, auto_generated)
+        end
+
+        return captions_list
+      end
+
+      def timedtext_to_vtt(timedtext : String, tlang = nil) : String
+        # In the future, we could just directly work with the url. This is more of a POC
+        cues = [] of XML::Node
+        tree = XML.parse(timedtext)
+        tree = tree.children.first
+
+        tree.children.each do |item|
+          if item.name == "body"
+            item.children.each do |cue|
+              if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n")
+                cues << cue
+              end
            end
+            break
          end
-          break
        end
-      end
-      result = String.build do |result|
-        result << <<-END_VTT
-        WEBVTT
-        Kind: captions
-        Language: #{tlang || @language_code}
+        result = String.build do |result|
+          result << <<-END_VTT
+          WEBVTT
+          Kind: captions
+          Language: #{tlang || @language_code}


-        END_VTT
+          END_VTT

-        result << "\n\n"
+          result << "\n\n"

-        cues.each_with_index do |node, i|
-          start_time = node["t"].to_f.milliseconds
+          cues.each_with_index do |node, i|
+            start_time = node["t"].to_f.milliseconds

-          duration = node["d"]?.try &.to_f.milliseconds
+            duration = node["d"]?.try &.to_f.milliseconds

-          duration ||= start_time
+            duration ||= start_time

-          if cues.size > i + 1
-            end_time = cues[i + 1]["t"].to_f.milliseconds
-          else
-            end_time = start_time + duration
+            if cues.size > i + 1
+              end_time = cues[i + 1]["t"].to_f.milliseconds
+            else
+              end_time = start_time + duration
+            end
+
+            # start_time
+            result << start_time.hours.to_s.rjust(2, '0')
+            result << ':' << start_time.minutes.to_s.rjust(2, '0')
+            result << ':' << start_time.seconds.to_s.rjust(2, '0')
+            result << '.' << start_time.milliseconds.to_s.rjust(3, '0')
+
+            result << " --> "
+
+            # end_time
+            result << end_time.hours.to_s.rjust(2, '0')
+            result << ':' << end_time.minutes.to_s.rjust(2, '0')
+            result << ':' << end_time.seconds.to_s.rjust(2, '0')
+            result << '.' << end_time.milliseconds.to_s.rjust(3, '0')
+
+            result << "\n"
+
+            node.children.each do |s|
+              result << s.content
+            end
+            result << "\n"
+            result << "\n"
          end
-
-          # start_time
-          result << start_time.hours.to_s.rjust(2, '0')
-          result << ':' << start_time.minutes.to_s.rjust(2, '0')
-          result << ':' << start_time.seconds.to_s.rjust(2, '0')
-          result << '.' << start_time.milliseconds.to_s.rjust(3, '0')
-
-          result << " --> "
-
-          # end_time
-          result << end_time.hours.to_s.rjust(2, '0')
-          result << ':' << end_time.minutes.to_s.rjust(2, '0')
-          result << ':' << end_time.seconds.to_s.rjust(2, '0')
-          result << '.' << end_time.milliseconds.to_s.rjust(3, '0')
-
-          result << "\n"
-
-          node.children.each do |s|
-            result << s.content
-          end
-          result << "\n"
-          result << "\n"
        end
+        return result
      end
-      return result
    end

    # List of all caption languages available on Youtube.
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@ -0,0 +1,103 @@
+module Invidious::Videos
+  # Namespace for methods relating to transcripts: building the request
+  # parameter, parsing the reply and converting it to WebVTT.
+  module Transcript
+    # A single transcript cue: its start and end times (as `Time::Span`)
+    # and the text of the cue.
+    record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+
+    # Builds the encoded `params` string identifying one caption track of a video.
+    # The captions route passes the result to `YoutubeAPI.get_transcript`.
+    #
+    # - *video_id*: ID of the video whose transcript is requested.
+    # - *language_code*: language code of the caption track.
+    # - *auto_generated*: when true the track kind is set to "asr",
+    #   otherwise the kind is left empty.
+    #
+    # Returns the object below after it has gone through Protodec, URL-safe
+    # Base64 encoding and form encoding, in that order.
+    def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
+      kind = auto_generated ? "asr" : ""
+
+      # NOTE(review): the keys presumably follow Protodec's "field:type"
+      # notation, with "2:base64" being a nested, Base64-encoded message.
+      # Confirm against the Protodec documentation.
+      object = {
+        "1:0:string" => video_id,
+
+        "2:base64" => {
+          "1:string" => kind,
+          "2:string" => language_code,
+          "3:string" => "",
+        },
+
+        "3:varint" => 1_i64,
+        "5:string" => "engagement-panel-searchable-transcript-search-panel",
+        "6:varint" => 1_i64,
+        "7:varint" => 1_i64,
+        "8:varint" => 1_i64,
+      }
+
+      # Encoding pipeline: Protodec cast -> Protodec serialization ->
+      # URL-safe Base64 -> form encoding.
+      params = object.try { |i| Protodec::Any.cast_json(i) }
+        .try { |i| Protodec::Any.from_json(i) }
+        .try { |i| Base64.urlsafe_encode(i) }
+        .try { |i| URI.encode_www_form(i) }
+
+      return params
+    end
+
+    # Converts a get_transcript reply into a WebVTT document.
+    #
+    # - *initial_data*: the parsed JSON reply (see `parse` for the path that is read).
+    # - *target_language*: value written to the `Language:` header line.
+    #
+    # Returns the WebVTT text: the header followed by one cue per transcript line.
+    def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
+      # Convert into array of TranscriptLine
+      lines = self.parse(initial_data)
+
+      # Output layout taken from Invidious::Videos::Captions::Metadata#timedtext_to_vtt
+      vtt = String.build do |vtt|
+        vtt << <<-END_VTT
+        WEBVTT
+        Kind: captions
+        Language: #{target_language}
+
+
+        END_VTT
+
+        vtt << "\n\n"
+
+        lines.each do |line|
+          start_time = line.start_ms
+          end_time = line.end_ms
+
+          # Cue timing is written as "HH:MM:SS.mmm --> HH:MM:SS.mmm", each part zero-padded.
+          # NOTE(review): `hours` looks like the hour component of the span rather
+          # than the total number of hours; presumably fine for cues below 24h. Confirm.
+
+          # start_time
+          vtt << start_time.hours.to_s.rjust(2, '0')
+          vtt << ':' << start_time.minutes.to_s.rjust(2, '0')
+          vtt << ':' << start_time.seconds.to_s.rjust(2, '0')
+          vtt << '.' << start_time.milliseconds.to_s.rjust(3, '0')
+
+          vtt << " --> "
+
+          # end_time
+          vtt << end_time.hours.to_s.rjust(2, '0')
+          vtt << ':' << end_time.minutes.to_s.rjust(2, '0')
+          vtt << ':' << end_time.seconds.to_s.rjust(2, '0')
+          vtt << '.' << end_time.milliseconds.to_s.rjust(3, '0')
+
+          # Cue text, followed by the blank line that ends the cue.
+          vtt << "\n"
+          vtt << line.line
+
+          vtt << "\n"
+          vtt << "\n"
+        end
+      end
+
+      return vtt
+    end
+
+    # Extracts the transcript segments from a get_transcript reply.
+    #
+    # Returns an array of `TranscriptLine`, in the order the segments appear
+    # in the reply. Section headers are skipped.
+    private def self.parse(initial_data : Hash(String, JSON::Any))
+      # NOTE(review): `dig` (not `dig?`) is used here, so a reply that lacks this
+      # path presumably raises instead of yielding no lines. Confirm that callers expect this.
+      body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
+        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
+        "initialSegments").as_a
+
+      lines = [] of TranscriptLine
+      body.each do |line|
+        # Transcript section headers. They are not a part of the captions and as such we can safely skip them.
+        if line.as_h.has_key?("transcriptSectionHeaderRenderer")
+          next
+        end
+
+        line = line["transcriptSegmentRenderer"]
+
+        # "startMs" and "endMs" are read as strings, so they are converted to
+        # integers before being turned into millisecond spans.
+        start_ms = line["startMs"].as_s.to_i.millisecond
+        end_ms = line["endMs"].as_s.to_i.millisecond
+
+        # Fall back to an empty cue text when no text can be extracted from the snippet.
+        text = extract_text(line["snippet"]) || ""
+
+        lines << TranscriptLine.new(start_ms, end_ms, text)
+      end
+
+      return lines
+    end
+  end
+end
--- a/src/invidious/views/playlist.ecr
+++ b/src/invidious/views/playlist.ecr
@ -70,7 +70,12 @@
            </b>
        <% else %>
            <b>
-                <a href="/channel/<%= playlist.ucid %>"><%= author %></a> |
+                <% if !author.empty? %>
+                    <a href="/channel/<%= playlist.ucid %>"><%= author %></a> |
+                <% elsif !playlist.subtitle.nil? %>
+                    <% subtitle = playlist.subtitle || "" %>
+                    <span><%= HTML.escape(subtitle[0..subtitle.rindex(" • ") || subtitle.size]) %></span> |
+                <% end %>
                <%= translate_count(locale, "generic_videos_count", playlist.video_count) %> |
                <%= translate(locale, "Updated `x` ago", recode_date(playlist.updated, locale)) %>
            </b>
--- a/src/invidious/views/user/preferences.ecr
+++ b/src/invidious/views/user/preferences.ecr
@ -89,7 +89,7 @@
                <label for="captions[0]"><%= translate(locale, "preferences_captions_label") %></label>
                <% preferences.captions.each_with_index do |caption, index| %>
                    <select class="pure-u-1-6" name="captions[<%= index %>]" id="captions[<%= index %>]">
-                        <% Invidious::Videos::Caption::LANGUAGES.each do |option| %>
+                        <% Invidious::Videos::Captions::LANGUAGES.each do |option| %>
                            <option value="<%= option %>" <% if preferences.captions[index] == option %> selected <% end %>><%= translate(locale, option.blank? ? "none" : option) %></option>
                        <% end %>
                    </select>
--- a/src/invidious/yt_backend/youtube_api.cr
+++ b/src/invidious/yt_backend/youtube_api.cr
@ -557,6 +557,30 @@ module YoutubeAPI
    return self._post_json("/youtubei/v1/search", data, client_config)
  end

+  ####################################################################
+  # get_transcript(params, client_config?)
+  #
+  # Requests the youtubei/v1/get_transcript endpoint with the required headers
+  # and POST data in order to get a JSON reply.
+  #
+  # `params` is a specially encoded protobuf string that denotes the specific
+  # language requested. The captions route builds it with
+  # `Invidious::Videos::Transcript.generate_param`.
+  #
+  # An optional ClientConfig parameter can be passed, too (see
+  # `struct ClientConfig` above for more details).
+  #
+  # Returns whatever `_post_json` returns for this endpoint, typed as
+  # `Hash(String, JSON::Any)`.
+  #
+
+  def get_transcript(
+    params : String,
+    client_config : ClientConfig | Nil = nil
+  ) : Hash(String, JSON::Any)
+    # POST body: the client context plus the encoded transcript params.
+    data = {
+      "context" => self.make_context(client_config),
+      "params"  => params,
+    }
+
+    return self._post_json("/youtubei/v1/get_transcript", data, client_config)
+  end
+
  ####################################################################
  # _post_json(endpoint, data, client_config?)
  #