From 49ddf8b6bdb98c7a9678cbec800c45350a54a786 Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Thu, 23 Mar 2023 05:10:21 +0000 Subject: [PATCH 1/5] Added attributed description support --- src/invidious/videos/description.cr | 81 +++++++++++++++++++++++++++++ src/invidious/videos/parser.cr | 6 ++- 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 src/invidious/videos/description.cr diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr new file mode 100644 index 00000000..d4c60a84 --- /dev/null +++ b/src/invidious/videos/description.cr @@ -0,0 +1,81 @@ +require "json" +require "uri" + +def parse_command(command : JSON::Any?, string : String) : String? + on_tap = command.dig?("onTap", "innertubeCommand") + + # 3rd party URL, extract original URL from YouTube tracking URL + if url_endpoint = on_tap.try &.["urlEndpoint"]? + youtube_url = URI.parse url_endpoint["url"].as_s + + original_url = youtube_url.query_params["q"]? + if original_url.nil? + return "" + else + return "#{original_url}" + end + # 1st party watch URL + elsif watch_endpoint = on_tap.try &.["watchEndpoint"]? + video_id = watch_endpoint["videoId"].as_s + time = watch_endpoint["startTimeSeconds"].as_i + + url = "/watch?v=#{video_id}&t=#{time}s" + + # if text is a timestamp, use the string instead + if /(?:\d{2}:){1,2}\d{2}/ =~ string + return "#{string}" + else + return "#{url}" + end + # hashtag/other browse URLs + elsif browse_endpoint = on_tap.try &.dig?("commandMetadata", "webCommandMetadata") + url = browse_endpoint["url"].try &.as_s + + # remove unnecessary character in a channel name + if browse_endpoint["webPageType"]?.try &.as_s == "WEB_PAGE_TYPE_CHANNEL" + name = string.match(/@[\w\d]+/) + if name.try &.[0]? + return "#{name.try &.[0]}" + end + end + + return "#{string}" + end + + return "(unknown YouTube desc command)" +end + +def parse_description(desc : JSON::Any?) : String? + if desc.nil? + return "" + end + + content = desc["content"].as_s + if content.empty? + return "" + end + + if commands = desc["commandRuns"]?.try &.as_a + description = String.build do |str| + index = 0 + commands.each do |command| + start_index = command["startIndex"].as_i + length = command["length"].as_i + + if start_index > 0 && start_index - index > 0 + str << content[index..(start_index - 1)] + index += start_index - index + end + + str << parse_command(command, content[start_index, length]) + index += length + end + if index < content.size + str << content[index..content.size] + end + end + return description + end + + return content +end diff --git a/src/invidious/videos/parser.cr b/src/invidious/videos/parser.cr index 608ae99d..3a342a95 100644 --- a/src/invidious/videos/parser.cr +++ b/src/invidious/videos/parser.cr @@ -284,8 +284,10 @@ def parse_video_info(video_id : String, player_response : Hash(String, JSON::Any description = microformat.dig?("description", "simpleText").try &.as_s || "" short_description = player_response.dig?("videoDetails", "shortDescription") - description_html = video_secondary_renderer.try &.dig?("description", "runs") - .try &.as_a.try { |t| content_to_comment_html(t, video_id) } + # description_html = video_secondary_renderer.try &.dig?("description", "runs") + # .try &.as_a.try { |t| content_to_comment_html(t, video_id) } + + description_html = parse_description(video_secondary_renderer.try &.dig?("attributedDescription")) # Video metadata From 7755ed4ac8812377da04cff951324ab31d2e621c Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Thu, 23 Mar 2023 20:12:54 +0000 Subject: [PATCH 2/5] Fix regexs --- src/invidious/videos/description.cr | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index d4c60a84..3d25197b 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -21,8 +21,9 @@ def parse_command(command : JSON::Any?, string : String) : String? url = "/watch?v=#{video_id}&t=#{time}s" - # if text is a timestamp, use the string instead - if /(?:\d{2}:){1,2}\d{2}/ =~ string + # if string is a timestamp, use the string instead + # this is a lazy regex for validating timestamps + if /(?:\d{1,2}:){1,2}\d{2}/ =~ string return "#{string}" else return "#{url}" @@ -33,7 +34,7 @@ def parse_command(command : JSON::Any?, string : String) : String? # remove unnecessary character in a channel name if browse_endpoint["webPageType"]?.try &.as_s == "WEB_PAGE_TYPE_CHANNEL" - name = string.match(/@[\w\d]+/) + name = string.match(/@[\w\d.-]+/) if name.try &.[0]? return "#{name.try &.[0]}" end From 73d2ed6f77308dd300e68f3ea059c6aa2c10b1ce Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Wed, 29 Mar 2023 23:33:23 +0000 Subject: [PATCH 3/5] Optimize some redundant stuff --- src/invidious/videos/description.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index 3d25197b..b1d851d3 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -64,8 +64,8 @@ def parse_description(desc : JSON::Any?) : String? length = command["length"].as_i if start_index > 0 && start_index - index > 0 - str << content[index..(start_index - 1)] - index += start_index - index + str << content[index...start_index] + index = start_index end str << parse_command(command, content[start_index, length]) From b3c0afef02ee13c7f291fd26a5d64b4aee059906 Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Wed, 5 Apr 2023 23:43:41 +0200 Subject: [PATCH 4/5] Videos: fix description text offset when emojis are present --- src/invidious/videos/description.cr | 71 +++++++++++++++++++---------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index b1d851d3..2017955d 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -46,37 +46,60 @@ def parse_command(command : JSON::Any?, string : String) : String? return "(unknown YouTube desc command)" end -def parse_description(desc : JSON::Any?) : String? - if desc.nil? - return "" +private def copy_string(str : String::Builder, iter : Iterator, count : Int) : Int + copied = 0 + while copied < count + cp = iter.next + break if cp.is_a?(Iterator::Stop) + + str << cp.chr + + # A codepoint from the SMP counts twice + copied += 1 if cp > 0xFFFF + copied += 1 end + return copied +end + +def parse_description(desc : JSON::Any?) : String? + return "" if desc.nil? + content = desc["content"].as_s - if content.empty? - return "" - end + return "" if content.empty? - if commands = desc["commandRuns"]?.try &.as_a - description = String.build do |str| - index = 0 - commands.each do |command| - start_index = command["startIndex"].as_i - length = command["length"].as_i + commands = desc["commandRuns"]?.try &.as_a + return content if commands.nil? - if start_index > 0 && start_index - index > 0 - str << content[index...start_index] - index = start_index - end + # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints + # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are + # automatically decoded by the JSON parser. It means that we need to count + # copied byte in a special manner, preventing the use of regular string copy. + iter = content.each_codepoint - str << parse_command(command, content[start_index, length]) - index += length - end - if index < content.size - str << content[index..content.size] + index = 0 + + return String.build do |str| + commands.each do |command| + cmd_start = command["startIndex"].as_i + cmd_length = command["length"].as_i + + # Copy the text chunk between this command and the previous if needed. + length = cmd_start - index + index += copy_string(str, iter, length) + + # We need to copy the command's text using the iterator + # and the special function defined above. + cmd_content = String.build(cmd_length) do |str2| + copy_string(str2, iter, cmd_length) end + + str << parse_command(command, cmd_content) + index += cmd_length end - return description - end - return content + # Copy the end of the string (past the last command). + remaining_length = content.size - index + copy_string(str, iter, remaining_length) if remaining_length > 0 + end end From 9a765418d1410ceda3a27ebcd2febd9fe4319edc Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Mon, 10 Apr 2023 16:59:13 +0200 Subject: [PATCH 5/5] Update specs --- mocks | 2 +- .../videos/regular_videos_extract_spec.cr | 34 +++++++++---------- .../videos/scheduled_live_extract_spec.cr | 7 ++-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/mocks b/mocks index cb16e034..11ec372f 160000 --- a/mocks +++ b/mocks @@ -1 +1 @@ -Subproject commit cb16e0343c8f94182615610bfe3c503db89717a7 +Subproject commit 11ec372f72747c09d48ffef04843f72be67d5b54 diff --git a/spec/invidious/videos/regular_videos_extract_spec.cr b/spec/invidious/videos/regular_videos_extract_spec.cr index cbe80010..a6a3e60a 100644 --- a/spec/invidious/videos/regular_videos_extract_spec.cr +++ b/spec/invidious/videos/regular_videos_extract_spec.cr @@ -17,8 +17,8 @@ Spectator.describe "parse_video_info" do # Basic video infos expect(info["title"].as_s).to eq("I Gave My 100,000,000th Subscriber An Island") - expect(info["views"].as_i).to eq(115_784_415) - expect(info["likes"].as_i).to eq(4_932_790) + expect(info["views"].as_i).to eq(126_573_823) + expect(info["likes"].as_i).to eq(5_157_654) # For some reason the video length from VideoDetails and the # one from microformat differs by 1s... @@ -48,12 +48,12 @@ Spectator.describe "parse_video_info" do expect(info["relatedVideos"].as_a.size).to eq(20) - expect(info["relatedVideos"][0]["id"]).to eq("iogcY_4xGjo") - expect(info["relatedVideos"][0]["title"]).to eq("$1 vs $1,000,000 Hotel Room!") + expect(info["relatedVideos"][0]["id"]).to eq("Hwybp38GnZw") + expect(info["relatedVideos"][0]["title"]).to eq("I Built Willy Wonka's Chocolate Factory!") expect(info["relatedVideos"][0]["author"]).to eq("MrBeast") expect(info["relatedVideos"][0]["ucid"]).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA") - expect(info["relatedVideos"][0]["view_count"]).to eq("172972109") - expect(info["relatedVideos"][0]["short_view_count"]).to eq("172M") + expect(info["relatedVideos"][0]["view_count"]).to eq("179877630") + expect(info["relatedVideos"][0]["short_view_count"]).to eq("179M") expect(info["relatedVideos"][0]["author_verified"]).to eq("true") # Description @@ -76,11 +76,11 @@ Spectator.describe "parse_video_info" do expect(info["ucid"].as_s).to eq("UCX6OQ3DkcsbYNE6H8uQQuVA") expect(info["authorThumbnail"].as_s).to eq( - "https://yt3.ggpht.com/ytc/AL5GRJUfhQdJS6n-YJtsAf-ouS2myDavDOq_zXBfebal3Q=s48-c-k-c0x00ffffff-no-rj" + "https://yt3.ggpht.com/ytc/AL5GRJVuqw82ERvHzsmBxL7avr1dpBtsVIXcEzBPZaloFg=s48-c-k-c0x00ffffff-no-rj" ) expect(info["authorVerified"].as_bool).to be_true - expect(info["subCountText"].as_s).to eq("135M") + expect(info["subCountText"].as_s).to eq("143M") end it "parses a regular video with no descrition/comments" do @@ -99,7 +99,7 @@ Spectator.describe "parse_video_info" do # Basic video infos expect(info["title"].as_s).to eq("Chris Rea - Auberge") - expect(info["views"].as_i).to eq(10_698_554) + expect(info["views"].as_i).to eq(10_943_126) expect(info["likes"].as_i).to eq(0) expect(info["lengthSeconds"].as_i).to eq(283_i64) expect(info["published"].as_s).to eq("2012-05-21T00:00:00Z") @@ -132,21 +132,21 @@ Spectator.describe "parse_video_info" do # Related videos - expect(info["relatedVideos"].as_a.size).to eq(18) + expect(info["relatedVideos"].as_a.size).to eq(19) - expect(info["relatedVideos"][0]["id"]).to eq("rfyZrJUmzxU") - expect(info["relatedVideos"][0]["title"]).to eq("cheb mami - bekatni") - expect(info["relatedVideos"][0]["author"]).to eq("pelitovic") - expect(info["relatedVideos"][0]["ucid"]).to eq("UCsp6vFyJeGoLxgn-AsHp1tw") - expect(info["relatedVideos"][0]["view_count"]).to eq("13863619") - expect(info["relatedVideos"][0]["short_view_count"]).to eq("13M") + expect(info["relatedVideos"][0]["id"]).to eq("Ww3KeZ2_Yv4") + expect(info["relatedVideos"][0]["title"]).to eq("Chris Rea") + expect(info["relatedVideos"][0]["author"]).to eq("PanMusic") + expect(info["relatedVideos"][0]["ucid"]).to eq("UCsKAPSuh1iNbLWUga_igPyA") + expect(info["relatedVideos"][0]["view_count"]).to eq("31581") + expect(info["relatedVideos"][0]["short_view_count"]).to eq("31K") expect(info["relatedVideos"][0]["author_verified"]).to eq("false") # Description expect(info["description"].as_s).to eq(" ") expect(info["shortDescription"].as_s).to be_empty - expect(info["descriptionHtml"].as_s).to eq("

") + expect(info["descriptionHtml"].as_s).to eq("") # Video metadata diff --git a/spec/invidious/videos/scheduled_live_extract_spec.cr b/spec/invidious/videos/scheduled_live_extract_spec.cr index 9dd22b97..25e08c51 100644 --- a/spec/invidious/videos/scheduled_live_extract_spec.cr +++ b/spec/invidious/videos/scheduled_live_extract_spec.cr @@ -86,9 +86,10 @@ Spectator.describe "parse_video_info" do expect(info["description"].as_s).to start_with(description_start_text) expect(info["shortDescription"].as_s).to start_with(description_start_text) - expect(info["descriptionHtml"].as_s).to start_with( - "PBD Podcast Episode 241. The home team is ready and at it again with the latest news, interesting topics and trending conversations on topics that matter. Try our sponsor Aura for 14 days free - aura.com/pbd" - ) + # TODO: Update mocks right before the start of PDB podcast, either on friday or saturday (time unknown) + # expect(info["descriptionHtml"].as_s).to start_with( + # "PBD Podcast Episode 241. The home team is ready and at it again with the latest news, interesting topics and trending conversations on topics that matter. Try our sponsor Aura for 14 days free - aura.com/pbd" + # ) # Video metadata