diff --git a/spec/helpers_spec.cr b/spec/helpers_spec.cr
index 26922bb2..fe16e716 100644
--- a/spec/helpers_spec.cr
+++ b/spec/helpers_spec.cr
@@ -41,7 +41,7 @@ describe "Helper" do
 
   describe "#extract_channel_playlists_cursor" do
     it "correctly extracts a playlists cursor from the given URL" do
-      extract_channel_playlists_cursor("/browse_ajax?continuation=4qmFsgLRARIYVUNDajk1NklGNjJGYlQ3R291c3phajl3GrQBRWdsd2JHRjViR2x6ZEhNWUF5QUJNQUk0QVdBQmFnQjZabEZWYkZCaE1XczFVbFpHZDJGV09XNWxWelI0V0RGR2VWSnVWbUZOV0Vwc1ZHcG5lRmd3TVU1aVZXdDRWMWN4YzFGdFNuTmtlbWh4VGpCd1NWTllVa1pTYTJNeFlVUmtlRmt3Y0ZWVWJWRXdWbnBzTkU1V1JqRmhNVGxFVm14dmQwMXFhRzVXZDdnQkFBJTNEJTNE&gl=US&hl=en", false).should eq("AIOkY9EQpi_gyn1_QrFuZ1reN81_MMmI1YmlBblw8j7JHItEFG5h7qcJTNd4W9x5Quk_CVZ028gW")
+      extract_channel_playlists_cursor("4qmFsgLRARIYVUNDajk1NklGNjJGYlQ3R291c3phajl3GrQBRWdsd2JHRjViR2x6ZEhNWUF5QUJNQUk0QVdBQmFnQjZabEZWYkZCaE1XczFVbFpHZDJGV09XNWxWelI0V0RGR2VWSnVWbUZOV0Vwc1ZHcG5lRmd3TVU1aVZXdDRWMWN4YzFGdFNuTmtlbWh4VGpCd1NWTllVa1pTYTJNeFlVUmtlRmt3Y0ZWVWJWRXdWbnBzTkU1V1JqRmhNVGxFVm14dmQwMXFhRzVXZDdnQkFBJTNEJTNE", false).should eq("AIOkY9EQpi_gyn1_QrFuZ1reN81_MMmI1YmlBblw8j7JHItEFG5h7qcJTNd4W9x5Quk_CVZ028gW")
     end
   end
 
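Note on the spec change: extract_channel_playlists_cursor (reworked in src/invidious/channels.cr below) now receives the raw continuation token itself rather than a /browse_ajax URL to dig it out of. A minimal sketch of the decode pipeline the updated expectation exercises, assuming the protodec shard Invidious already requires:

    require "uri"
    require "base64"
    require "protodec/utils"

    # URL-encoded, Base64-wrapped protobuf cursor, as passed in the spec above.
    token = "4qmFsgLRARIYVUNDajk1NklGNjJGYlQ3R291c3phajl3GrQBRWdsd2JHRjViR2x6ZEhNWUF5QUJNQUk0QVdBQmFnQjZabEZWYkZCaE1XczFVbFpHZDJGV09XNWxWelI0V0RGR2VWSnVWbUZOV0Vwc1ZHcG5lRmd3TVU1aVZXdDRWMWN4YzFGdFNuTmtlbWh4VGpCd1NWTllVa1pTYTJNeFlVUmtlRmt3Y0ZWVWJWRXdWbnBzTkU1V1JqRmhNVGxFVm14dmQwMXFhRzVXZDdnQkFBJTNEJTNE"

    cursor = URI.decode_www_form(token)   # undoes the %3D%3D padding
      .try { |i| Base64.decode(i) }       # unwraps the outer Base64 layer
      .try { |i| IO::Memory.new(i) }
      .try { |i| Protodec::Any.parse(i) } # decodes the protobuf envelope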
diff --git a/src/invidious/channels.cr b/src/invidious/channels.cr
index cbfa521d..e7bcf00e 100644
--- a/src/invidious/channels.cr
+++ b/src/invidious/channels.cr
@@ -216,30 +216,18 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
   url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated)
   response = YT_POOL.client &.get(url)
 
+  videos = [] of SearchVideo
   begin
-    json = JSON.parse(response.body)
+    initial_data = JSON.parse(response.body).as_a.find &.["response"]?
+    raise "Could not extract JSON" if !initial_data
+    videos = extract_videos(initial_data.as_h, author, ucid)
   rescue ex
     if response.body.includes?("To continue with your YouTube experience, please fill out the form below.") ||
        response.body.includes?("https://www.google.com/sorry/index")
       raise "Could not extract channel info. Instance is likely blocked."
     end
-
-    raise "Could not extract JSON"
   end
 
-  if json["content_html"]? && !json["content_html"].as_s.empty?
-    document = XML.parse_html(json["content_html"].as_s)
-    nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
-
-    if auto_generated
-      videos = extract_videos_html(nodeset)
-    else
-      videos = extract_videos_html(nodeset, ucid, author)
-    end
-  end
-
-  videos ||= [] of ChannelVideo
-
   rss.xpath_nodes("//feed/entry").each do |entry|
     video_id = entry.xpath_node("videoid").not_nil!.content
     title = entry.xpath_node("title").not_nil!.content
@@ -305,24 +293,11 @@ def fetch_channel(ucid, db, pull_all_videos = true, locale = nil)
     loop do
       url = produce_channel_videos_url(ucid, page, auto_generated: auto_generated)
       response = YT_POOL.client &.get(url)
-      json = JSON.parse(response.body)
+      initial_data = JSON.parse(response.body).as_a.find &.["response"]?
+      raise "Could not extract JSON" if !initial_data
+      videos = extract_videos(initial_data.as_h, author, ucid)
 
-      if json["content_html"]? && !json["content_html"].as_s.empty?
-        document = XML.parse_html(json["content_html"].as_s)
-        nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
-      else
-        break
-      end
-
-      nodeset = nodeset.not_nil!
-
-      if auto_generated
-        videos = extract_videos_html(nodeset)
-      else
-        videos = extract_videos_html(nodeset, ucid, author)
-      end
-
-      count = nodeset.size
+      count = videos.size
       videos = videos.map { |video| ChannelVideo.new(
         id: video.id,
         title: video.title,
@@ -387,23 +362,11 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
     url = produce_channel_playlists_url(ucid, continuation, sort_by, auto_generated)
     response = YT_POOL.client &.get(url)
-    json = JSON.parse(response.body)
 
-    if json["load_more_widget_html"].as_s.empty?
-      continuation = nil
-    else
-      continuation = XML.parse_html(json["load_more_widget_html"].as_s)
-      continuation = continuation.xpath_node(%q(//button[@data-uix-load-more-href]))
-
-      if continuation
-        continuation = extract_channel_playlists_cursor(continuation["data-uix-load-more-href"], auto_generated)
-      end
-    end
-
-    html = XML.parse_html(json["content_html"].as_s)
-    nodeset = html.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
+    continuation = response.body.match(/"continuation":"(?<continuation>[^"]+)"/).try &.["continuation"]?
+    initial_data = JSON.parse(response.body).as_a.find(&.["response"]?).try &.as_h
   else
-    url = "/channel/#{ucid}/playlists?disable_polymer=1&flow=list&view=1"
+    url = "/channel/#{ucid}/playlists?flow=list&view=1"
 
     case sort_by
     when "last", "last_added"
@@ -416,21 +379,13 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
     end
 
     response = YT_POOL.client &.get(url)
-    html = XML.parse_html(response.body)
-
-    continuation = html.xpath_node(%q(//button[@data-uix-load-more-href]))
-    if continuation
-      continuation = extract_channel_playlists_cursor(continuation["data-uix-load-more-href"], auto_generated)
-    end
-
-    nodeset = html.xpath_nodes(%q(//ul[@id="browse-items-primary"]/li[contains(@class, "feed-item-container")]))
+    continuation = response.body.match(/"continuation":"(?<continuation>[^"]+)"/).try &.["continuation"]?
+    initial_data = extract_initial_data(response.body)
   end
 
-  if auto_generated
-    items = extract_shelf_items(nodeset, ucid, author)
-  else
-    items = extract_items_html(nodeset, ucid, author)
-  end
+  return [] of SearchItem, nil if !initial_data
+  items = extract_items(initial_data)
+  continuation = extract_channel_playlists_cursor(continuation, auto_generated) if continuation
 
   return items, continuation
 end
@@ -530,9 +485,8 @@ def produce_channel_playlists_url(ucid, cursor, sort = "newest", auto_generated
   return "/browse_ajax?continuation=#{continuation}&gl=US&hl=en"
 end
 
-def extract_channel_playlists_cursor(url, auto_generated)
-  cursor = URI.parse(url).query_params
-    .try { |i| URI.decode_www_form(i["continuation"]) }
+def extract_channel_playlists_cursor(cursor, auto_generated)
+  cursor = URI.decode_www_form(cursor)
     .try { |i| Base64.decode(i) }
     .try { |i| IO::Memory.new(i) }
     .try { |i| Protodec::Any.parse(i) }
@@ -949,25 +903,19 @@ def get_60_videos(ucid, author, page, auto_generated, sort_by = "newest")
     response = YT_POOL.client &.get(url)
     initial_data = JSON.parse(response.body).as_a.find &.["response"]?
     break if !initial_data
-    videos.concat extract_videos(initial_data.as_h)
+    videos.concat extract_videos(initial_data.as_h, author, ucid)
   end
 
   return videos.size, videos
 end
 
 def get_latest_videos(ucid)
-  videos = [] of SearchVideo
-
   url = produce_channel_videos_url(ucid, 0)
   response = YT_POOL.client &.get(url)
-  json = JSON.parse(response.body)
+  initial_data = JSON.parse(response.body).as_a.find &.["response"]?
+  return [] of SearchVideo if !initial_data
+  author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s
+  items = extract_videos(initial_data.as_h, author, ucid)
 
-  if json["content_html"]? && !json["content_html"].as_s.empty?
-    document = XML.parse_html(json["content_html"].as_s)
-    nodeset = document.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
-
-    videos = extract_videos_html(nodeset, ucid)
-  end
-
-  return videos
+  return items
 end
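The pattern repeated throughout this file — JSON.parse(response.body).as_a.find &.["response"]? — relies on the polymer /browse_ajax body being a JSON array in which one element carries its payload under "response". That shape is inferred from the parsing code in this diff, not from any documented contract; a toy illustration:

    require "json"

    body = %([{"page": "browse"}, {"response": {"contents": {}}}])

    initial_data = JSON.parse(body).as_a.find &.["response"]?
    puts initial_data.try &.["response"]["contents"] # => {}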
diff --git a/src/invidious/helpers/helpers.cr b/src/invidious/helpers/helpers.cr
index 7a251052..aaec19c5 100644
--- a/src/invidious/helpers/helpers.cr
+++ b/src/invidious/helpers/helpers.cr
@@ -313,32 +313,30 @@ def html_to_content(description_html : String)
   return description
 end
 
-def extract_videos(initial_data : Hash(String, JSON::Any))
-  extract_items(initial_data).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
+def extract_videos(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil)
+  extract_items(initial_data, author_fallback, author_id_fallback).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
 end
 
-def extract_items(initial_data : Hash(String, JSON::Any))
+def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : String? = nil, author_id_fallback : String? = nil)
   items = [] of SearchItem
 
-  initial_data.try { |t|
-    t["contents"]? || t["response"]?
-  }.try { |t|
-    t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a[0]?.try &.["tabRenderer"]["content"] ||
+  initial_data.try { |t| t["contents"]? || t["response"]? }
+    .try { |t| t["twoColumnBrowseResultsRenderer"]?.try &.["tabs"].as_a.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?.try &.["tabRenderer"]["content"] ||
       t["twoColumnSearchResultsRenderer"]?.try &.["primaryContents"] ||
-      t["continuationContents"]?
-  }.try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? }
-    .try &.["contents"]
-    .as_a.each { |c|
-      c.try &.["itemSectionRenderer"]["contents"].as_a
-        .try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a || t }
+      t["continuationContents"]? }
+    .try { |t| t["sectionListRenderer"]? || t["sectionListContinuation"]? }
+    .try &.["contents"].as_a
+    .each { |c| c.try &.["itemSectionRenderer"]["contents"].as_a
+      .try { |t| t[0]?.try &.["shelfRenderer"]?.try &.["content"]["expandedShelfContentsRenderer"]?.try &.["items"].as_a ||
+        t[0]?.try &.["gridRenderer"]?.try &.["items"].as_a || t }
       .each { |item|
         if i = item["videoRenderer"]?
           video_id = i["videoId"].as_s
           title = i["title"].try { |t| t["simpleText"]?.try &.as_s || t["runs"]?.try &.as_a.map(&.["text"].as_s).join("") } || ""
 
           author_info = i["ownerText"]?.try &.["runs"].as_a[0]?
-          author = author_info.try &.["text"].as_s || ""
-          author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || ""
+          author = author_info.try &.["text"].as_s || author_fallback || ""
+          author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || ""
 
           published = i["publishedTimeText"]?.try &.["simpleText"]?.try { |t| decode_date(t.as_s) } || Time.local
           view_count = i["viewCountText"]?.try &.["simpleText"]?.try &.as_s.gsub(/\D+/, "").to_i64? || 0_i64
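Two behavioural changes ride along with the reflow above: the walker now descends into whichever tab is marked "selected" rather than assuming index 0, and it accepts gridRenderer item lists in addition to shelves. A toy sketch of the tab selection against a fabricated payload (real tab objects carry far more fields):

    require "json"

    tabs = JSON.parse(%([
      {"tabRenderer": {"selected": false, "content": "about"}},
      {"tabRenderer": {"selected": true, "content": "videos"}}
    ])).as_a

    selected = tabs.select(&.["tabRenderer"]?.try &.["selected"].as_bool)[0]?
    puts selected.try &.["tabRenderer"]["content"] # => videos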
- author = i["title"]["simpleText"]?.try &.as_s || "" - author_id = i["channelId"]?.try &.as_s || "" + author = i["title"]["simpleText"]?.try &.as_s || author_fallback || "" + author_id = i["channelId"]?.try &.as_s || author_id_fallback || "" author_thumbnail = i["thumbnail"]["thumbnails"]?.try &.as_a[0]?.try { |u| "https:#{u["url"]}" } || "" subscriber_count = i["subscriberCountText"]?.try &.["simpleText"]?.try &.as_s.try { |s| short_text_to_number(s.split(" ")[0]) } || 0 @@ -409,9 +407,9 @@ def extract_items(initial_data : Hash(String, JSON::Any)) video_count = i["videoCount"]?.try &.as_s.to_i || 0 playlist_thumbnail = i["thumbnails"].as_a[0]?.try &.["thumbnails"]?.try &.as_a[0]?.try &.["url"].as_s || "" - author_info = i["shortBylineText"]["runs"].as_a[0]? - author = author_info.try &.["text"].as_s || "" - author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || "" + author_info = i["shortBylineText"]?.try &.["runs"].as_a[0]? + author = author_info.try &.["text"].as_s || author_fallback || "" + author_id = author_info.try &.["navigationEndpoint"]?.try &.["browseEndpoint"]["browseId"].as_s || author_id_fallback || "" videos = i["videos"]?.try &.as_a.map do |v| v = v["childVideoRenderer"] @@ -444,297 +442,11 @@ def extract_items(initial_data : Hash(String, JSON::Any)) elsif i = item["horizontalCardListRenderer"]? elsif i = item["searchPyvRenderer"]? # Ad end - } - } + } } items end -def extract_videos_html(nodeset, ucid = nil, author_name = nil) - extract_items_html(nodeset, ucid, author_name).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo)) -end - -def extract_items_html(nodeset, ucid = nil, author_name = nil) - # TODO: Make this a 'CommonItem', so it makes more sense to be used here - items = [] of SearchItem - - nodeset.each do |node| - anchor = node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a)) - if !anchor - next - end - title = anchor.content.strip - id = anchor["href"] - - if anchor["href"].starts_with? "https://www.googleadservices.com" - next - end - - author_id = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.["href"].split("/")[-1] || ucid || "" - author = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.content.strip || author_name || "" - description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")])).try &.to_s || "" - - tile = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-tile")])) - if !tile - next - end - - case tile["class"] - when .includes? "yt-lockup-playlist" - plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"] - - anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-meta")]/a)) - - if !anchor - anchor = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/a)) - end - - video_count = node.xpath_node(%q(.//span[@class="formatted-video-count-label"]/b)) || - node.xpath_node(%q(.//span[@class="formatted-video-count-label"])) - if video_count - video_count = video_count.content - - if video_count == "50+" - author = "YouTube" - author_id = "UC-9-kyTW8ZkZNDHQJ6FgpwQ" - end - - video_count = video_count.gsub(/\D/, "").to_i? 
@@ -444,297 +442,11 @@ def extract_items(initial_data : Hash(String, JSON::Any))
         elsif i = item["horizontalCardListRenderer"]?
         elsif i = item["searchPyvRenderer"]? # Ad
         end
-      }
-    }
+      } }
 
   items
 end
 
-def extract_videos_html(nodeset, ucid = nil, author_name = nil)
-  extract_items_html(nodeset, ucid, author_name).select(&.is_a?(SearchVideo)).map(&.as(SearchVideo))
-end
-
-def extract_items_html(nodeset, ucid = nil, author_name = nil)
-  # TODO: Make this a 'CommonItem', so it makes more sense to be used here
-  items = [] of SearchItem
-
-  nodeset.each do |node|
-    anchor = node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
-    if !anchor
-      next
-    end
-    title = anchor.content.strip
-    id = anchor["href"]
-
-    if anchor["href"].starts_with? "https://www.googleadservices.com"
-      next
-    end
-
-    author_id = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.["href"].split("/")[-1] || ucid || ""
-    author = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-byline")]/a)).try &.content.strip || author_name || ""
-    description_html = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-description")])).try &.to_s || ""
-
-    tile = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-tile")]))
-    if !tile
-      next
-    end
-
-    case tile["class"]
-    when .includes? "yt-lockup-playlist"
-      plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
-
-      anchor = node.xpath_node(%q(.//div[contains(@class, "yt-lockup-meta")]/a))
-
-      if !anchor
-        anchor = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/a))
-      end
-
-      video_count = node.xpath_node(%q(.//span[@class="formatted-video-count-label"]/b)) ||
-        node.xpath_node(%q(.//span[@class="formatted-video-count-label"]))
-      if video_count
-        video_count = video_count.content
-
-        if video_count == "50+"
-          author = "YouTube"
-          author_id = "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
-        end
-
-        video_count = video_count.gsub(/\D/, "").to_i?
-      end
-      video_count ||= 0
-
-      videos = [] of SearchPlaylistVideo
-      node.xpath_nodes(%q(.//*[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
-        anchor = video.xpath_node(%q(.//a))
-        if anchor
-          video_title = anchor.content.strip
-          id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
-        end
-        video_title ||= ""
-        id ||= ""
-
-        anchor = video.xpath_node(%q(.//span/span))
-        if anchor
-          length_seconds = decode_length_seconds(anchor.content)
-        end
-        length_seconds ||= 0
-
-        videos << SearchPlaylistVideo.new(
-          video_title,
-          id,
-          length_seconds
-        )
-      end
-
-      playlist_thumbnail = node.xpath_node(%q(.//span/img)).try &.["data-thumb"]?
-      playlist_thumbnail ||= node.xpath_node(%q(.//span/img)).try &.["src"]
-
-      items << SearchPlaylist.new(
-        title: title,
-        id: plid,
-        author: author,
-        ucid: author_id,
-        video_count: video_count,
-        videos: videos,
-        thumbnail: playlist_thumbnail
-      )
-    when .includes? "yt-lockup-channel"
-      author = title.strip
-
-      ucid = node.xpath_node(%q(.//button[contains(@class, "yt-uix-subscription-button")])).try &.["data-channel-external-id"]?
-      ucid ||= id.split("/")[-1]
-
-      author_thumbnail = node.xpath_node(%q(.//div/span/img)).try &.["data-thumb"]?
-      author_thumbnail ||= node.xpath_node(%q(.//div/span/img)).try &.["src"]
-      if author_thumbnail
-        author_thumbnail = URI.parse(author_thumbnail)
-        author_thumbnail.scheme = "https"
-        author_thumbnail = author_thumbnail.to_s
-      end
-
-      author_thumbnail ||= ""
-
-      subscriber_count = node.xpath_node(%q(.//span[contains(@class, "subscriber-count")]))
-        .try &.["title"].try { |text| short_text_to_number(text) } || 0
-
-      video_count = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li)).try &.content.split(" ")[0].gsub(/\D/, "").to_i?
-
-      items << SearchChannel.new(
-        author: author,
-        ucid: ucid,
-        author_thumbnail: author_thumbnail,
-        subscriber_count: subscriber_count,
-        video_count: video_count || 0,
-        description_html: description_html,
-        auto_generated: video_count ? false : true,
-      )
-    else
-      id = id.lchop("/watch?v=")
-
-      metadata = node.xpath_node(%q(.//div[contains(@class,"yt-lockup-meta")]/ul))
-
-      published = metadata.try &.xpath_node(%q(.//li[contains(text(), " ago")])).try { |node| decode_date(node.content.sub(/^[a-zA-Z]+ /, "")) }
-      published ||= metadata.try &.xpath_node(%q(.//span[@data-timestamp])).try { |node| Time.unix(node["data-timestamp"].to_i64) }
-      published ||= Time.utc
-
-      view_count = metadata.try &.xpath_node(%q(.//li[contains(text(), " views")])).try &.content.gsub(/\D/, "").to_i64?
-      view_count ||= 0_i64
-
-      length_seconds = node.xpath_node(%q(.//span[@class="video-time"])).try { |node| decode_length_seconds(node.content) }
-      length_seconds ||= -1
-
-      live_now = node.xpath_node(%q(.//span[contains(@class, "yt-badge-live")])) ? true : false
-      premium = node.xpath_node(%q(.//span[text()="Premium"])) ? true : false
-
-      if !premium || node.xpath_node(%q(.//span[contains(text(), "Free episode")]))
-        paid = false
-      else
-        paid = true
-      end
-
-      premiere_timestamp = node.xpath_node(%q(.//ul[@class="yt-lockup-meta-info"]/li/span[@class="localized-date"])).try &.["data-timestamp"]?.try &.to_i64?
-      if premiere_timestamp
-        premiere_timestamp = Time.unix(premiere_timestamp)
-      end
-
-      items << SearchVideo.new(
-        title: title,
-        id: id,
-        author: author,
-        ucid: author_id,
-        published: published,
-        views: view_count,
-        description_html: description_html,
-        length_seconds: length_seconds,
-        live_now: live_now,
-        paid: paid,
-        premium: premium,
-        premiere_timestamp: premiere_timestamp
-      )
-    end
-  end
-
-  return items
-end
-
-def extract_shelf_items(nodeset, ucid = nil, author_name = nil)
-  items = [] of SearchPlaylist
-
-  nodeset.each do |shelf|
-    shelf_anchor = shelf.xpath_node(%q(.//h2[contains(@class, "branded-page-module-title")]))
-    next if !shelf_anchor
-
-    title = shelf_anchor.xpath_node(%q(.//span[contains(@class, "branded-page-module-title-text")])).try &.content.strip
-    title ||= ""
-
-    id = shelf_anchor.xpath_node(%q(.//a)).try &.["href"]
-    next if !id
-
-    shelf_is_playlist = false
-    videos = [] of SearchPlaylistVideo
-
-    shelf.xpath_nodes(%q(.//ul[contains(@class, "yt-uix-shelfslider-list") or contains(@class, "expanded-shelf-content-list")]/li)).each do |child_node|
-      type = child_node.xpath_node(%q(./div))
-      next if !type
-
-      case type["class"]
-      when .includes? "yt-lockup-video"
-        shelf_is_playlist = true
-
-        anchor = child_node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
-        if anchor
-          video_title = anchor.content.strip
-          video_id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
-        end
-        video_title ||= ""
-        video_id ||= ""
-
-        anchor = child_node.xpath_node(%q(.//span[@class="video-time"]))
-        if anchor
-          length_seconds = decode_length_seconds(anchor.content)
-        end
-        length_seconds ||= 0
-
-        videos << SearchPlaylistVideo.new(
-          video_title,
-          video_id,
-          length_seconds
-        )
-      when .includes? "yt-lockup-playlist"
-        anchor = child_node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
-        if anchor
-          playlist_title = anchor.content.strip
-          params = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)
-          plid = params["list"]
-        end
-        playlist_title ||= ""
-        plid ||= ""
-
-        playlist_thumbnail = child_node.xpath_node(%q(.//span/img)).try &.["data-thumb"]?
-        playlist_thumbnail ||= child_node.xpath_node(%q(.//span/img)).try &.["src"]
-
-        video_count = child_node.xpath_node(%q(.//span[@class="formatted-video-count-label"]/b)) ||
-          child_node.xpath_node(%q(.//span[@class="formatted-video-count-label"]))
-        if video_count
-          video_count = video_count.content.gsub(/\D/, "").to_i?
-        end
-        video_count ||= 50
-
-        videos = [] of SearchPlaylistVideo
-        child_node.xpath_nodes(%q(.//*[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
-          anchor = video.xpath_node(%q(.//a))
-          if anchor
-            video_title = anchor.content.strip
-            id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
-          end
-          video_title ||= ""
-          id ||= ""
-
-          anchor = video.xpath_node(%q(.//span/span))
-          if anchor
-            length_seconds = decode_length_seconds(anchor.content)
-          end
-          length_seconds ||= 0
-
-          videos << SearchPlaylistVideo.new(
-            video_title,
-            id,
-            length_seconds
-          )
-        end
-
-        items << SearchPlaylist.new(
-          title: playlist_title,
-          id: plid,
-          author: author_name,
-          ucid: ucid,
-          video_count: video_count,
-          videos: videos,
-          thumbnail: playlist_thumbnail
-        )
-      else
-        next # Skip
-      end
-    end
-
-    if shelf_is_playlist
-      plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
-
-      items << SearchPlaylist.new(
-        title: title,
-        id: plid,
-        author: author_name,
-        ucid: ucid,
-        video_count: videos.size,
-        videos: videos,
-        thumbnail: "https://i.ytimg.com/vi/#{videos[0].id}/mqdefault.jpg"
-      )
-    end
-  end
-
-  return items
-end
-
 def check_enum(db, logger, enum_name, struct_type = nil)
   return # TODO
 
   if !db.query_one?("SELECT true FROM pg_type WHERE typname = $1", enum_name, as: Bool)
diff --git a/src/invidious/search.cr b/src/invidious/search.cr
index b4bd6226..92baed0b 100644
--- a/src/invidious/search.cr
+++ b/src/invidious/search.cr
@@ -243,7 +243,8 @@ def channel_search(query, page, channel)
   response = YT_POOL.client &.get(url)
   initial_data = JSON.parse(response.body).as_a.find &.["response"]?
   return 0, [] of SearchItem if !initial_data
-  items = extract_items(initial_data.as_h)
+  author = initial_data["response"]?.try &.["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s
+  items = extract_items(initial_data.as_h, author, ucid)
 
   return items.size, items
 end
diff --git a/src/invidious/users.cr b/src/invidious/users.cr
index ba15692c..f3cfafa3 100644
--- a/src/invidious/users.cr
+++ b/src/invidious/users.cr
@@ -286,7 +286,7 @@ end
 #   headers = HTTP::Headers.new
 #   headers["Cookie"] = env_headers["Cookie"]
 #
-#   html = YT_POOL.client &.get("/view_all_playlists?disable_polymer=1", headers)
+#   html = YT_POOL.client &.get("/view_all_playlists", headers)
 #
 #   cookies = HTTP::Cookies.from_headers(headers)
 #   html.cookies.each do |cookie|
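For reference, the author fallback threaded through channel_search and get_latest_videos is pulled from the channel metadata embedded in the response. A toy payload mirroring just the path those call sites walk (a real response carries far more fields):

    require "json"

    response = JSON.parse(%({"metadata": {"channelMetadataRenderer": {"title": "Some Channel"}}}))

    author = response["metadata"]?.try &.["channelMetadataRenderer"]?.try &.["title"]?.try &.as_s
    puts author # => Some Channel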