Refactor search extractor

Omar Roth 2019-08-21 18:23:20 -05:00
parent e768e1e277
commit 9f9cc1ffb5
3 changed files with 48 additions and 59 deletions

View File

@@ -5167,7 +5167,7 @@ get "/vi/:id/:name" do |env|
   end
 end
 
-# Undocumented, creates anonymous playlist with specified 'video_ids'
+# Undocumented, creates anonymous playlist with specified 'video_ids', max 50 videos
 get "/watch_videos" do |env|
   client = make_client(YT_URL)

View File

@@ -387,14 +387,15 @@ def fetch_channel_playlists(ucid, author, auto_generated, continuation, sort_by)
     html = XML.parse_html(json["content_html"].as_s)
     nodeset = html.xpath_nodes(%q(//li[contains(@class, "feed-item-container")]))
-  else
-    url = "/channel/#{ucid}/playlists?disable_polymer=1&flow=list"
-    if auto_generated
-      url += "&view=50"
-    else
-      url += "&view=1"
-    end
+  elsif auto_generated
+    url = "/channel/#{ucid}"
+
+    response = client.get(url)
+    html = XML.parse_html(response.body)
+
+    nodeset = html.xpath_nodes(%q(//ul[@id="browse-items-primary"]/li[contains(@class, "feed-item-container")]))
+  else
+    url = "/channel/#{ucid}/playlists?disable_polymer=1&flow=list&view=1"
 
     case sort_by
     when "last", "last_added"

View File

@ -442,47 +442,20 @@ def extract_items(nodeset, ucid = nil, author_name = nil)
else else
id = id.lchop("/watch?v=") id = id.lchop("/watch?v=")
metadata = node.xpath_nodes(%q(.//div[contains(@class,"yt-lockup-meta")]/ul/li)) metadata = node.xpath_node(%q(.//div[contains(@class,"yt-lockup-meta")]/ul))
begin published = metadata.try &.xpath_node(%q(.//li[contains(text(), " ago")])).try { |node| decode_date(node.content.sub(/^[a-zA-Z]+ /, "")) }
published = decode_date(metadata[0].content.lchop("Streamed ").lchop("Starts ")) published ||= metadata.try &.xpath_node(%q(.//span[@data-timestamp])).try { |node| Time.unix(node["data-timestamp"].to_i64) }
rescue ex
end
begin
published ||= Time.unix(metadata[0].xpath_node(%q(.//span)).not_nil!["data-timestamp"].to_i64)
rescue ex
end
published ||= Time.utc published ||= Time.utc
begin view_count = metadata.try &.xpath_node(%q(.//li[contains(text(), " views")])).try &.content.gsub(/\D/, "").to_i64?
view_count = metadata[0].content.rchop(" watching").delete(",").try &.to_i64?
rescue ex
end
begin
view_count ||= metadata.try &.[1].content.delete("No views,").try &.to_i64?
rescue ex
end
view_count ||= 0_i64 view_count ||= 0_i64
length_seconds = node.xpath_node(%q(.//span[@class="video-time"])) length_seconds = node.xpath_node(%q(.//span[@class="video-time"])).try { |node| decode_length_seconds(node.content) }
if length_seconds length_seconds ||= -1
length_seconds = decode_length_seconds(length_seconds.content)
else
length_seconds = -1
end
live_now = node.xpath_node(%q(.//span[contains(@class, "yt-badge-live")])) live_now = node.xpath_node(%q(.//span[contains(@class, "yt-badge-live")])) ? true : false
if live_now premium = node.xpath_node(%q(.//span[text()="Premium"])) ? true : false
live_now = true
else
live_now = false
end
if node.xpath_node(%q(.//span[text()="Premium"]))
premium = true
else
premium = false
end
if !premium || node.xpath_node(%q(.//span[contains(text(), "Free episode")])) if !premium || node.xpath_node(%q(.//span[contains(text(), "Free episode")]))
paid = false paid = false
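
A note on the pattern in the hunk above: the rewrite swaps the begin/rescue blocks for nil-safe `.try` chains, where each step yields nil when a node or attribute is missing and `||=` supplies the fallback. A minimal illustration of that idiom, using hypothetical markup rather than Invidious code:

require "xml"

# Made-up lockup-meta markup, only to exercise the `.try` chain.
html = XML.parse_html(%(<div class="yt-lockup-meta"><ul><li><span data-timestamp="1566432000"></span></li></ul></div>))

metadata = html.xpath_node(%q(//div[contains(@class,"yt-lockup-meta")]/ul))
published = metadata.try &.xpath_node(%q(.//span[@data-timestamp])).try { |node| Time.unix(node["data-timestamp"].to_i64) }
published ||= Time.utc

puts published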
@@ -520,26 +493,18 @@ def extract_shelf_items(nodeset, ucid = nil, author_name = nil)
   nodeset.each do |shelf|
     shelf_anchor = shelf.xpath_node(%q(.//h2[contains(@class, "branded-page-module-title")]))
-    if !shelf_anchor
-      next
-    end
+    next if !shelf_anchor
 
-    title = shelf_anchor.xpath_node(%q(.//span[contains(@class, "branded-page-module-title-text")]))
-    if title
-      title = title.content.strip
-    end
+    title = shelf_anchor.xpath_node(%q(.//span[contains(@class, "branded-page-module-title-text")])).try &.content.strip
     title ||= ""
 
     id = shelf_anchor.xpath_node(%q(.//a)).try &.["href"]
-    if !id
-      next
-    end
+    next if !id
 
-    is_playlist = false
+    shelf_is_playlist = false
     videos = [] of SearchPlaylistVideo
 
-    shelf.xpath_nodes(%q(.//ul[contains(@class, "yt-uix-shelfslider-list")]/li)).each do |child_node|
+    shelf.xpath_nodes(%q(.//ul[contains(@class, "yt-uix-shelfslider-list") or contains(@class, "expanded-shelf-content-list")]/li)).each do |child_node|
       type = child_node.xpath_node(%q(./div))
       if !type
         next
@@ -547,7 +512,7 @@ def extract_shelf_items(nodeset, ucid = nil, author_name = nil)
       case type["class"]
       when .includes? "yt-lockup-video"
-        is_playlist = true
+        shelf_is_playlist = true
 
         anchor = child_node.xpath_node(%q(.//h3[contains(@class, "yt-lockup-title")]/a))
         if anchor
@@ -588,19 +553,42 @@ def extract_shelf_items(nodeset, ucid = nil, author_name = nil)
         end
         video_count ||= 50
 
+        videos = [] of SearchPlaylistVideo
+        child_node.xpath_nodes(%q(.//*[contains(@class, "yt-lockup-playlist-items")]/li)).each do |video|
+          anchor = video.xpath_node(%q(.//a))
+          if anchor
+            video_title = anchor.content.strip
+            id = HTTP::Params.parse(URI.parse(anchor["href"]).query.not_nil!)["v"]
+          end
+          video_title ||= ""
+          id ||= ""
+
+          anchor = video.xpath_node(%q(.//span/span))
+          if anchor
+            length_seconds = decode_length_seconds(anchor.content)
+          end
+          length_seconds ||= 0
+
+          videos << SearchPlaylistVideo.new(
+            video_title,
+            id,
+            length_seconds
+          )
+        end
+
         items << SearchPlaylist.new(
           playlist_title,
           plid,
           author_name,
           ucid,
           video_count,
-          Array(SearchPlaylistVideo).new,
+          videos,
           playlist_thumbnail
         )
       end
     end
 
-    if is_playlist
+    if shelf_is_playlist
       plid = HTTP::Params.parse(URI.parse(id).query.not_nil!)["list"]
 
       items << SearchPlaylist.new(
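
The id fields in this hunk are pulled out of watch URLs with URI plus HTTP::Params, the same call chain used for both the "v" and "list" parameters above. A small standalone illustration with a made-up href:

require "http/params"
require "uri"

href = "/watch?v=dQw4w9WgXcQ&list=PLxxxxxxxxxxxxxxxx" # hypothetical shelf anchor href
params = HTTP::Params.parse(URI.parse(href).query.not_nil!)

puts params["v"]    # video id, as used for SearchPlaylistVideo
puts params["list"] # playlist id, as used for SearchPlaylist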