Add language detection for top videos

This commit is contained in:
Omar Roth 2018-03-16 19:36:49 -05:00
parent dde7a643e9
commit 997449ab4b
3 changed files with 38 additions and 5 deletions

View File

@ -15,6 +15,9 @@ dependencies:
pg: pg:
github: will/crystal-pg github: will/crystal-pg
branch: master branch: master
detect_language:
github: omarroth/detectlanguage-crystal
branch: v0.24.2
crystal: 0.24.2 crystal: 0.24.2

View File

@ -25,6 +25,7 @@ class Config
dbname: String, dbname: String,
), ),
redirect: Bool, redirect: Bool,
dl_api_key: String | Nil,
}) })
end end
@ -229,7 +230,7 @@ def decrypt_signature(a)
return a.join("") return a.join("")
end end
def rank_videos(db, n) def rank_videos(db, n, pool, filter)
top = [] of {Float64, String} top = [] of {Float64, String}
db.query("SELECT id, wilson_score, published FROM videos WHERE views > 5000 ORDER BY published DESC LIMIT 10000") do |rs| db.query("SELECT id, wilson_score, published FROM videos WHERE views > 5000 ORDER BY published DESC LIMIT 10000") do |rs|
@ -250,9 +251,31 @@ def rank_videos(db, n)
top.reverse! top.reverse!
top = top.map { |a, b| b } top = top.map { |a, b| b }
# Return top if filter
language_list = [] of String
top.each do |id|
if language_list.size == n
break
else
client = get_client(pool)
video = get_video(id, client, db)
pool << client
description = XML.parse(video.description)
content = [video.title, description.content].join(" ")
results = DetectLanguage.detect(content)
if results[0].language == "en"
language_list << id
end
end
end
return language_list
else
return top[0..n - 1] return top[0..n - 1]
end end
end
def make_client(url) def make_client(url)
context = OpenSSL::SSL::Context::Client.new context = OpenSSL::SSL::Context::Client.new

View File

@ -14,6 +14,7 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
require "detect_language"
require "kemal" require "kemal"
require "option_parser" require "option_parser"
require "pg" require "pg"
@ -135,8 +136,14 @@ end
top_videos = [] of Video top_videos = [] of Video
spawn do spawn do
if CONFIG.dl_api_key
DetectLanguage.configure do |config|
config.api_key = CONFIG.dl_api_key.not_nil!
end
end
loop do loop do
top = rank_videos(PG_DB, 40) top = rank_videos(PG_DB, 40, youtube_pool, true)
if top.size > 0 if top.size > 0
args = arg_array(top) args = arg_array(top)