mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-25 00:59:52 -05:00
predup youtube watch pages
This commit is contained in:
parent
376f3139ce
commit
72cb2a6e4a
@ -244,7 +244,7 @@ class BrozzlerWorker:
|
|||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
ydl_fetches = None
|
ydl_fetches = None
|
||||||
outlinks = set()
|
outlinks = set()
|
||||||
if enable_youtube_dl and not page.url.lower().endswith(".pdf"):
|
if enable_youtube_dl and ydl.should_ytdlp(page, site):
|
||||||
try:
|
try:
|
||||||
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_fetches, outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
|
@ -27,11 +27,62 @@ import os
|
|||||||
import json
|
import json
|
||||||
import doublethink
|
import doublethink
|
||||||
import datetime
|
import datetime
|
||||||
|
from cassandra import ReadTimeout
|
||||||
|
from cassandra.cluster import Cluster
|
||||||
|
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
thread_local = threading.local()
|
thread_local = threading.local()
|
||||||
|
|
||||||
|
|
||||||
|
def _timestamp4datetime(timestamp):
|
||||||
|
"""split `timestamp` into a tuple of 6 integers.
|
||||||
|
|
||||||
|
:param timestamp: full-length timestamp.
|
||||||
|
:type timestamp: bytes
|
||||||
|
"""
|
||||||
|
timestamp = timestamp[:14]
|
||||||
|
return (
|
||||||
|
int(timestamp[:-10]),
|
||||||
|
int(timestamp[-10:-8]),
|
||||||
|
int(timestamp[-8:-6]),
|
||||||
|
int(timestamp[-6:-4]),
|
||||||
|
int(timestamp[-4:-2]),
|
||||||
|
int(timestamp[-2:])
|
||||||
|
)
|
||||||
|
|
||||||
|
def should_ytdlp(page, site):
|
||||||
|
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||||
|
logging.debug("ytdlp_url: %s", ytdlp_url)
|
||||||
|
ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
|
||||||
|
logging.debug("ytdlp_seed: %s", ytdlp_seed)
|
||||||
|
|
||||||
|
if ".pdf" in ytdlp_url.lower():
|
||||||
|
return False
|
||||||
|
|
||||||
|
if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
|
||||||
|
# connect to bmiller-dev cluster, keyspace video; we can modify default timeout in cassandra.yaml
|
||||||
|
cluster = Cluster(["207.241.235.189"], protocol_version=5)
|
||||||
|
session = cluster.connect("video")
|
||||||
|
containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
|
||||||
|
future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
|
||||||
|
logging.debug(f"s:{ytdlp_seed}, {ytdlp_url}")
|
||||||
|
try:
|
||||||
|
record = future.result()
|
||||||
|
logging.debug("record: %s", record)
|
||||||
|
except ReadTimeout:
|
||||||
|
log.exception("Query timed out:")
|
||||||
|
if record and record.video_timestamp:
|
||||||
|
logging.debug(f"video_timestamp: {record.video_timestamp}")
|
||||||
|
ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
|
||||||
|
logging.debug("ytdlp_timestamp: %s", ytdlp_timestamp)
|
||||||
|
time_diff = datetime.now() - ytdlp_timestamp
|
||||||
|
# TODO: make veriable for timedelta
|
||||||
|
if time_diff > timedelta(days = 90):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
self.extra_headers = extra_headers
|
self.extra_headers = extra_headers
|
||||||
|
Loading…
x
Reference in New Issue
Block a user