mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-25 00:59:52 -05:00
Merge branch 'video_predup' into qa
This commit is contained in:
commit
707ee8bd8e
@ -252,7 +252,7 @@ class BrozzlerWorker:
|
|||||||
except brozzler.PageInterstitialShown:
|
except brozzler.PageInterstitialShown:
|
||||||
self.logger.info("page interstitial shown (http auth): %s", page)
|
self.logger.info("page interstitial shown (http auth): %s", page)
|
||||||
|
|
||||||
if enable_youtube_dl and ydl.should_ytdlp(self, page, site):
|
if enable_youtube_dl and ydl.should_ytdlp(page, site):
|
||||||
try:
|
try:
|
||||||
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
|
@ -62,10 +62,10 @@ def _timestamp4datetime(timestamp):
|
|||||||
int(timestamp[-2:])
|
int(timestamp[-2:])
|
||||||
)
|
)
|
||||||
|
|
||||||
def should_ytdlp(worker, page, site):
|
def should_ytdlp(page, site):
|
||||||
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
ytdlp_url = page.redirect_url if page.redirect_url else page.url
|
||||||
ytdlp_seed = site["metadata"]["ait_seed_id"]
|
ytdlp_seed = site["metadata"]["ait_seed_id"]
|
||||||
logging.info("checking containing page %r, site %r", ytdlp_url, ytdlp_seed)
|
logging.info("checking containing page %s for seed %s", ytdlp_url, ytdlp_seed)
|
||||||
|
|
||||||
if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
|
if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
|
||||||
logging.info("found youtube watch page %r", ytdlp_url)
|
logging.info("found youtube watch page %r", ytdlp_url)
|
||||||
@ -74,19 +74,23 @@ def should_ytdlp(worker, page, site):
|
|||||||
session = cluster.connect("video")
|
session = cluster.connect("video")
|
||||||
containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
|
containing_page_query = "SELECT * from videos where scope=%s and containing_page_url=%s LIMIT 1"
|
||||||
future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
|
future = session.execute_async(containing_page_query, [f"s:{ytdlp_seed}", ytdlp_url])
|
||||||
record = None
|
|
||||||
try:
|
try:
|
||||||
record = future.result()
|
rows = future.result()
|
||||||
logging.info("record: %s", record)
|
|
||||||
except ReadTimeout:
|
except ReadTimeout:
|
||||||
log.exception("Query timed out:")
|
logging.exception("Query timed out:")
|
||||||
if record and record.video_timestamp:
|
|
||||||
logging.info(f"video_timestamp: {record.video_timestamp}")
|
if len(rows.current_rows) == 0:
|
||||||
ytdlp_timestamp = datetime(*_timestamp4datetime(record.video_timestamp))
|
logging.info("no results returned from videos query")
|
||||||
|
return True
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
logging.info("video query found %r", row)
|
||||||
|
ytdlp_timestamp = datetime.datetime(*_timestamp4datetime(row.video_timestamp))
|
||||||
logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
|
logging.info("ytdlp_timestamp: %s", ytdlp_timestamp)
|
||||||
time_diff = datetime.now() - ytdlp_timestamp
|
time_diff = datetime.datetime.now() - ytdlp_timestamp
|
||||||
# TODO: make variable for timedelta
|
# TODO: make variable for timedelta
|
||||||
if time_diff < timedelta(days = 90):
|
if time_diff < datetime.timedelta(days = 90):
|
||||||
|
logging.info("skipping ytdlp for %s since there's a recent capture", row.containing_page_url)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
Loading…
x
Reference in New Issue
Block a user