Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-08-07 22:12:15 -04:00

more updates post-QA-deploy

Commit 3adae49be7, parent 3d377e7943
1 changed file with 12 additions and 16 deletions
@@ -114,11 +114,11 @@ class VideoDataClient:
         if account_id and seed_id and containing_page_url:
             # check for postgres query for most recent record
             pg_query = (
-                "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s LIMIT 1",
+                "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s ORDER BY video_timestamp LIMIT 1",
                 (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))),
             )
             try:
-                results = self._execute_query(pg_query, fetchall=True)
+                results = self._execute_pg_query(pg_query, fetchall=True)
             except Exception as e:
                 logger.warn("postgres query failed: %s", e)
                 results = []
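The hunk above builds a (sql, params) tuple and hands it to the renamed _execute_pg_query along with a fetchall flag (the next hunk also passes row_factory=psycopg.rows.scalar_row). The helper's real implementation is not part of this diff; the following is only a minimal sketch of one plausible shape, assuming a per-call psycopg 3 connection and a hypothetical self._conninfo attribute.

import psycopg
import psycopg.rows


class VideoDataClient:
    def __init__(self, conninfo):
        # hypothetical: a libpq connection string such as "dbname=... host=..."
        self._conninfo = conninfo

    def _execute_pg_query(self, pg_query, row_factory=None, fetchall=False):
        # pg_query is the (sql, params) tuple built by the callers in this diff
        sql, params = pg_query
        with psycopg.connect(self._conninfo) as conn:
            with conn.cursor(row_factory=row_factory or psycopg.rows.tuple_row) as cur:
                cur.execute(sql, params)
                if cur.description is None:
                    return None  # statement produced no result set (e.g. a plain INSERT)
                return cur.fetchall() if fetchall else cur.fetchone()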
@@ -149,7 +149,7 @@ class VideoDataClient:
                 ),
             )
             try:
-                results = self._execute_query(
+                results = self._execute_pg_query(
                     pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True
                 )
             except Exception as e:
@@ -162,16 +162,15 @@ class VideoDataClient:
         return results

     def create_video_capture_record(self, video_capture_record):
-        # note: brozzler postcrawl step 72 includes info from crawl-log — watch for differences!
+        # NOTE: we want to do this in brozzler postcrawl for now
+        # WIP
         # TODO: needs added fields added to postgres table, refinement
         pg_query = (
             f"INSERT INTO video ({VideoCaptureRecord - items}) VALUES (%s, %s, ...)",
             VideoCaptureRecord - values,
         )
         try:
-            results = self._execute_query(
-                pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True
-            )
+            results = self._execute_pg_query(pg_query)
         except Exception as e:
             logger.warn("postgres query failed: %s", e)
             results = []
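The INSERT in this hunk is still a placeholder (the TODO and the {VideoCaptureRecord - items} / VideoCaptureRecord - values stand-ins make that explicit), and VideoCaptureRecord's definition is not included in the diff. If it turns out to be a dataclass, the column list and parameter tuple could be derived along these lines; build_insert is a hypothetical helper name, not part of the commit.

import dataclasses


def build_insert(record):
    # derive column names and %s placeholders from a dataclass instance
    fields = [f.name for f in dataclasses.fields(record)]
    columns = ", ".join(fields)
    placeholders = ", ".join(["%s"] * len(fields))
    sql = f"INSERT INTO video ({columns}) VALUES ({placeholders})"
    params = tuple(getattr(record, name) for name in fields)
    return (sql, params)  # same (sql, params) shape the _execute_pg_query calls consume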
@@ -444,9 +443,10 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints):
     return ydl


-def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None):
+# new maybe? def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None):
+def _remember_videos(page, pushed_videos=None):
     """
-    Saves info about videos captured by yt-dlp in `page.videos` and postgres.
+    Saves info about videos captured by yt-dlp in `page.videos`
     """
     if "videos" not in page:
         page.videos = []
@@ -480,13 +480,11 @@ def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None):
                 urlcanon.aggressive(ydl.url)
             ) # probably?
             video_record.video_url = pushed_video["url"]
-            # note: ie_result may not be correct when multiple videos present
+            # note: NEW! ie_result may not be correct when multiple videos present
             video_record.video_title = ie_result.get("title")
             video_record.video_display_id = ie_result.get("display_id")
             video_record.video_resolution = ie_result.get("resolution")
             video_record.video_capture_status = None # "recrawl" maybe
-
-            worker._video_data.save_video_capture_record(video_record)
             """
             logger.debug("embedded video", video=video)
             page.videos.append(video)
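With the video_record block above staying commented out and the postgres save call removed, the surviving body of _remember_videos is small. Below is a self-contained sketch of that trimmed-down helper; _StubPage and the single "url" key are illustrative assumptions, since brozzler's real Page object and the full set of keys on each pushed video are not shown in this diff.

class _StubPage(dict):
    # mimics the two behaviors the function relies on:
    # membership tests ("videos" not in page) and attribute access (page.videos)
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value


def _remember_videos(page, pushed_videos=None):
    """Saves info about videos captured by yt-dlp in `page.videos`."""
    if "videos" not in page:
        page.videos = []
    for pushed_video in pushed_videos or []:
        # the real record keeps more fields; only "url" is visible in this diff
        video = {"url": pushed_video["url"]}
        page.videos.append(video)


page = _StubPage()
_remember_videos(page, pushed_videos=[{"url": "https://example.com/video.mp4"}])
assert page["videos"] == [{"url": "https://example.com/video.mp4"}]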
@@ -563,7 +561,7 @@ def _try_youtube_dl(worker, ydl, site, page):
        logger.info("ytdlp completed successfully")

        info_json = json.dumps(ie_result, sort_keys=True, indent=4)
-        _remember_videos(page, info_json, ydl.pushed_videos)
+        _remember_videos(page, ydl.pushed_videos)
        if worker._using_warcprox(site):
            logger.info(
                "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json",
@@ -610,9 +608,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
    if worker._video_data:
        captured_youtube_watch_pages = set()
        captured_youtube_watch_pages.add(
-            worker._video_data.get_video_captures_by_source(
-                site, source="youtube"
-            )
+            worker._video_data.get_video_captures(site, source="youtube")
        )
        uncaptured_youtube_watch_pages = []
        for e in ie_result.get("entries_no_dl", []):
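The last hunk collects already-captured YouTube watch pages and then walks ie_result's entries_no_dl list to find pages still needing capture. Neither the return shape of get_video_captures() nor the keys on each entries_no_dl item appear in this diff; the sketch below assumes both boil down to watch-page URL strings, and _split_watch_pages is a hypothetical name for the partitioning step.

def _split_watch_pages(ie_result, captured_urls):
    # partition entries_no_dl into pages we still need to capture
    captured = set(captured_urls)
    uncaptured = []
    for e in ie_result.get("entries_no_dl", []):
        url = e.get("webpage_url") or e.get("url")  # assumed keys on each entry
        if url and url not in captured:
            uncaptured.append(url)
    return uncaptured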