mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-21 13:14:49 -04:00
even more updates post-QA-deploy
This commit is contained in:
parent
3adae49be7
commit
f979bbf154
1 changed file with 44 additions and 21 deletions
|
@@ -108,8 +108,10 @@ class VideoDataClient:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
|
def get_recent_video_capture(self, site=None, containing_page_url=None) -> List:
|
||||||
account_id = site.account_id if site.account_id else None
|
account_id = site["account_id"] if site["account_id"] else None
|
||||||
seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
|
seed_id = (
|
||||||
|
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
||||||
|
)
|
||||||
|
|
||||||
if account_id and seed_id and containing_page_url:
|
if account_id and seed_id and containing_page_url:
|
||||||
# check for postgres query for most recent record
|
# check for postgres query for most recent record
|
||||||
|
@@ -129,8 +131,10 @@ class VideoDataClient:
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_video_captures(self, site=None, source=None) -> List[str]:
|
def get_video_captures(self, site=None, source=None) -> List[str]:
|
||||||
account_id = site.account_id if site.account_id else None
|
account_id = site["account_id"] if site["account_id"] else None
|
||||||
seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None
|
seed_id = (
|
||||||
|
site["metadata"]["ait_seed_id"] if site["metadata"]["ait_seed_id"] else None
|
||||||
|
)
|
||||||
|
|
||||||
# TODO: generalize, maybe make variable?
|
# TODO: generalize, maybe make variable?
|
||||||
# containing_page_timestamp_pattern = "2025%" # for future pre-dup additions
|
# containing_page_timestamp_pattern = "2025%" # for future pre-dup additions
|
||||||
|
@@ -465,7 +469,7 @@ def _remember_videos(page, pushed_videos=None):
|
||||||
video_record = worker._video_data.VideoCaptureRecord()
|
video_record = worker._video_data.VideoCaptureRecord()
|
||||||
video_record.crawl_job_id = site.job_id
|
video_record.crawl_job_id = site.job_id
|
||||||
video_record.is_test_crawl = True if warc_prefix_items[2] == "TEST" else False
|
video_record.is_test_crawl = True if warc_prefix_items[2] == "TEST" else False
|
||||||
video_record.seed_id = site.ait_seed_id
|
video_record.seed_id = site["metadata"]["ait_seed_id"]
|
||||||
video_record.collection_id = int(warc_prefix_items[1])
|
video_record.collection_id = int(warc_prefix_items[1])
|
||||||
video_record.containing_page_timestamp = None
|
video_record.containing_page_timestamp = None
|
||||||
video_record.containing_page_digest = None
|
video_record.containing_page_digest = None
|
||||||
|
@@ -606,23 +610,42 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints):
|
||||||
or ie_result.get("extractor") == "youtube:tab"
|
or ie_result.get("extractor") == "youtube:tab"
|
||||||
):
|
):
|
||||||
if worker._video_data:
|
if worker._video_data:
|
||||||
captured_youtube_watch_pages = set()
|
logger.info(
|
||||||
captured_youtube_watch_pages.add(
|
"checking for previously captured youtube watch pages for account %s, seed_id %s",
|
||||||
worker._video_data.get_video_captures(site, source="youtube")
|
site["account_id"],
|
||||||
|
site["metadata"]["ait_seed_id"],
|
||||||
)
|
)
|
||||||
uncaptured_youtube_watch_pages = []
|
try:
|
||||||
for e in ie_result.get("entries_no_dl", []):
|
captured_youtube_watch_pages = set()
|
||||||
# note: http needed for match
|
captured_youtube_watch_pages.update(
|
||||||
youtube_watch_url = str(
|
worker._video_data.get_video_captures(site, source="youtube")
|
||||||
urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}")
|
|
||||||
)
|
)
|
||||||
if youtube_watch_url in captured_youtube_watch_pages:
|
uncaptured_youtube_watch_pages = []
|
||||||
logger.info("skipping adding %s to outlinks", youtube_watch_url)
|
for e in ie_result.get("entries_no_dl", []):
|
||||||
continue
|
# note: http needed for match
|
||||||
uncaptured_youtube_watch_pages.append(
|
youtube_watch_url = str(
|
||||||
f"https://www.youtube.com/watch?v={e['id']}"
|
urlcanon.aggressive(
|
||||||
)
|
f"http://www.youtube.com/watch?v={e['id']}"
|
||||||
if uncaptured_youtube_watch_pages:
|
)
|
||||||
outlinks.add(uncaptured_youtube_watch_pages)
|
)
|
||||||
|
if youtube_watch_url in captured_youtube_watch_pages:
|
||||||
|
logger.info(
|
||||||
|
"skipping adding %s to yt-dlp outlinks",
|
||||||
|
youtube_watch_url,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
uncaptured_youtube_watch_pages.append(
|
||||||
|
f"https://www.youtube.com/watch?v={e['id']}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("hit exception processing worker._video_data: %s", e)
|
||||||
|
if uncaptured_youtube_watch_pages:
|
||||||
|
outlinks.update(uncaptured_youtube_watch_pages)
|
||||||
|
else:
|
||||||
|
outlinks = {
|
||||||
|
"https://www.youtube.com/watch?v=%s" % e["id"]
|
||||||
|
for e in ie_result.get("entries_no_dl", [])
|
||||||
|
}
|
||||||
|
|
||||||
# todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere)
|
# todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue