diff --git a/brozzler/worker.py b/brozzler/worker.py
index 69ac7d7..6614264 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -252,7 +252,7 @@ class BrozzlerWorker:
             except brozzler.PageInterstitialShown:
                 self.logger.info("page interstitial shown (http auth): %s", page)
 
-        if enable_youtube_dl and ydl.should_ytdlp(page, site):
+        if enable_youtube_dl and ydl.should_ytdlp(self, page, site):
             try:
                 ydl_outlinks = ydl.do_youtube_dl(self, site, page)
             except brozzler.ReachedLimit as e:
diff --git a/brozzler/ydl.py b/brozzler/ydl.py
index 9201119..e3fa21a 100644
--- a/brozzler/ydl.py
+++ b/brozzler/ydl.py
@@ -62,9 +62,10 @@ def _timestamp4datetime(timestamp):
         int(timestamp[-2:])
     )
 
-def should_ytdlp(page, site):
+def should_ytdlp(worker, page, site):
     ytdlp_url = page.redirect_url if page.redirect_url else page.url
-    ytdlp_seed = site.get("warcprox-meta", {}).get("metadata", {}).get("ait_seed_id", "")
+    ytdlp_seed = site.seed_id if site.seed_id else None
+    # ytdlp_seed = site.get(site.id).pluck("metadata", "ait_seed_id").default(None) if site.rr else None ???
     logging.info("ytdlp_seed: %s", ytdlp_seed)
 
     if ytdlp_seed and "youtube.com/watch?v" in ytdlp_url:
diff --git a/setup.py b/setup.py
index 1825f26..1c247c5 100644
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ def find_package_data(package):
 
 setuptools.setup(
     name="brozzler",
-    version="1.5.47a1",
+    version="1.5.47a2",
     description="Distributed web crawling with browsers",
     url="https://github.com/internetarchive/brozzler",
     author="Noah Levitt",
@@ -67,6 +67,7 @@ setuptools.setup(
     install_requires=[
         "PyYAML>=5.1",
         "yt_dlp<2023.11.16",
+        "cassandra-driver==3.29.1",
         "reppy==0.3.4",
         "requests>=2.21",
         "websocket-client>=0.39.0,<=0.48.0",