From 0227da6530b9dc211b6cf2733d2aa04618c08101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 11 Jun 2025 16:10:50 -0700 Subject: [PATCH 01/43] brozzler 1.7.0 --- pyproject.toml | 2 +- uv.lock | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85b39bb..bb19770 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "brozzler" -version = "1.6.13" +version = "1.7.0" authors = [ { name="Noah Levitt", email="nlevitt@archive.org" }, ] diff --git a/uv.lock b/uv.lock index 43739de..5999da9 100644 --- a/uv.lock +++ b/uv.lock @@ -126,7 +126,7 @@ name = "brotlicffi" version = "1.1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi" }, + { name = "cffi", marker = "implementation_name != 'cpython'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/95/9d/70caa61192f570fcf0352766331b735afa931b4c6bc9a348a0925cc13288/brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13", size = 465192, upload-time = "2023-09-14T14:22:40.707Z" } wheels = [ @@ -155,7 +155,7 @@ wheels = [ [[package]] name = "brozzler" -version = "1.6.13" +version = "1.7.0" source = { editable = "." } dependencies = [ { name = "cerberus" }, From 33f60ce60971f2f927187a0f78935b9c8e4c2e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 12 Jun 2025 12:47:04 -0700 Subject: [PATCH 02/43] Drop Python 3.8 support Python 3.8 is EOL since October. It's no longer supported by new versions of yt-dlp, limiting video capture support. It's also no longer supported by setuptools, which has complicated distribution - it's preventing us from keeping packaging configuration up to date. --- .github/workflows/python-formatting.yml | 4 +- .github/workflows/tests.yml | 2 +- README.rst | 2 +- pyproject.toml | 4 +- setup.py | 2 +- uv.lock | 94 +------------------------ 6 files changed, 9 insertions(+), 99 deletions(-) diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml index 0ee1f92..3d58462 100644 --- a/.github/workflows/python-formatting.yml +++ b/.github/workflows/python-formatting.yml @@ -15,10 +15,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.9' - name: Create virtual environment run: python -m venv .venv diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fdafbd5..4cb2028 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - version: ['3.8', '3.12'] + version: ['3.9', '3.12'] steps: - uses: actions/checkout@v4 diff --git a/README.rst b/README.rst index eb3db89..c746633 100644 --- a/README.rst +++ b/README.rst @@ -17,7 +17,7 @@ Brozzler is designed to work in conjunction with `warcprox Requirements ------------ -- Python 3.8 or later +- Python 3.9 or later - RethinkDB deployment - Chromium or Google Chrome >= version 64 diff --git a/pyproject.toml b/pyproject.toml index bb19770..2ef3a64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ maintainers = [ ] description = "Distributed web crawling with browsers" readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", "Operating System :: OS Independent", @@ -31,7 +31,7 @@ dependencies = [ "prometheus-client>=0.20.0", "structlog>=25.1.0", ] -license = { text = "Apache-2.0" } +license = "Apache-2.0" [project.optional-dependencies] yt-dlp = ["yt-dlp[default,curl-cffi]>=2024.7.25"] diff --git a/setup.py b/setup.py index 47d69f6..bd1b04b 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ setuptools.setup( classifiers=[ "Development Status :: 5 - Production/Stable", "Environment :: Console", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Internet :: WWW/HTTP", "Topic :: System :: Archiving", ], diff --git a/uv.lock b/uv.lock index 5999da9..f7c6d2b 100644 --- a/uv.lock +++ b/uv.lock @@ -1,6 +1,6 @@ version = 1 revision = 2 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.12' and implementation_name == 'cpython' and os_name != 'nt'", "python_full_version >= '3.12' and implementation_name == 'cpython' and os_name == 'nt'", @@ -87,22 +87,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/af/85/a94e5cfaa0ca449d8f91c3d6f78313ebf919a0dbd55a100c711c6e9655bc/Brotli-1.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:832436e59afb93e1836081a20f324cb185836c617659b07b129141a8426973c7", size = 2930206, upload-time = "2024-10-18T12:32:51.198Z" }, { url = "https://files.pythonhosted.org/packages/c2/f0/a61d9262cd01351df22e57ad7c34f66794709acab13f34be2675f45bf89d/Brotli-1.1.0-cp313-cp313-win32.whl", hash = "sha256:43395e90523f9c23a3d5bdf004733246fba087f2948f87ab28015f12359ca6a0", size = 333804, upload-time = "2024-10-18T12:32:52.661Z" }, { url = "https://files.pythonhosted.org/packages/7e/c1/ec214e9c94000d1c1974ec67ced1c970c148aa6b8d8373066123fc3dbf06/Brotli-1.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:9011560a466d2eb3f5a6e4929cf4a09be405c64154e12df0dd72713f6500e32b", size = 358517, upload-time = "2024-10-18T12:32:54.066Z" }, - { url = "https://files.pythonhosted.org/packages/34/1b/16114a20c0a43c20331f03431178ed8b12280b12c531a14186da0bc5b276/Brotli-1.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:efa8b278894b14d6da122a72fefcebc28445f2d3f880ac59d46c90f4c13be9a3", size = 873053, upload-time = "2023-09-07T14:04:58.335Z" }, - { url = "https://files.pythonhosted.org/packages/36/49/2afe4aa5a23a13dad4c7160ae574668eec58b3c80b56b74a826cebff7ab8/Brotli-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:03d20af184290887bdea3f0f78c4f737d126c74dc2f3ccadf07e54ceca3bf208", size = 446211, upload-time = "2023-09-07T14:04:59.928Z" }, - { url = "https://files.pythonhosted.org/packages/10/9d/6463edb80a9e0a944f70ed0c4d41330178526626d7824f729e81f78a3f24/Brotli-1.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6172447e1b368dcbc458925e5ddaf9113477b0ed542df258d84fa28fc45ceea7", size = 2904604, upload-time = "2023-09-07T14:05:02.348Z" }, - { url = "https://files.pythonhosted.org/packages/a4/bd/cfaac88c14f97d9e1f2e51a304c3573858548bb923d011b19f76b295f81c/Brotli-1.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a743e5a28af5f70f9c080380a5f908d4d21d40e8f0e0c8901604d15cfa9ba751", size = 2941707, upload-time = "2023-09-07T14:05:04.639Z" }, - { url = "https://files.pythonhosted.org/packages/60/3f/2618fa887d7af6828246822f10d9927244dab22db7a96ec56041a2fd1fbd/Brotli-1.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0541e747cce78e24ea12d69176f6a7ddb690e62c425e01d31cc065e69ce55b48", size = 2672420, upload-time = "2023-09-07T14:05:06.709Z" }, - { url = "https://files.pythonhosted.org/packages/e7/41/1c6d15c8d5b55db2c3c249c64c352c8a1bc97f5e5c55183f5930866fc012/Brotli-1.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cdbc1fc1bc0bff1cef838eafe581b55bfbffaed4ed0318b724d0b71d4d377619", size = 2757410, upload-time = "2023-09-07T14:05:09.28Z" }, - { url = "https://files.pythonhosted.org/packages/6c/5b/ca72fd8aa1278dfbb12eb320b6e409aefabcd767b85d607c9d54c9dadd1a/Brotli-1.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:890b5a14ce214389b2cc36ce82f3093f96f4cc730c1cffdbefff77a7c71f2a97", size = 2911143, upload-time = "2023-09-07T14:05:11.737Z" }, - { url = "https://files.pythonhosted.org/packages/b1/53/110657f4017d34a2e9a96d9630a388ad7e56092023f1d46d11648c6c0bce/Brotli-1.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ab4fbee0b2d9098c74f3057b2bc055a8bd92ccf02f65944a241b4349229185a", size = 2809968, upload-time = "2023-09-07T14:05:13.351Z" }, - { url = "https://files.pythonhosted.org/packages/3f/2a/fbc95429b45e4aa4a3a3a815e4af11772bfd8ef94e883dcff9ceaf556662/Brotli-1.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:141bd4d93984070e097521ed07e2575b46f817d08f9fa42b16b9b5f27b5ac088", size = 2935402, upload-time = "2023-09-07T14:05:15.039Z" }, - { url = "https://files.pythonhosted.org/packages/4e/52/02acd2992e5a2c10adf65fa920fad0c29e11e110f95eeb11bcb20342ecd2/Brotli-1.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fce1473f3ccc4187f75b4690cfc922628aed4d3dd013d047f95a9b3919a86596", size = 2931208, upload-time = "2023-09-07T14:05:16.747Z" }, - { url = "https://files.pythonhosted.org/packages/6b/35/5d258d1aeb407e1fc6fcbbff463af9c64d1ecc17042625f703a1e9d22ec5/Brotli-1.1.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d2b35ca2c7f81d173d2fadc2f4f31e88cc5f7a39ae5b6db5513cf3383b0e0ec7", size = 2933171, upload-time = "2024-10-18T12:33:10.342Z" }, - { url = "https://files.pythonhosted.org/packages/cc/58/b25ca26492da9880e517753967685903c6002ddc2aade93d6e56df817b30/Brotli-1.1.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:af6fa6817889314555aede9a919612b23739395ce767fe7fcbea9a80bf140fe5", size = 2845347, upload-time = "2024-10-18T12:33:12.367Z" }, - { url = "https://files.pythonhosted.org/packages/12/cf/91b84beaa051c9376a22cc38122dc6fbb63abcebd5a4b8503e9c388de7b1/Brotli-1.1.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:2feb1d960f760a575dbc5ab3b1c00504b24caaf6986e2dc2b01c09c87866a943", size = 3031668, upload-time = "2024-10-18T12:33:14.347Z" }, - { url = "https://files.pythonhosted.org/packages/38/05/04a57ba75aed972be0c6ad5f2f5ea34c83f5fecf57787cc6e54aac21a323/Brotli-1.1.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4410f84b33374409552ac9b6903507cdb31cd30d2501fc5ca13d18f73548444a", size = 2926949, upload-time = "2024-10-18T12:33:15.988Z" }, - { url = "https://files.pythonhosted.org/packages/c9/2f/fbe6938f33d2cd9b7d7fb591991eb3fb57ffa40416bb873bbbacab60a381/Brotli-1.1.0-cp38-cp38-win32.whl", hash = "sha256:db85ecf4e609a48f4b29055f1e144231b90edc90af7481aa731ba2d059226b1b", size = 333179, upload-time = "2023-09-07T14:05:18.343Z" }, - { url = "https://files.pythonhosted.org/packages/39/a5/9322c8436072e77b8646f6bde5e19ee66f62acf7aa01337ded10777077fa/Brotli-1.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3d7954194c36e304e1523f55d7042c59dc53ec20dd4e9ea9d151f1b62b4415c0", size = 357254, upload-time = "2023-09-07T14:05:19.792Z" }, { url = "https://files.pythonhosted.org/packages/1b/aa/aa6e0c9848ee4375514af0b27abf470904992939b7363ae78fc8aca8a9a8/Brotli-1.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5fb2ce4b8045c78ebbc7b8f3c15062e435d47e7393cc57c25115cfd49883747a", size = 873048, upload-time = "2023-09-07T14:05:21.205Z" }, { url = "https://files.pythonhosted.org/packages/ae/32/38bba1a8bef9ecb1cda08439fd28d7e9c51aff13b4783a4f1610da90b6c2/Brotli-1.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7905193081db9bfa73b1219140b3d315831cbff0d8941f22da695832f0dd188f", size = 446207, upload-time = "2023-09-07T14:05:23.21Z" }, { url = "https://files.pythonhosted.org/packages/3c/6a/14cc20ddc53efc274601c8195791a27cfb7acc5e5134e0f8c493a8b8821a/Brotli-1.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a77def80806c421b4b0af06f45d65a136e7ac0bdca3c09d9e2ea4e515367c7e9", size = 2903803, upload-time = "2023-09-07T14:05:24.864Z" }, @@ -126,7 +110,7 @@ name = "brotlicffi" version = "1.1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi", marker = "implementation_name != 'cpython'" }, + { name = "cffi" }, ] sdist = { url = "https://files.pythonhosted.org/packages/95/9d/70caa61192f570fcf0352766331b735afa931b4c6bc9a348a0925cc13288/brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13", size = 465192, upload-time = "2023-09-14T14:22:40.707Z" } wheels = [ @@ -141,11 +125,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/95/15aa422aa6450e6556e54a5fd1650ff59f470aed77ac739aa90ab63dc611/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54a07bb2374a1eba8ebb52b6fafffa2afd3c4df85ddd38fcc0511f2bb387c2a8", size = 378635, upload-time = "2023-09-14T14:22:11.982Z" }, { url = "https://files.pythonhosted.org/packages/6c/a7/f254e13b2cb43337d6d99a4ec10394c134e41bfda8a2eff15b75627f4a3d/brotlicffi-1.1.0.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7901a7dc4b88f1c1475de59ae9be59799db1007b7d059817948d8e4f12e24e35", size = 385719, upload-time = "2023-09-14T14:22:13.483Z" }, { url = "https://files.pythonhosted.org/packages/72/a9/0971251c4427c14b2a827dba3d910d4d3330dabf23d4278bf6d06a978847/brotlicffi-1.1.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce01c7316aebc7fce59da734286148b1d1b9455f89cf2c8a4dfce7d41db55c2d", size = 361760, upload-time = "2023-09-14T14:22:14.767Z" }, - { url = "https://files.pythonhosted.org/packages/75/ff/e227f8547f5ef11d861abae091d5dc012c2b1eb2e7358eff429fafbd608e/brotlicffi-1.1.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9b6068e0f3769992d6b622a1cd2e7835eae3cf8d9da123d7f51ca9c1e9c333e5", size = 397391, upload-time = "2023-09-14T14:22:23.595Z" }, - { url = "https://files.pythonhosted.org/packages/85/2d/9e8057f9c73c29090ce885fe2a133c17082ce2aa0712c533a52a5aeb042f/brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8557a8559509b61e65083f8782329188a250102372576093c88930c875a69838", size = 379693, upload-time = "2023-09-14T14:22:25.618Z" }, - { url = "https://files.pythonhosted.org/packages/50/22/62b4bf874a0be46e79bb46db4e52533f757d85107ee0cdfcc800314e865f/brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a7ae37e5d79c5bdfb5b4b99f2715a6035e6c5bf538c3746abc8e26694f92f33", size = 378627, upload-time = "2023-09-14T14:22:27.527Z" }, - { url = "https://files.pythonhosted.org/packages/ff/cb/648a47cd457a3afe3bacdfcd62e89fde6666be503d06403a6c2f157b7d61/brotlicffi-1.1.0.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391151ec86bb1c683835980f4816272a87eaddc46bb91cbf44f62228b84d8cca", size = 385712, upload-time = "2023-09-14T14:22:28.835Z" }, - { url = "https://files.pythonhosted.org/packages/4b/df/d81660ba62bb54cefd6e95d5315710a8871ebf0872a4bd61b13388181742/brotlicffi-1.1.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2f3711be9290f0453de8eed5275d93d286abe26b08ab4a35d7452caa1fef532f", size = 361750, upload-time = "2023-09-14T14:22:30.772Z" }, { url = "https://files.pythonhosted.org/packages/35/9b/e0b577351e1d9d5890e1a56900c4ceaaef783b807145cd229446a43cf437/brotlicffi-1.1.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a807d760763e398bbf2c6394ae9da5815901aa93ee0a37bca5efe78d4ee3171", size = 397392, upload-time = "2023-09-14T14:22:32.2Z" }, { url = "https://files.pythonhosted.org/packages/4f/7f/a16534d28386f74781db8b4544a764cf955abae336379a76f50e745bb0ee/brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa8ca0623b26c94fccc3a1fdd895be1743b838f3917300506d04aa3346fd2a14", size = 379695, upload-time = "2023-09-14T14:22:33.85Z" }, { url = "https://files.pythonhosted.org/packages/50/2a/699388b5e489726991132441b55aff0691dd73c49105ef220408a5ab98d6/brotlicffi-1.1.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de0cf28a53a3238b252aca9fed1593e9d36c1d116748013339f0949bfc84112", size = 378629, upload-time = "2023-09-14T14:22:35.9Z" }, @@ -307,14 +286,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/47/d7145bf2dc04684935d57d67dff9d6d795b2ba2796806bb109864be3a151/cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9", size = 488469, upload-time = "2024-09-04T20:44:41.616Z" }, { url = "https://files.pythonhosted.org/packages/bf/ee/f94057fa6426481d663b88637a9a10e859e492c73d0384514a17d78ee205/cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d", size = 172475, upload-time = "2024-09-04T20:44:43.733Z" }, { url = "https://files.pythonhosted.org/packages/7c/fc/6a8cb64e5f0324877d503c854da15d76c1e50eb722e320b15345c4d0c6de/cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a", size = 182009, upload-time = "2024-09-04T20:44:45.309Z" }, - { url = "https://files.pythonhosted.org/packages/48/08/15bf6b43ae9bd06f6b00ad8a91f5a8fe1069d4c9fab550a866755402724e/cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b", size = 182457, upload-time = "2024-09-04T20:44:47.892Z" }, - { url = "https://files.pythonhosted.org/packages/c2/5b/f1523dd545f92f7df468e5f653ffa4df30ac222f3c884e51e139878f1cb5/cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964", size = 425932, upload-time = "2024-09-04T20:44:49.491Z" }, - { url = "https://files.pythonhosted.org/packages/53/93/7e547ab4105969cc8c93b38a667b82a835dd2cc78f3a7dad6130cfd41e1d/cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9", size = 448585, upload-time = "2024-09-04T20:44:51.671Z" }, - { url = "https://files.pythonhosted.org/packages/56/c4/a308f2c332006206bb511de219efeff090e9d63529ba0a77aae72e82248b/cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc", size = 456268, upload-time = "2024-09-04T20:44:53.51Z" }, - { url = "https://files.pythonhosted.org/packages/ca/5b/b63681518265f2f4060d2b60755c1c77ec89e5e045fc3773b72735ddaad5/cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c", size = 436592, upload-time = "2024-09-04T20:44:55.085Z" }, - { url = "https://files.pythonhosted.org/packages/bb/19/b51af9f4a4faa4a8ac5a0e5d5c2522dcd9703d07fac69da34a36c4d960d3/cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1", size = 446512, upload-time = "2024-09-04T20:44:57.135Z" }, - { url = "https://files.pythonhosted.org/packages/e2/63/2bed8323890cb613bbecda807688a31ed11a7fe7afe31f8faaae0206a9a3/cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8", size = 171576, upload-time = "2024-09-04T20:44:58.535Z" }, - { url = "https://files.pythonhosted.org/packages/2f/70/80c33b044ebc79527447fd4fbc5455d514c3bb840dede4455de97da39b4d/cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1", size = 181229, upload-time = "2024-09-04T20:44:59.963Z" }, { url = "https://files.pythonhosted.org/packages/b9/ea/8bb50596b8ffbc49ddd7a1ad305035daa770202a6b782fc164647c2673ad/cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16", size = 182220, upload-time = "2024-09-04T20:45:01.577Z" }, { url = "https://files.pythonhosted.org/packages/ae/11/e77c8cd24f58285a82c23af484cf5b124a376b32644e445960d1a4654c3a/cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36", size = 178605, upload-time = "2024-09-04T20:45:03.837Z" }, { url = "https://files.pythonhosted.org/packages/ed/65/25a8dc32c53bf5b7b6c2686b42ae2ad58743f7ff644844af7cdb29b49361/cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8", size = 424910, upload-time = "2024-09-04T20:45:05.315Z" }, @@ -387,19 +358,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e4/93/946a86ce20790e11312c87c75ba68d5f6ad2208cfb52b2d6a2c32840d922/charset_normalizer-3.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd", size = 145732, upload-time = "2024-12-24T18:11:22.774Z" }, { url = "https://files.pythonhosted.org/packages/cd/e5/131d2fb1b0dddafc37be4f3a2fa79aa4c037368be9423061dccadfd90091/charset_normalizer-3.4.1-cp313-cp313-win32.whl", hash = "sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407", size = 95391, upload-time = "2024-12-24T18:11:24.139Z" }, { url = "https://files.pythonhosted.org/packages/27/f2/4f9a69cc7712b9b5ad8fdb87039fd89abba997ad5cbe690d1835d40405b0/charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971", size = 102702, upload-time = "2024-12-24T18:11:26.535Z" }, - { url = "https://files.pythonhosted.org/packages/10/bd/6517ea94f2672e801011d50b5d06be2a0deaf566aea27bcdcd47e5195357/charset_normalizer-3.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c", size = 195653, upload-time = "2024-12-24T18:11:45.568Z" }, - { url = "https://files.pythonhosted.org/packages/e5/0d/815a2ba3f283b4eeaa5ece57acade365c5b4135f65a807a083c818716582/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9", size = 140701, upload-time = "2024-12-24T18:11:46.968Z" }, - { url = "https://files.pythonhosted.org/packages/aa/17/c94be7ee0d142687e047fe1de72060f6d6837f40eedc26e87e6e124a3fc6/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8", size = 150495, upload-time = "2024-12-24T18:11:48.375Z" }, - { url = "https://files.pythonhosted.org/packages/f7/33/557ac796c47165fc141e4fb71d7b0310f67e05cb420756f3a82e0a0068e0/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6", size = 142946, upload-time = "2024-12-24T18:11:53.619Z" }, - { url = "https://files.pythonhosted.org/packages/1e/0d/38ef4ae41e9248d63fc4998d933cae22473b1b2ac4122cf908d0f5eb32aa/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c", size = 144737, upload-time = "2024-12-24T18:11:54.993Z" }, - { url = "https://files.pythonhosted.org/packages/43/01/754cdb29dd0560f58290aaaa284d43eea343ad0512e6ad3b8b5c11f08592/charset_normalizer-3.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a", size = 147471, upload-time = "2024-12-24T18:11:58.169Z" }, - { url = "https://files.pythonhosted.org/packages/ba/cd/861883ba5160c7a9bd242c30b2c71074cda2aefcc0addc91118e0d4e0765/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd", size = 140801, upload-time = "2024-12-24T18:12:01.02Z" }, - { url = "https://files.pythonhosted.org/packages/6f/7f/0c0dad447819e90b93f8ed238cc8f11b91353c23c19e70fa80483a155bed/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd", size = 149312, upload-time = "2024-12-24T18:12:02.267Z" }, - { url = "https://files.pythonhosted.org/packages/8e/09/9f8abcc6fff60fb727268b63c376c8c79cc37b833c2dfe1f535dfb59523b/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824", size = 152347, upload-time = "2024-12-24T18:12:04.145Z" }, - { url = "https://files.pythonhosted.org/packages/be/e5/3f363dad2e24378f88ccf63ecc39e817c29f32e308ef21a7a6d9c1201165/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca", size = 149888, upload-time = "2024-12-24T18:12:05.673Z" }, - { url = "https://files.pythonhosted.org/packages/e4/10/a78c0e91f487b4ad0ef7480ac765e15b774f83de2597f1b6ef0eaf7a2f99/charset_normalizer-3.4.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b", size = 145169, upload-time = "2024-12-24T18:12:06.846Z" }, - { url = "https://files.pythonhosted.org/packages/d3/81/396e7d7f5d7420da8273c91175d2e9a3f569288e3611d521685e4b9ac9cc/charset_normalizer-3.4.1-cp38-cp38-win32.whl", hash = "sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e", size = 95094, upload-time = "2024-12-24T18:12:08.048Z" }, - { url = "https://files.pythonhosted.org/packages/40/bb/20affbbd9ea29c71ea123769dc568a6d42052ff5089c5fe23e21e21084a6/charset_normalizer-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4", size = 102139, upload-time = "2024-12-24T18:12:09.161Z" }, { url = "https://files.pythonhosted.org/packages/7f/c0/b913f8f02836ed9ab32ea643c6fe4d3325c3d8627cf6e78098671cafff86/charset_normalizer-3.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41", size = 197867, upload-time = "2024-12-24T18:12:10.438Z" }, { url = "https://files.pythonhosted.org/packages/0f/6c/2bee440303d705b6fb1e2ec789543edec83d32d258299b16eed28aad48e0/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f", size = 141385, upload-time = "2024-12-24T18:12:11.847Z" }, { url = "https://files.pythonhosted.org/packages/3d/04/cb42585f07f6f9fd3219ffb6f37d5a39b4fd2db2355b23683060029c35f7/charset_normalizer-3.4.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2", size = 151367, upload-time = "2024-12-24T18:12:13.177Z" }, @@ -458,10 +416,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e8/5c/9e47aac90fb5923d09c413909af6bf6ad4af2bfeeff707a2485c3f2af8be/cryptography-39.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d7d84a512a59f4412ca8549b01f94be4161c94efc598bf09d027d67826beddc0", size = 4264946, upload-time = "2023-03-02T21:08:03.191Z" }, { url = "https://files.pythonhosted.org/packages/0b/20/f406c0d2edb7b3974c39099816795d629bbd8716cd41cda8c3c4703de095/cryptography-39.0.2-cp36-abi3-win32.whl", hash = "sha256:c43ac224aabcbf83a947eeb8b17eaf1547bce3767ee2d70093b461f31729a480", size = 2086555, upload-time = "2023-03-02T21:08:06.829Z" }, { url = "https://files.pythonhosted.org/packages/4f/0e/55b8cff87b572da828e9c6b7e7c5ecb9dc955b551ab863c28464a15f6451/cryptography-39.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:788b3921d763ee35dfdb04248d0e3de11e3ca8eb22e2e48fef880c42e1f3c8f9", size = 2454252, upload-time = "2023-03-02T21:08:11.324Z" }, - { url = "https://files.pythonhosted.org/packages/6d/5b/516dc11fa0a638cb707293ad44cc1cb93924bb4b5ba03881dfdb819e23b0/cryptography-39.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d15809e0dbdad486f4ad0979753518f47980020b7a34e9fc56e8be4f60702fac", size = 2739328, upload-time = "2023-03-02T21:06:47.793Z" }, - { url = "https://files.pythonhosted.org/packages/77/19/47d55b3f609fc03b6f80c63820996671dfccb28e1d07427dd81319d514d5/cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:50cadb9b2f961757e712a9737ef33d89b8190c3ea34d0fb6675e00edbe35d074", size = 3285152, upload-time = "2023-03-02T21:28:00.219Z" }, - { url = "https://files.pythonhosted.org/packages/1e/85/d5b768b45e564a66fc5ba6344145334208f01d64939adcb8c4032545d164/cryptography-39.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:103e8f7155f3ce2ffa0049fe60169878d47a4364b277906386f8de21c9234aa1", size = 3468218, upload-time = "2023-03-02T21:28:07.497Z" }, - { url = "https://files.pythonhosted.org/packages/9e/a0/4c0c8b827f430246b48a0f2415a432427d365c77b04a911c5139ae9c79b1/cryptography-39.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6236a9610c912b129610eb1a274bdc1350b5df834d124fa84729ebeaf7da42c3", size = 2359592, upload-time = "2023-03-02T21:08:16.067Z" }, { url = "https://files.pythonhosted.org/packages/06/ea/d998c35ac871396749f14a4b4e82f080422182e1830f02959e893abd3016/cryptography-39.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e944fe07b6f229f4c1a06a7ef906a19652bdd9fd54c761b0ff87e83ae7a30354", size = 2739320, upload-time = "2023-03-02T21:06:53.936Z" }, { url = "https://files.pythonhosted.org/packages/c6/c4/354c78bade0270f951a14db9f8248ab975ca7df050476dc3759831e52a7f/cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:35d658536b0a4117c885728d1a7032bdc9a5974722ae298d6c533755a6ee3915", size = 3130600, upload-time = "2023-03-02T21:07:06.275Z" }, { url = "https://files.pythonhosted.org/packages/13/40/acfea5abe60f483bfba44a24419fa89d584e1e93dca750b800805ef272b9/cryptography-39.0.2-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:30b1d1bfd00f6fc80d11300a29f1d8ab2b8d9febb6ed4a38a76880ec564fae84", size = 3523507, upload-time = "2023-03-02T21:07:16.904Z" }, @@ -646,16 +600,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/07/2dc76aa51b481eb96a4c3198894f38b480490e834479611a4053fbf08623/MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169", size = 33038, upload-time = "2024-02-02T16:30:42.243Z" }, { url = "https://files.pythonhosted.org/packages/96/0c/620c1fb3661858c0e37eb3cbffd8c6f732a67cd97296f725789679801b31/MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad", size = 16572, upload-time = "2024-02-02T16:30:43.326Z" }, { url = "https://files.pythonhosted.org/packages/3f/14/c3554d512d5f9100a95e737502f4a2323a1959f6d0d01e0d0997b35f7b10/MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb", size = 17127, upload-time = "2024-02-02T16:30:44.418Z" }, - { url = "https://files.pythonhosted.org/packages/f8/ff/2c942a82c35a49df5de3a630ce0a8456ac2969691b230e530ac12314364c/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a", size = 18192, upload-time = "2024-02-02T16:30:57.715Z" }, - { url = "https://files.pythonhosted.org/packages/4f/14/6f294b9c4f969d0c801a4615e221c1e084722ea6114ab2114189c5b8cbe0/MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46", size = 14072, upload-time = "2024-02-02T16:30:58.844Z" }, - { url = "https://files.pythonhosted.org/packages/81/d4/fd74714ed30a1dedd0b82427c02fa4deec64f173831ec716da11c51a50aa/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532", size = 26928, upload-time = "2024-02-02T16:30:59.922Z" }, - { url = "https://files.pythonhosted.org/packages/c7/bd/50319665ce81bb10e90d1cf76f9e1aa269ea6f7fa30ab4521f14d122a3df/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab", size = 26106, upload-time = "2024-02-02T16:31:01.582Z" }, - { url = "https://files.pythonhosted.org/packages/4c/6f/f2b0f675635b05f6afd5ea03c094557bdb8622fa8e673387444fe8d8e787/MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68", size = 25781, upload-time = "2024-02-02T16:31:02.71Z" }, - { url = "https://files.pythonhosted.org/packages/51/e0/393467cf899b34a9d3678e78961c2c8cdf49fb902a959ba54ece01273fb1/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0", size = 30518, upload-time = "2024-02-02T16:31:04.392Z" }, - { url = "https://files.pythonhosted.org/packages/f6/02/5437e2ad33047290dafced9df741d9efc3e716b75583bbd73a9984f1b6f7/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4", size = 29669, upload-time = "2024-02-02T16:31:05.53Z" }, - { url = "https://files.pythonhosted.org/packages/0e/7d/968284145ffd9d726183ed6237c77938c021abacde4e073020f920e060b2/MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3", size = 29933, upload-time = "2024-02-02T16:31:06.636Z" }, - { url = "https://files.pythonhosted.org/packages/bf/f3/ecb00fc8ab02b7beae8699f34db9357ae49d9f21d4d3de6f305f34fa949e/MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff", size = 16656, upload-time = "2024-02-02T16:31:07.767Z" }, - { url = "https://files.pythonhosted.org/packages/92/21/357205f03514a49b293e214ac39de01fadd0970a6e05e4bf1ddd0ffd0881/MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029", size = 17206, upload-time = "2024-02-02T16:31:08.843Z" }, { url = "https://files.pythonhosted.org/packages/0f/31/780bb297db036ba7b7bbede5e1d7f1e14d704ad4beb3ce53fb495d22bc62/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf", size = 18193, upload-time = "2024-02-02T16:31:10.155Z" }, { url = "https://files.pythonhosted.org/packages/6c/77/d77701bbef72892affe060cdacb7a2ed7fd68dae3b477a8642f15ad3b132/MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2", size = 14073, upload-time = "2024-02-02T16:31:11.442Z" }, { url = "https://files.pythonhosted.org/packages/d9/a7/1e558b4f78454c8a3a0199292d96159eb4d091f983bc35ef258314fe7269/MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8", size = 26486, upload-time = "2024-02-02T16:31:12.488Z" }, @@ -736,16 +680,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/f9/cfaa5082ca9bc4a6de66ffe1c12c2d90bf09c309a5f52b27759a596900e7/pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060", size = 2235603, upload-time = "2024-07-01T09:47:03.918Z" }, { url = "https://files.pythonhosted.org/packages/01/6a/30ff0eef6e0c0e71e55ded56a38d4859bf9d3634a94a88743897b5f96936/pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea", size = 2554972, upload-time = "2024-07-01T09:47:06.152Z" }, { url = "https://files.pythonhosted.org/packages/48/2c/2e0a52890f269435eee38b21c8218e102c621fe8d8df8b9dd06fabf879ba/pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d", size = 2243375, upload-time = "2024-07-01T09:47:09.065Z" }, - { url = "https://files.pythonhosted.org/packages/56/70/f40009702a477ce87d8d9faaa4de51d6562b3445d7a314accd06e4ffb01d/pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736", size = 3509213, upload-time = "2024-07-01T09:47:11.662Z" }, - { url = "https://files.pythonhosted.org/packages/10/43/105823d233c5e5d31cea13428f4474ded9d961652307800979a59d6a4276/pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b", size = 3375883, upload-time = "2024-07-01T09:47:14.453Z" }, - { url = "https://files.pythonhosted.org/packages/3c/ad/7850c10bac468a20c918f6a5dbba9ecd106ea1cdc5db3c35e33a60570408/pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2", size = 4330810, upload-time = "2024-07-01T09:47:16.695Z" }, - { url = "https://files.pythonhosted.org/packages/84/4c/69bbed9e436ac22f9ed193a2b64f64d68fcfbc9f4106249dc7ed4889907b/pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680", size = 4444341, upload-time = "2024-07-01T09:47:19.334Z" }, - { url = "https://files.pythonhosted.org/packages/8f/4f/c183c63828a3f37bf09644ce94cbf72d4929b033b109160a5379c2885932/pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b", size = 4356005, upload-time = "2024-07-01T09:47:21.805Z" }, - { url = "https://files.pythonhosted.org/packages/fb/ad/435fe29865f98a8fbdc64add8875a6e4f8c97749a93577a8919ec6f32c64/pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd", size = 4525201, upload-time = "2024-07-01T09:47:24.457Z" }, - { url = "https://files.pythonhosted.org/packages/80/74/be8bf8acdfd70e91f905a12ae13cfb2e17c0f1da745c40141e26d0971ff5/pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84", size = 4460635, upload-time = "2024-07-01T09:47:26.841Z" }, - { url = "https://files.pythonhosted.org/packages/e4/90/763616e66dc9ad59c9b7fb58f863755e7934ef122e52349f62c7742b82d3/pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0", size = 4590283, upload-time = "2024-07-01T09:47:29.247Z" }, - { url = "https://files.pythonhosted.org/packages/69/66/03002cb5b2c27bb519cba63b9f9aa3709c6f7a5d3b285406c01f03fb77e5/pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e", size = 2235185, upload-time = "2024-07-01T09:47:32.205Z" }, - { url = "https://files.pythonhosted.org/packages/f2/75/3cb820b2812405fc7feb3d0deb701ef0c3de93dc02597115e00704591bc9/pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab", size = 2554594, upload-time = "2024-07-01T09:47:34.285Z" }, { url = "https://files.pythonhosted.org/packages/31/85/955fa5400fa8039921f630372cfe5056eed6e1b8e0430ee4507d7de48832/pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d", size = 3509283, upload-time = "2024-07-01T09:47:36.394Z" }, { url = "https://files.pythonhosted.org/packages/23/9c/343827267eb28d41cd82b4180d33b10d868af9077abcec0af9793aa77d2d/pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b", size = 3375691, upload-time = "2024-07-01T09:47:38.853Z" }, { url = "https://files.pythonhosted.org/packages/60/a3/7ebbeabcd341eab722896d1a5b59a3df98c4b4d26cf4b0385f8aa94296f7/pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd", size = 4328295, upload-time = "2024-07-01T09:47:41.765Z" }, @@ -921,13 +855,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fe/0f/25911a9f080464c59fab9027482f822b86bf0608957a5fcc6eaac85aa515/PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652", size = 751597, upload-time = "2024-08-06T20:32:56.985Z" }, { url = "https://files.pythonhosted.org/packages/14/0d/e2c3b43bbce3cf6bd97c840b46088a3031085179e596d4929729d8d68270/PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183", size = 140527, upload-time = "2024-08-06T20:33:03.001Z" }, { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, - { url = "https://files.pythonhosted.org/packages/74/d9/323a59d506f12f498c2097488d80d16f4cf965cee1791eab58b56b19f47a/PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a", size = 183218, upload-time = "2024-08-06T20:33:06.411Z" }, - { url = "https://files.pythonhosted.org/packages/74/cc/20c34d00f04d785f2028737e2e2a8254e1425102e730fee1d6396f832577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5", size = 728067, upload-time = "2024-08-06T20:33:07.879Z" }, - { url = "https://files.pythonhosted.org/packages/20/52/551c69ca1501d21c0de51ddafa8c23a0191ef296ff098e98358f69080577/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d", size = 757812, upload-time = "2024-08-06T20:33:12.542Z" }, - { url = "https://files.pythonhosted.org/packages/fd/7f/2c3697bba5d4aa5cc2afe81826d73dfae5f049458e44732c7a0938baa673/PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083", size = 746531, upload-time = "2024-08-06T20:33:14.391Z" }, - { url = "https://files.pythonhosted.org/packages/8c/ab/6226d3df99900e580091bb44258fde77a8433511a86883bd4681ea19a858/PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706", size = 800820, upload-time = "2024-08-06T20:33:16.586Z" }, - { url = "https://files.pythonhosted.org/packages/a0/99/a9eb0f3e710c06c5d922026f6736e920d431812ace24aae38228d0d64b04/PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a", size = 145514, upload-time = "2024-08-06T20:33:22.414Z" }, - { url = "https://files.pythonhosted.org/packages/75/8a/ee831ad5fafa4431099aa4e078d4c8efd43cd5e48fbc774641d233b683a9/PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff", size = 162702, upload-time = "2024-08-06T20:33:23.813Z" }, { url = "https://files.pythonhosted.org/packages/65/d8/b7a1db13636d7fb7d4ff431593c510c8b8fca920ade06ca8ef20015493c5/PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d", size = 184777, upload-time = "2024-08-06T20:33:25.896Z" }, { url = "https://files.pythonhosted.org/packages/0a/02/6ec546cd45143fdf9840b2c6be8d875116a64076218b61d68e12548e5839/PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f", size = 172318, upload-time = "2024-08-06T20:33:27.212Z" }, { url = "https://files.pythonhosted.org/packages/0e/9a/8cc68be846c972bda34f6c2a93abb644fb2476f4dcc924d52175786932c9/PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290", size = 720891, upload-time = "2024-08-06T20:33:28.974Z" }, @@ -1203,17 +1130,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a1/6e/66b6b756aebbd680b934c8bdbb6dcb9ce45aad72cde5f8a7208dbb00dd36/websockets-13.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:70c5be9f416aa72aab7a2a76c90ae0a4fe2755c1816c153c1a2bcc3333ce4ce6", size = 164732, upload-time = "2024-09-21T17:33:23.103Z" }, { url = "https://files.pythonhosted.org/packages/35/c6/12e3aab52c11aeb289e3dbbc05929e7a9d90d7a9173958477d3ef4f8ce2d/websockets-13.1-cp313-cp313-win32.whl", hash = "sha256:624459daabeb310d3815b276c1adef475b3e6804abaf2d9d2c061c319f7f187d", size = 158709, upload-time = "2024-09-21T17:33:24.196Z" }, { url = "https://files.pythonhosted.org/packages/41/d8/63d6194aae711d7263df4498200c690a9c39fb437ede10f3e157a6343e0d/websockets-13.1-cp313-cp313-win_amd64.whl", hash = "sha256:c518e84bb59c2baae725accd355c8dc517b4a3ed8db88b4bc93c78dae2974bf2", size = 159144, upload-time = "2024-09-21T17:33:25.96Z" }, - { url = "https://files.pythonhosted.org/packages/83/69/59872420e5bce60db166d6fba39ee24c719d339fb0ae48cb2ce580129882/websockets-13.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c7934fd0e920e70468e676fe7f1b7261c1efa0d6c037c6722278ca0228ad9d0d", size = 157811, upload-time = "2024-09-21T17:33:27.379Z" }, - { url = "https://files.pythonhosted.org/packages/bb/f7/0610032e0d3981758fdd6ee7c68cc02ebf668a762c5178d3d91748228849/websockets-13.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:149e622dc48c10ccc3d2760e5f36753db9cacf3ad7bc7bbbfd7d9c819e286f23", size = 155471, upload-time = "2024-09-21T17:33:28.473Z" }, - { url = "https://files.pythonhosted.org/packages/55/2f/c43173a72ea395263a427a36d25bce2675f41c809424466a13c61a9a2d61/websockets-13.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a569eb1b05d72f9bce2ebd28a1ce2054311b66677fcd46cf36204ad23acead8c", size = 155713, upload-time = "2024-09-21T17:33:29.795Z" }, - { url = "https://files.pythonhosted.org/packages/92/7e/8fa930c6426a56c47910792717787640329e4a0e37cdfda20cf89da67126/websockets-13.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:95df24ca1e1bd93bbca51d94dd049a984609687cb2fb08a7f2c56ac84e9816ea", size = 164995, upload-time = "2024-09-21T17:33:30.802Z" }, - { url = "https://files.pythonhosted.org/packages/27/29/50ed4c68a3f606565a2db4b13948ae7b6f6c53aa9f8f258d92be6698d276/websockets-13.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d8dbb1bf0c0a4ae8b40bdc9be7f644e2f3fb4e8a9aca7145bfa510d4a374eeb7", size = 164057, upload-time = "2024-09-21T17:33:31.862Z" }, - { url = "https://files.pythonhosted.org/packages/3c/0e/60da63b1c53c47f389f79312b3356cb305600ffad1274d7ec473128d4e6b/websockets-13.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:035233b7531fb92a76beefcbf479504db8c72eb3bff41da55aecce3a0f729e54", size = 164340, upload-time = "2024-09-21T17:33:33.022Z" }, - { url = "https://files.pythonhosted.org/packages/20/ef/d87c5fc0aa7fafad1d584b6459ddfe062edf0d0dd64800a02e67e5de048b/websockets-13.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:e4450fc83a3df53dec45922b576e91e94f5578d06436871dce3a6be38e40f5db", size = 164222, upload-time = "2024-09-21T17:33:34.423Z" }, - { url = "https://files.pythonhosted.org/packages/f2/c4/7916e1f6b5252d3dcb9121b67d7fdbb2d9bf5067a6d8c88885ba27a9e69c/websockets-13.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:463e1c6ec853202dd3657f156123d6b4dad0c546ea2e2e38be2b3f7c5b8e7295", size = 163647, upload-time = "2024-09-21T17:33:35.841Z" }, - { url = "https://files.pythonhosted.org/packages/de/df/2ebebb807f10993c35c10cbd3628a7944b66bd5fb6632a561f8666f3a68e/websockets-13.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6d6855bbe70119872c05107e38fbc7f96b1d8cb047d95c2c50869a46c65a8e96", size = 163590, upload-time = "2024-09-21T17:33:37.61Z" }, - { url = "https://files.pythonhosted.org/packages/b5/82/d48911f56bb993c11099a1ff1d4041d9d1481d50271100e8ee62bc28f365/websockets-13.1-cp38-cp38-win32.whl", hash = "sha256:204e5107f43095012b00f1451374693267adbb832d29966a01ecc4ce1db26faf", size = 158701, upload-time = "2024-09-21T17:33:38.695Z" }, - { url = "https://files.pythonhosted.org/packages/8b/b3/945aacb21fc89ad150403cbaa974c9e846f098f16d9f39a3dd6094f9beb1/websockets-13.1-cp38-cp38-win_amd64.whl", hash = "sha256:485307243237328c022bc908b90e4457d0daa8b5cf4b3723fd3c4a8012fce4c6", size = 159146, upload-time = "2024-09-21T17:33:39.855Z" }, { url = "https://files.pythonhosted.org/packages/61/26/5f7a7fb03efedb4f90ed61968338bfe7c389863b0ceda239b94ae61c5ae4/websockets-13.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9b37c184f8b976f0c0a231a5f3d6efe10807d41ccbe4488df8c74174805eea7d", size = 157810, upload-time = "2024-09-21T17:33:40.94Z" }, { url = "https://files.pythonhosted.org/packages/0e/d4/9b4814a07dffaa7a79d71b4944d10836f9adbd527a113f6675734ef3abed/websockets-13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:163e7277e1a0bd9fb3c8842a71661ad19c6aa7bb3d6678dc7f89b17fbcc4aeb7", size = 155467, upload-time = "2024-09-21T17:33:42.075Z" }, { url = "https://files.pythonhosted.org/packages/1a/1a/2abdc7ce3b56429ae39d6bfb48d8c791f5a26bbcb6f44aabcf71ffc3fda2/websockets-13.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4b889dbd1342820cc210ba44307cf75ae5f2f96226c0038094455a96e64fb07a", size = 155714, upload-time = "2024-09-21T17:33:43.128Z" }, @@ -1231,12 +1147,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fc/f5/6652fb82440813822022a9301a30afde85e5ff3fb2aebb77f34aabe2b4e8/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcc03c8b72267e97b49149e4863d57c2d77f13fae12066622dc78fe322490fe6", size = 156701, upload-time = "2024-09-21T17:33:59.061Z" }, { url = "https://files.pythonhosted.org/packages/67/33/ae82a7b860fa8a08aba68818bdf7ff61f04598aa5ab96df4cd5a3e418ca4/websockets-13.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:004280a140f220c812e65f36944a9ca92d766b6cc4560be652a0a3883a79ed8a", size = 156654, upload-time = "2024-09-21T17:34:00.944Z" }, { url = "https://files.pythonhosted.org/packages/63/0b/a1b528d36934f833e20f6da1032b995bf093d55cb416b9f2266f229fb237/websockets-13.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:e2620453c075abeb0daa949a292e19f56de518988e079c36478bacf9546ced23", size = 159192, upload-time = "2024-09-21T17:34:02.656Z" }, - { url = "https://files.pythonhosted.org/packages/5e/a1/5ae6d0ef2e61e2b77b3b4678949a634756544186620a728799acdf5c3482/websockets-13.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9156c45750b37337f7b0b00e6248991a047be4aa44554c9886fe6bdd605aab3b", size = 155433, upload-time = "2024-09-21T17:34:03.88Z" }, - { url = "https://files.pythonhosted.org/packages/0d/2f/addd33f85600d210a445f817ff0d79d2b4d0eb6f3c95b9f35531ebf8f57c/websockets-13.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:80c421e07973a89fbdd93e6f2003c17d20b69010458d3a8e37fb47874bd67d51", size = 155733, upload-time = "2024-09-21T17:34:05.173Z" }, - { url = "https://files.pythonhosted.org/packages/74/0b/f8ec74ac3b14a983289a1b42dc2c518a0e2030b486d0549d4f51ca11e7c9/websockets-13.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82d0ba76371769d6a4e56f7e83bb8e81846d17a6190971e38b5de108bde9b0d7", size = 157093, upload-time = "2024-09-21T17:34:06.398Z" }, - { url = "https://files.pythonhosted.org/packages/ad/4c/aa5cc2f718ee4d797411202f332c8281f04c42d15f55b02f7713320f7a03/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9875a0143f07d74dc5e1ded1c4581f0d9f7ab86c78994e2ed9e95050073c94d", size = 156701, upload-time = "2024-09-21T17:34:07.582Z" }, - { url = "https://files.pythonhosted.org/packages/1f/4b/7c5b2d0d0f0f1a54f27c60107cf1f201bee1f88c5508f87408b470d09a9c/websockets-13.1-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a11e38ad8922c7961447f35c7b17bffa15de4d17c70abd07bfbe12d6faa3e027", size = 156648, upload-time = "2024-09-21T17:34:08.734Z" }, - { url = "https://files.pythonhosted.org/packages/f3/63/35f3fb073884a9fd1ce5413b2dcdf0d9198b03dac6274197111259cbde06/websockets-13.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:4059f790b6ae8768471cddb65d3c4fe4792b0ab48e154c9f0a04cefaabcd5978", size = 159188, upload-time = "2024-09-21T17:34:10.018Z" }, { url = "https://files.pythonhosted.org/packages/59/fd/e4bf9a7159dba6a16c59ae9e670e3e8ad9dcb6791bc0599eb86de32d50a9/websockets-13.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:25c35bf84bf7c7369d247f0b8cfa157f989862c49104c5cf85cb5436a641d93e", size = 155499, upload-time = "2024-09-21T17:34:11.3Z" }, { url = "https://files.pythonhosted.org/packages/74/42/d48ede93cfe0c343f3b552af08efc60778d234989227b16882eed1b8b189/websockets-13.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:83f91d8a9bb404b8c2c41a707ac7f7f75b9442a0a876df295de27251a856ad09", size = 155731, upload-time = "2024-09-21T17:34:13.151Z" }, { url = "https://files.pythonhosted.org/packages/f6/f2/2ef6bff1c90a43b80622a17c0852b48c09d3954ab169266ad7b15e17cdcb/websockets-13.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7a43cfdcddd07f4ca2b1afb459824dd3c6d53a51410636a2c7fc97b9a8cf4842", size = 157093, upload-time = "2024-09-21T17:34:14.52Z" }, From 8b20ea91bb60ae9dc2fe4ee3a68535030a942b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 12 Jun 2025 12:58:05 -0700 Subject: [PATCH 03/43] Move classifiers from setup.py --- pyproject.toml | 5 +++++ setup.py | 7 ------- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2ef3a64..c0fc947 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,13 @@ description = "Distributed web crawling with browsers" readme = "README.rst" requires-python = ">=3.9" classifiers = [ + "Development Status :: 5 - Production/Stable", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", "Operating System :: OS Independent", + "Environment :: Console", + "Topic :: Internet :: WWW/HTTP", + "Topic :: System :: Archiving", ] dependencies = [ "PyYAML>=5.1", diff --git a/setup.py b/setup.py index bd1b04b..c111310 100644 --- a/setup.py +++ b/setup.py @@ -46,11 +46,4 @@ setuptools.setup( "brozzler.dashboard": find_package_data("brozzler.dashboard"), }, zip_safe=False, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "Programming Language :: Python :: 3.9", - "Topic :: Internet :: WWW/HTTP", - "Topic :: System :: Archiving", - ], ) From bee01d32b826c959d1f05d5d98a35555be6c1854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 12 Jun 2025 13:07:21 -0700 Subject: [PATCH 04/43] deps: yt-dlp 2025.05.22 --- uv.lock | 60 +++++++++++++++++---------------------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/uv.lock b/uv.lock index f7c6d2b..5185a18 100644 --- a/uv.lock +++ b/uv.lock @@ -110,7 +110,7 @@ name = "brotlicffi" version = "1.1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "cffi" }, + { name = "cffi", marker = "implementation_name != 'cpython'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/95/9d/70caa61192f570fcf0352766331b735afa931b4c6bc9a348a0925cc13288/brotlicffi-1.1.0.0.tar.gz", hash = "sha256:b77827a689905143f87915310b93b273ab17888fd43ef350d4832c4a71083c13", size = 465192, upload-time = "2023-09-14T14:22:40.707Z" } wheels = [ @@ -165,7 +165,7 @@ warcprox = [ { name = "warcprox" }, ] yt-dlp = [ - { name = "yt-dlp", extra = ["curl-cffi"] }, + { name = "yt-dlp", extra = ["curl-cffi", "default"] }, ] [package.dev-dependencies] @@ -428,41 +428,18 @@ wheels = [ name = "curl-cffi" version = "0.5.10" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and implementation_name == 'cpython' and os_name == 'nt'", - "python_full_version < '3.12' and implementation_name == 'cpython' and os_name == 'nt'", -] dependencies = [ - { name = "cffi", marker = "implementation_name == 'cpython' and os_name == 'nt'" }, + { name = "cffi", marker = "implementation_name == 'cpython'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/57/43d0e4475c336282deb7acf5285c35fbdce6a041d1583dbe30e0d761117d/curl_cffi-0.5.10.tar.gz", hash = "sha256:55bac4b73e2d80ceeaabea33270fc8ca6ace594128a46710242f2e688b4f8bfc", size = 35231, upload-time = "2023-11-25T10:07:20.473Z" } wheels = [ + { url = "https://files.pythonhosted.org/packages/30/18/72d6d4f5ec0db3ea68dde9ad775f9985ce22e699a47ab5b4cfcf8d74f3f5/curl_cffi-0.5.10-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:892603dab5e56fb72bfff7ae969136138971f63f63defe98232e1ec55cb0f1c6", size = 4930761, upload-time = "2023-11-25T10:07:08.441Z" }, + { url = "https://files.pythonhosted.org/packages/23/25/3c9fdb7e08e4722eb9e7ebcc59c74d7eaf32ca4a878a3fdf581030940431/curl_cffi-0.5.10-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9937b8e13b1a6963c63e155b6621ec74649965105efedb919bc226fe731861cc", size = 2158498, upload-time = "2023-11-25T10:15:44.424Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/df2c475bae057af3fff4d288a388062032e7005ad312e8f55490faef82e2/curl_cffi-0.5.10-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b537595b9610a4dd0927c09823925b4e32b1ce0fd04385bfc5bb72ab830720e6", size = 6890322, upload-time = "2023-11-25T10:07:12.034Z" }, + { url = "https://files.pythonhosted.org/packages/93/04/0d57a51690451437f72563031036ea4f4b529bc0cfd8e8d93487075cbe80/curl_cffi-0.5.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b2bc8822d23415f6533c8b750475e9bbc76764025fe1dcb5866dc033607fd7b", size = 7208146, upload-time = "2023-11-25T10:07:15.331Z" }, { url = "https://files.pythonhosted.org/packages/4e/70/0d4d667287d54cb4daf8ce67068565c8e09e4f839a835119cd27f229a446/curl_cffi-0.5.10-cp37-abi3-win_amd64.whl", hash = "sha256:f9a1874b860c4e8db49bdfd9b9d4dc39999a1397d271ec78624c35c838e9e92a", size = 2643997, upload-time = "2023-11-25T10:07:18.005Z" }, ] -[[package]] -name = "curl-cffi" -version = "0.7.1" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and implementation_name == 'cpython' and os_name != 'nt'", - "python_full_version < '3.12' and implementation_name == 'cpython' and os_name != 'nt'", -] -dependencies = [ - { name = "certifi", marker = "implementation_name == 'cpython' and os_name != 'nt'" }, - { name = "cffi", marker = "implementation_name == 'cpython' and os_name != 'nt'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/63/31/00b346537ece64d835fc5d7310f89acee5318fdd9fa72247913274e28817/curl_cffi-0.7.1.tar.gz", hash = "sha256:8a64b12432146a3f178c4792c91188c18f50cc4b76e908ffc3206442c4610894", size = 133179, upload-time = "2024-07-13T09:07:44.938Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/11/0be8b95d21dc34811c68c45ab382ed20dfbfb4cbc8899f732d8b04651fd9/curl_cffi-0.7.1-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54edae42b25f30048fd6c2de06ed9df37bbe6ffdce14cc8a27c79f8c7d47977a", size = 5101433, upload-time = "2024-07-13T09:07:25.409Z" }, - { url = "https://files.pythonhosted.org/packages/78/67/8dd0c86435a3a7860fafbd72d572498e89a2ccc5ac2654bd3ebd26eecbb1/curl_cffi-0.7.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:5c347e221ddbbde2275aa7cde00933402638c2062a3984104f66b1bb20528545", size = 2482887, upload-time = "2024-07-13T09:07:28.656Z" }, - { url = "https://files.pythonhosted.org/packages/ae/61/8015cfcfdd7487f2cfeca90a18fd5504d29192b4700af93494d6915f508d/curl_cffi-0.7.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a5cc1d9ca59692cc5c175da0b397104283a0fea7515045fd22a7296296d82b", size = 5712868, upload-time = "2024-07-13T09:07:30.623Z" }, - { url = "https://files.pythonhosted.org/packages/d2/d5/0569f453994c94901e85392e72a5df268646e8f19b829a8c807ddd3b996f/curl_cffi-0.7.1-cp38-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0719fec4b5e1c300bf58411b1cea26cb91c44492fcf5a14ef684fe085f4d8b6e", size = 5518309, upload-time = "2024-07-13T09:07:32.741Z" }, - { url = "https://files.pythonhosted.org/packages/25/25/56cbb1fc9cff46999a11ccb75dab1ee62be629d6e871142c37d26ed460d7/curl_cffi-0.7.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e53ab76259b575017d3260854456ba6a3fbe31cee9b44edd275d4ea9f0f20e1", size = 6073331, upload-time = "2024-07-13T09:07:34.83Z" }, - { url = "https://files.pythonhosted.org/packages/49/68/66b047ee7664554d97adc3e4f566d7afcaf57a8d5979508002eecea0da09/curl_cffi-0.7.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:39d04ee1fc5f668ce53234051153031b3a3714300b772379e276565ad7cd244c", size = 6218360, upload-time = "2024-07-13T09:07:36.791Z" }, - { url = "https://files.pythonhosted.org/packages/56/08/3d03b147545cb14f4d1dfe24a581d6671522e465586fc57efeec4f9e5807/curl_cffi-0.7.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:e60f0dca3a55298898c62c21f0d8461e61aab96d033a7e9cead6160462728f7f", size = 6001290, upload-time = "2024-07-13T09:07:38.947Z" }, -] - [[package]] name = "doublethink" version = "0.4.9" @@ -1170,9 +1147,18 @@ wheels = [ [[package]] name = "yt-dlp" -version = "2024.10.22" +version = "2025.5.22" source = { registry = "https://pypi.org/simple" } -dependencies = [ +sdist = { url = "https://files.pythonhosted.org/packages/09/93/695cef32796dc7e76597e68a267a34a1b4e29bef8e12da445fa7c0ad1e55/yt_dlp-2025.5.22.tar.gz", hash = "sha256:ea73854c5dabc124f29a35a8fae9bc5d422ef3231bebeea2bdfa82ac191a9c29", size = 3017654, upload-time = "2025-05-22T09:58:35.694Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/68/4f108193ebce3ee7beb5f9a21daa6bc875e261150b510be468626f151959/yt_dlp-2025.5.22-py3-none-any.whl", hash = "sha256:a49c4b76afeaded6254c3e2b759d8d5a13271aa963d5fccb51fe059d1c313151", size = 3264137, upload-time = "2025-05-22T09:58:32.613Z" }, +] + +[package.optional-dependencies] +curl-cffi = [ + { name = "curl-cffi", marker = "implementation_name == 'cpython'" }, +] +default = [ { name = "brotli", marker = "implementation_name == 'cpython'" }, { name = "brotlicffi", marker = "implementation_name != 'cpython'" }, { name = "certifi" }, @@ -1182,16 +1168,6 @@ dependencies = [ { name = "urllib3" }, { name = "websockets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/2f/79/acfe1c2bf64ed83e1b465e6550c0f5bc2214ea447a900b102f5ca6e4186e/yt_dlp-2024.10.22.tar.gz", hash = "sha256:47b82a1fd22411b5c95ef2f0a1ae1af4e6dfd736ea99fdb2a0ea41445abc62ba", size = 2885622, upload-time = "2024-10-22T05:14:40.575Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/68/548f9819b41d53561d4f3d39588111cf39993c066b6e5300b4ae118eb2e6/yt_dlp-2024.10.22-py3-none-any.whl", hash = "sha256:ba166602ebe22a220e4dc1ead45bf00eb469ed812b22f4fb8bb54734f9b02084", size = 3155189, upload-time = "2024-10-22T05:14:37.631Z" }, -] - -[package.optional-dependencies] -curl-cffi = [ - { name = "curl-cffi", version = "0.5.10", source = { registry = "https://pypi.org/simple" }, marker = "implementation_name == 'cpython' and os_name == 'nt'" }, - { name = "curl-cffi", version = "0.7.1", source = { registry = "https://pypi.org/simple" }, marker = "implementation_name == 'cpython' and os_name != 'nt'" }, -] [[package]] name = "zipp" From d33df4028380f626b798f324040619e34445d0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 12 Jun 2025 15:41:38 -0700 Subject: [PATCH 05/43] gitignore: ignore warcprox files These are created by some tests. --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 4cc3da8..34c63b1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,8 @@ brozzler.egg-info/ # editor config .idea .vscode + +# warcprox +*.local-warcprox-ca* +/warcprox.sqlite +/warcs From 70e4c3d7f6eec65047786853b345bdba131e0918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 15:37:49 -0700 Subject: [PATCH 06/43] worker: fix possibly-unbound status code We assigned this inside an exception handler, and allow processing to continue on after catching the exception. --- brozzler/worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/brozzler/worker.py b/brozzler/worker.py index d1d48db..4d0c6ff 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -339,6 +339,7 @@ class BrozzlerWorker: raise brozzler.PageConnectionError() except brozzler.PageInterstitialShown: page_logger.info("page interstitial shown (http auth)") + status_code = -1 if enable_youtube_dl and self.should_ytdlp( page_logger, site, page, status_code From 422527d7e4908b98ce11b334836ab48c3ab63c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 15:46:39 -0700 Subject: [PATCH 07/43] tests: ruff fixes --- scripts/ytdlp_test.py | 2 +- tests/test_frontier.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ytdlp_test.py b/scripts/ytdlp_test.py index 1b15545..92f1076 100644 --- a/scripts/ytdlp_test.py +++ b/scripts/ytdlp_test.py @@ -14,7 +14,7 @@ def brozzle_page(worker, page) -> bool: # This gets assigned after a video is captured; if an # exception was raised by yt-dlp, it never gets assigned. - if not "videos" in page: + if "videos" not in page: return False if len(page.videos) > 0: diff --git a/tests/test_frontier.py b/tests/test_frontier.py index a25d11b..e5d5b79 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -944,7 +944,6 @@ def test_max_claimed_sites(rethinker): ], "max_claimed_sites": 3, } - seeds_seen = [] job = brozzler.new_job(frontier, job_conf) assert job.id From 0f2c166e2aa62a9083e8d914b7d24716dca3b331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 16:23:57 -0700 Subject: [PATCH 08/43] tests: use github-format in ci --- Makefile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index aaaa5b9..74015c5 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,11 @@ BROZZLER_EGG_LINK = ./brozzler.egg-info ENV ?= LOCAL # Which package manager to use? Valid options: UV, PIP PACKAGE_MANAGER ?= UV +OUTPUT_FLAGS = + +ifeq ($(ENV),CI) + OUTPUT_FLAGS = --output-format=github +endif $(VIRTUAL_ENV_DIR): ifeq ($(PACKAGE_MANAGER),UV) @@ -37,14 +42,14 @@ clean: $(BROZZLER_EGG_LINK) .PHONY: check check: - $(VIRTUAL_ENV_DIR)/bin/ruff check --target-version py37 . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --target-version py37 . .PHONY: check-format check-format: - $(VIRTUAL_ENV_DIR)/bin/ruff check --select I --target-version py37 . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --target-version py37 . $(VIRTUAL_ENV_DIR)/bin/ruff format --check --target-version py37 . .PHONY: format format: - $(VIRTUAL_ENV_DIR)/bin/ruff check --select I --target-version py37 --fix . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --target-version py37 --fix . $(VIRTUAL_ENV_DIR)/bin/ruff format --target-version py37 . From a4e5418e131e9d1d05f4bf5b5f1e7ce6dd5e0a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 16:26:29 -0700 Subject: [PATCH 09/43] tests: enable format check --- .github/workflows/python-formatting.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-formatting.yml b/.github/workflows/python-formatting.yml index 3d58462..179fbe8 100644 --- a/.github/workflows/python-formatting.yml +++ b/.github/workflows/python-formatting.yml @@ -27,5 +27,8 @@ jobs: ./.venv/bin/pip install --upgrade pip ./.venv/bin/pip install ruff + - name: Run check + run: make check + - name: Run formatting check run: make check-format From f9848efc1e02280e250ddd2d86ad1763ff500a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 16:27:54 -0700 Subject: [PATCH 10/43] tests: recognize CI=true --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 74015c5..dca4558 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,12 @@ VIRTUAL_ENV_DIR = .venv BROZZLER_EGG_LINK = ./brozzler.egg-info # Where's the Makefile running? Valid options: LOCAL, CI ENV ?= LOCAL + +# GitHub Actions sets CI=true +ifeq ($(CI),true) + ENV = CI +endif + # Which package manager to use? Valid options: UV, PIP PACKAGE_MANAGER ?= UV OUTPUT_FLAGS = From 38f164dbc4365fcc246c3804491ce2758f09e725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Wed, 25 Jun 2025 17:20:09 -0700 Subject: [PATCH 11/43] Makefile: remove target-version This can be inferred from our pyproject.toml. --- Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index dca4558..409d8d9 100644 --- a/Makefile +++ b/Makefile @@ -48,14 +48,14 @@ clean: $(BROZZLER_EGG_LINK) .PHONY: check check: - $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --target-version py37 . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) . .PHONY: check-format check-format: - $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --target-version py37 . - $(VIRTUAL_ENV_DIR)/bin/ruff format --check --target-version py37 . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I . + $(VIRTUAL_ENV_DIR)/bin/ruff format --check . .PHONY: format format: - $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --target-version py37 --fix . - $(VIRTUAL_ENV_DIR)/bin/ruff format --target-version py37 . + $(VIRTUAL_ENV_DIR)/bin/ruff check $(OUTPUT_FLAGS) --select I --fix . + $(VIRTUAL_ENV_DIR)/bin/ruff format . From 5ff893ddaf95e9149704bccab2ad2f1c502ac2eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 26 Jun 2025 14:27:13 -0700 Subject: [PATCH 12/43] brozzler-new-site: add flag to disable videos This makes it easier to test the new video exclusion work. --- brozzler/cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/brozzler/cli.py b/brozzler/cli.py index 55aaa95..c7d6892 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -39,6 +39,7 @@ import yaml import brozzler import brozzler.worker from brozzler import suggest_default_chrome_exe +from brozzler.model import VideoCaptureOptions r = rdb.RethinkDB() @@ -500,11 +501,22 @@ def brozzler_new_site(argv=None): default=None, help="use this password to try to log in if a login form is found", ) + arg_parser.add_argument( + "--disable-video-capture", + dest="disable_video", + action="store_true", + help="disable video capture for this site", + ) add_common_options(arg_parser, argv) args = arg_parser.parse_args(args=argv[1:]) configure_logging(args) + if args.disable_video: + video_capture = VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value + else: + video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value + rr = rethinker(args) site = brozzler.Site( rr, @@ -522,6 +534,7 @@ def brozzler_new_site(argv=None): ), "username": args.username, "password": args.password, + "video_capture": video_capture, }, ) From a0f60c1051ce4d7bf9f24b579c70eeaf994e5ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 26 Jun 2025 09:23:55 -0700 Subject: [PATCH 13/43] Video exclusion: skip YouTube UMP packets too In testing a page with an embedded YouTube video with video exclusion enabled, I found that brozzler ended up capturing about 30MB of UMP packets. We should be filtering those out too. --- brozzler/worker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 4d0c6ff..055e13a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -412,8 +412,10 @@ class BrozzlerWorker: Determines if the page's Content-Type header specifies that it contains a video. """ - return ( - "content-type" in page_headers and "video" in page_headers["content-type"] + return "content-type" in page_headers and ( + "video" in page_headers["content-type"] + # https://github.com/gsuberland/UMP_Format/blob/main/UMP_Format.md + or page_headers["content-type"] == "application/vnd.yt-ump" ) def _is_pdf(self, page_headers) -> bool: From 7b691fe397b594fb32ea5237bd33da4652d7ebee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Mon, 7 Jul 2025 14:26:03 -0700 Subject: [PATCH 14/43] worker: skip audio content-types for media exclusion --- brozzler/worker.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 055e13a..7e2b254 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -321,9 +321,9 @@ class BrozzlerWorker: elif site.video_capture in [ VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value, VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value, - ] and self._is_video_type(page_headers): + ] and self._is_media_type(page_headers): page_logger.info( - "skipping video content: video MIME type capture disabled for site" + "skipping audio/video content: video MIME type capture disabled for site" ) else: self._fetch_url(site, page=page) @@ -407,13 +407,14 @@ class BrozzlerWorker: and "html" not in page_headers["content-type"] ) - def _is_video_type(self, page_headers) -> bool: + def _is_media_type(self, page_headers) -> bool: """ Determines if the page's Content-Type header specifies that it contains - a video. + audio or video. """ return "content-type" in page_headers and ( "video" in page_headers["content-type"] + or "audio" in page_headers["content-type"] # https://github.com/gsuberland/UMP_Format/blob/main/UMP_Format.md or page_headers["content-type"] == "application/vnd.yt-ump" ) From aea4286bd1a6672519a2d63afae7dd47c0cec6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 10 Jul 2025 09:29:05 -0700 Subject: [PATCH 15/43] ci: use uv --- .github/workflows/setup/action.yml | 8 +++++--- .github/workflows/tests.yml | 9 +++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index 8367704..a0203f0 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -1,5 +1,9 @@ name: Test setup +inputs: + python-version: + required: true + runs: using: composite steps: @@ -21,7 +25,5 @@ runs: - name: Install pip dependencies run: | - pip install .[rethinkdb,warcprox,yt-dlp] - # setuptools required by rethinkdb==2.4.9 - pip install pytest setuptools + uv sync --python ${{ inputs.python-version }} --extra rethinkdb --extra warcprox --extra yt-dlp shell: bash diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4cb2028..5a3b682 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,12 +20,13 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - name: Install uv + uses: astral-sh/setup-uv@v6 + + - uses: ./.github/workflows/setup with: python-version: ${{ matrix.version }} - - uses: ./.github/workflows/setup - - name: Run tests run: | - py.test --tb=native --verbose tests/test_cli.py tests/test_units.py + uv run py.test --tb=native --verbose tests/test_cli.py tests/test_units.py From f9cc2ea48ebc5c873549b9a6061d5ac520e76183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Misty=20De=20M=C3=A9o?= Date: Thu, 10 Jul 2025 09:41:42 -0700 Subject: [PATCH 16/43] ci: test with 3.14 beta 3.14 beta 4 is very late in the cycle, so it seems like a good time for us to start testing with it to make sure we're ready. --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5a3b682..b48e3c2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - version: ['3.9', '3.12'] + version: ['3.9', '3.12', '3.14'] steps: - uses: actions/checkout@v4 From 55e446a41a8014572475a67237748beb3f4e6546 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Jun 2025 17:33:01 -0700 Subject: [PATCH 17/43] initial commit --- brozzler/job_schema.yaml | 6 ++++++ brozzler/model.py | 7 ++++++- brozzler/ydl.py | 33 +++++++++++++++++++++++++++------ pyproject.toml | 1 + 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 59b831f..4f7afe2 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -4,6 +4,12 @@ id: - integer required: false +account_id: + type: + - string + - integer + required: false + <<: &multi_level_options time_limit: type: number diff --git a/brozzler/model.py b/brozzler/model.py index eab4b1c..6d666d5 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -2,7 +2,7 @@ brozzler/models.py - model classes representing jobs, sites, and pages, with related logic -Copyright (C) 2014-2024 Internet Archive +Copyright (C) 2014-2025 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -100,6 +100,10 @@ def new_job(frontier, job_conf): ) if "id" in job_conf: job.id = job_conf["id"] + if "account_id" in job_conf: + job.account_id = job_conf["account_id"] + else: + job.account_id = None if "max_claimed_sites" in job_conf: job.max_claimed_sites = job_conf["max_claimed_sites"] if "pdfs_only" in job_conf: @@ -115,6 +119,7 @@ def new_job(frontier, job_conf): merged_conf["seed"] = merged_conf.pop("url") site = brozzler.Site(frontier.rr, merged_conf) site.id = str(uuid.uuid4()) + site.account_id = job.account_id sites.append(site) pages.append(new_seed_page(frontier, site)) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b3e19fe..3edeee6 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -418,6 +418,23 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result +def get_video_captures(site, source=None): + import psycopg + # todo: read pg_url from environment var + pg_url = "postgresql://ait_crawling:archive-it-crawling@db.qa-archive-it.org/ait_crawling" + account_id = site.account_id if site.account_id else None + seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + if account_id and seed and source: + pg_query = ("SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like '%'+%s+'%'", (account_id, seed, source,)) + elif seed: + pg_query = ("SELECT containing_page_url from video where seed = %s and containing_page_url like '%'+%s+'%'", (seed, source)) + else: + return None + with psycopg.connect(pg_url) as conn: + with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: + cur.execute(pg_query) + return cur.fetchall() + @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): @@ -444,10 +461,14 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): ie_result.get("extractor") == "youtube:playlist" or ie_result.get("extractor") == "youtube:tab" ): - # youtube watch pages as outlinks - outlinks = { - "https://www.youtube.com/watch?v=%s" % e["id"] - for e in ie_result.get("entries_no_dl", []) - } - # any outlinks for other cases? soundcloud, maybe? + captured_youtube_watch_pages = get_video_captures(site, source="youtube") + uncaptured_youtube_watch_pages = [] + for e in ie_result.get("entries_no_dl", []): + youtube_watch_url = f"https://www.youtube.com/watch?v={e["id"]}" + if youtube_watch_url in captured_youtube_watch_pages: + continue + uncaptured_youtube_watch_pages.append(youtube_watch_url) + if uncaptured_youtube_watch_pages: + outlinks.add(uncaptured_youtube_watch_pages) + # todo: handle outlinks for instagram and soundcloud here (if anywhere) return outlinks diff --git a/pyproject.toml b/pyproject.toml index c0fc947..8e40fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "python-magic>=0.4.15", "prometheus-client>=0.20.0", "structlog>=25.1.0", + "psycopg[binary]>=3.2.6", ] license = "Apache-2.0" From 03b329cd2a4ae4090cbd73fa27b3909dbd6e7a67 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Jun 2025 17:37:40 -0700 Subject: [PATCH 18/43] formatting fix --- brozzler/ydl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 3edeee6..4c3bf12 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -464,7 +464,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): captured_youtube_watch_pages = get_video_captures(site, source="youtube") uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): - youtube_watch_url = f"https://www.youtube.com/watch?v={e["id"]}" + youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" if youtube_watch_url in captured_youtube_watch_pages: continue uncaptured_youtube_watch_pages.append(youtube_watch_url) From 92a6cacb5f29b80f3c9532a069dc0fda7bc15491 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Jun 2025 17:49:07 -0700 Subject: [PATCH 19/43] ruff format updates --- brozzler/ydl.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4c3bf12..b406311 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -420,14 +420,25 @@ def _try_youtube_dl(worker, ydl, site, page): def get_video_captures(site, source=None): import psycopg + # todo: read pg_url from environment var pg_url = "postgresql://ait_crawling:archive-it-crawling@db.qa-archive-it.org/ait_crawling" account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None if account_id and seed and source: - pg_query = ("SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like '%'+%s+'%'", (account_id, seed, source,)) + pg_query = ( + "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like '%'+%s+'%'", + ( + account_id, + seed, + source, + ), + ) elif seed: - pg_query = ("SELECT containing_page_url from video where seed = %s and containing_page_url like '%'+%s+'%'", (seed, source)) + pg_query = ( + "SELECT containing_page_url from video where seed = %s and containing_page_url like '%'+%s+'%'", + (seed, source), + ) else: return None with psycopg.connect(pg_url) as conn: @@ -435,6 +446,7 @@ def get_video_captures(site, source=None): cur.execute(pg_query) return cur.fetchall() + @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): From c0db5b94030dd36d379b7a7fb10db57df3f78042 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 20 Jun 2025 17:57:03 -0700 Subject: [PATCH 20/43] CONCAT --- brozzler/ydl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b406311..728b395 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -427,7 +427,7 @@ def get_video_captures(site, source=None): seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None if account_id and seed and source: pg_query = ( - "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like '%'+%s+'%'", + "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like CONCAT('%',%s,'%')", ( account_id, seed, @@ -436,7 +436,7 @@ def get_video_captures(site, source=None): ) elif seed: pg_query = ( - "SELECT containing_page_url from video where seed = %s and containing_page_url like '%'+%s+'%'", + "SELECT containing_page_url from video where seed = %s and containing_page_url like CONCAT('%',%s,'%')", (seed, source), ) else: From fd0e0d3f30c473e665893151a6e36b1b149de4b6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 22 Jun 2025 20:28:48 -0700 Subject: [PATCH 21/43] variable VIDEO_DATA --- brozzler/ydl.py | 58 +++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 728b395..01454a3 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -42,6 +42,8 @@ PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 +VIDEO_DATA = "" + logger = structlog.get_logger(logger_name=__name__) @@ -418,33 +420,37 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result -def get_video_captures(site, source=None): - import psycopg - - # todo: read pg_url from environment var - pg_url = "postgresql://ait_crawling:archive-it-crawling@db.qa-archive-it.org/ait_crawling" - account_id = site.account_id if site.account_id else None - seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None - if account_id and seed and source: - pg_query = ( - "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like CONCAT('%',%s,'%')", - ( - account_id, - seed, - source, - ), - ) - elif seed: - pg_query = ( - "SELECT containing_page_url from video where seed = %s and containing_page_url like CONCAT('%',%s,'%')", - (seed, source), - ) - else: +def get_video_captures(site, source="youtube"): + if not VIDEO_DATA: return None - with psycopg.connect(pg_url) as conn: - with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: - cur.execute(pg_query) - return cur.fetchall() + + if VIDEO_DATA and VIDEO_DATA.startswith("postgresql"): + import psycopg + + pg_url = VIDEO_DATA + account_id = site.account_id if site.account_id else None + seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + containing_page_url_pattern = "http://youtube.com/watch" if source == "youtube" + if account_id and seed and source: + pg_query = ( + "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like %s", + ( + account_id, + seed, + containing_page_url_pattern, + ), + ) + elif seed and source: + pg_query = ( + "SELECT containing_page_url from video where seed = %s and containing_page_url like %s", + (seed, containing_page_url_pattern), + ) + else: + return None + with psycopg.connect(pg_url) as conn: + with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: + cur.execute(pg_query) + return cur.fetchall() @metrics.brozzler_ytdlp_duration_seconds.time() From f925660eb40a1e5122d940c83879d828c51047eb Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 22 Jun 2025 20:46:33 -0700 Subject: [PATCH 22/43] skip ternary op for now --- brozzler/ydl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 01454a3..92972b5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -430,7 +430,11 @@ def get_video_captures(site, source="youtube"): pg_url = VIDEO_DATA account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None - containing_page_url_pattern = "http://youtube.com/watch" if source == "youtube" + if source == "youtube": + containing_page_url_pattern = "http://youtube.com/watch" + # support other sources here + else: + containing_page_url_pattern = None if account_id and seed and source: pg_query = ( "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like %s", From fe5ad0c31d7a1b06d78ce74517b29c7cdd5875a4 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 16:46:37 -0700 Subject: [PATCH 23/43] VIDEO_DATA_SOURCE --- brozzler/ydl.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 92972b5..b52ebf7 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -42,7 +42,7 @@ PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 -VIDEO_DATA = "" +VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) @@ -421,23 +421,22 @@ def _try_youtube_dl(worker, ydl, site, page): def get_video_captures(site, source="youtube"): - if not VIDEO_DATA: + if not VIDEO_DATA_SOURCE: return None - if VIDEO_DATA and VIDEO_DATA.startswith("postgresql"): + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): import psycopg - pg_url = VIDEO_DATA account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None if source == "youtube": - containing_page_url_pattern = "http://youtube.com/watch" + containing_page_url_pattern = "http://youtube.com/watch" # yes, video data canonicalization uses "http" # support other sources here else: containing_page_url_pattern = None if account_id and seed and source: pg_query = ( - "SELECT containing_page_url from video where account_id = %s and seed = %s and containing_page_url like %s", + "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", ( account_id, seed, @@ -451,10 +450,11 @@ def get_video_captures(site, source="youtube"): ) else: return None - with psycopg.connect(pg_url) as conn: + with psycopg.connect(VIDEO_DATA_SOURCE) as conn: with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: cur.execute(pg_query) return cur.fetchall() + return None @metrics.brozzler_ytdlp_duration_seconds.time() From 203d86f40289a878ee9eaa478b063675992cda11 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 17:13:45 -0700 Subject: [PATCH 24/43] use job_conf.get() --- brozzler/model.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 6d666d5..d7c1ac2 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -98,16 +98,10 @@ def new_job(frontier, job_conf): frontier.rr, {"conf": job_conf, "status": "ACTIVE", "started": doublethink.utcnow()}, ) - if "id" in job_conf: - job.id = job_conf["id"] - if "account_id" in job_conf: - job.account_id = job_conf["account_id"] - else: - job.account_id = None - if "max_claimed_sites" in job_conf: - job.max_claimed_sites = job_conf["max_claimed_sites"] - if "pdfs_only" in job_conf: - job.pdfs_only = job_conf["pdfs_only"] + job.id = job_conf.get("id") + job.account_id = job_conf.get("account_id") + job.max_claimed_sites = job_conf.get("max_claimed_sites") + job.pdfs_only = job_conf.get("pdfs_only") job.save() sites = [] From 0526eb816c864bc4255b4242020f66fffa282483 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 17:18:59 -0700 Subject: [PATCH 25/43] make psycopg dependency optional --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8e40fd5..e8ac6af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,12 +35,12 @@ dependencies = [ "python-magic>=0.4.15", "prometheus-client>=0.20.0", "structlog>=25.1.0", - "psycopg[binary]>=3.2.6", ] license = "Apache-2.0" [project.optional-dependencies] yt-dlp = ["yt-dlp[default,curl-cffi]>=2024.7.25"] +psycopg = ["psycopg[binary]>=3.2.6"] dashboard = ["flask>=1.0", "gunicorn>=19.8.1"] warcprox = ["warcprox>=2.4.31"] rethinkdb = [ From af1aaeee34a2aee3a915d9478933cea786588156 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 17:24:43 -0700 Subject: [PATCH 26/43] containing_page_url_pattern update --- brozzler/ydl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index b52ebf7..e474335 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -430,7 +430,7 @@ def get_video_captures(site, source="youtube"): account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None if source == "youtube": - containing_page_url_pattern = "http://youtube.com/watch" # yes, video data canonicalization uses "http" + containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other sources here else: containing_page_url_pattern = None From 8dcac47ae8df3b07d893703e581735eb4020ae2a Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 17:39:07 -0700 Subject: [PATCH 27/43] type hint get_video_captures --- brozzler/ydl.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e474335..2d0b0cb 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -27,6 +27,7 @@ import urllib.request import doublethink import structlog +from typing import List import urlcanon import yt_dlp from yt_dlp.utils import ExtractorError, match_filter_func @@ -420,9 +421,9 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result -def get_video_captures(site, source="youtube"): +def get_video_captures(site, source="youtube") -> List[str]: if not VIDEO_DATA_SOURCE: - return None + return [] if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): import psycopg @@ -449,12 +450,12 @@ def get_video_captures(site, source="youtube"): (seed, containing_page_url_pattern), ) else: - return None + return [] with psycopg.connect(VIDEO_DATA_SOURCE) as conn: with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: cur.execute(pg_query) return cur.fetchall() - return None + return [] @metrics.brozzler_ytdlp_duration_seconds.time() @@ -483,7 +484,8 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): ie_result.get("extractor") == "youtube:playlist" or ie_result.get("extractor") == "youtube:tab" ): - captured_youtube_watch_pages = get_video_captures(site, source="youtube") + captured_youtube_watch_pages = set() + captured_youtube_watch_pages.add(get_video_captures(site, source="youtube")) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" From 667feae5592146f17c8a8fae17c1724796e5bdfe Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 17:50:43 -0700 Subject: [PATCH 28/43] ruff import block fix --- brozzler/ydl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 2d0b0cb..e514e60 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -24,10 +24,10 @@ import tempfile import threading import time import urllib.request +from typing import List import doublethink import structlog -from typing import List import urlcanon import yt_dlp from yt_dlp.utils import ExtractorError, match_filter_func From f21d312ca99e5b6b235f4f4137eed7c5a0ec1f1c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 23 Jun 2025 19:43:38 -0700 Subject: [PATCH 29/43] initial interface update --- brozzler/ydl.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e514e60..f60c585 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -421,41 +421,41 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result -def get_video_captures(site, source="youtube") -> List[str]: - if not VIDEO_DATA_SOURCE: - return [] +class VideoDataClient: + import psycopg + from psycopg_pool import ConnectionPool - if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - import psycopg + def __init__(self, site=None): + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): + self.pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) + self.account_id = site.account_id if site.account_id else None + self.seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None - account_id = site.account_id if site.account_id else None - seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + def get_video_captures_from_db(self, source="youtube") -> List[str]: if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other sources here else: containing_page_url_pattern = None - if account_id and seed and source: + if self.account_id and self.seed and source: pg_query = ( "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", ( - account_id, - seed, + self.account_id, + self.seed, containing_page_url_pattern, ), ) - elif seed and source: + elif self.seed and source: pg_query = ( "SELECT containing_page_url from video where seed = %s and containing_page_url like %s", - (seed, containing_page_url_pattern), + (self.seed, containing_page_url_pattern), ) - else: - return [] - with psycopg.connect(VIDEO_DATA_SOURCE) as conn: + + with self.pool.connection() as conn: with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: cur.execute(pg_query) return cur.fetchall() - return [] @metrics.brozzler_ytdlp_duration_seconds.time() @@ -485,7 +485,9 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): or ie_result.get("extractor") == "youtube:tab" ): captured_youtube_watch_pages = set() - captured_youtube_watch_pages.add(get_video_captures(site, source="youtube")) + captured_youtube_watch_pages.add( + VideoDataClient.get_video_captures(site, source="youtube") + ) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" From de4e7e0c0845257a1ea5d0c5da69cd9f3d807096 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 25 Jun 2025 18:17:56 -0700 Subject: [PATCH 30/43] VideoDataWrapper refined --- brozzler/ydl.py | 73 ++++++++++++++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f60c585..57e5553 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -421,41 +421,56 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result -class VideoDataClient: +class VideoDataWrapper: import psycopg from psycopg_pool import ConnectionPool - def __init__(self, site=None): + def __init__(self): if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - self.pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) - self.account_id = site.account_id if site.account_id else None - self.seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) + pool.wait() + logger.info("pg pool ready") + self.pool = pool + atexit.register(pool.close) + + def _execute_query( + self, query: str, row_factory=None, fetchone=False, fetchall=False + ) -> Optional[Any]: + with self.pool.connection() as conn: + with conn.cursor(row_factory=row_factory) as cur: + cur.execute(pg_query) + if fetchone: + return cur.fetchone() + if fetchall: + return cur.fetchall() + + def get_pg_video_captures(self, site=None, source="youtube") -> List[str]: + account_id = site.account_id if site.account_id else None + seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None - def get_video_captures_from_db(self, source="youtube") -> List[str]: if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other sources here else: containing_page_url_pattern = None - if self.account_id and self.seed and source: + if account_id and seed and source: pg_query = ( "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", ( - self.account_id, - self.seed, + account_id, + seed, containing_page_url_pattern, ), ) - elif self.seed and source: + elif seed and source: pg_query = ( "SELECT containing_page_url from video where seed = %s and containing_page_url like %s", - (self.seed, containing_page_url_pattern), + (seed, containing_page_url_pattern), ) - - with self.pool.connection() as conn: - with conn.cursor(row_factory=psycopg.rows.scalar_row) as cur: - cur.execute(pg_query) - return cur.fetchall() + results = self._execute_query( + pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True + ) + return results @metrics.brozzler_ytdlp_duration_seconds.time() @@ -484,17 +499,19 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): ie_result.get("extractor") == "youtube:playlist" or ie_result.get("extractor") == "youtube:tab" ): - captured_youtube_watch_pages = set() - captured_youtube_watch_pages.add( - VideoDataClient.get_video_captures(site, source="youtube") - ) - uncaptured_youtube_watch_pages = [] - for e in ie_result.get("entries_no_dl", []): - youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" - if youtube_watch_url in captured_youtube_watch_pages: - continue - uncaptured_youtube_watch_pages.append(youtube_watch_url) - if uncaptured_youtube_watch_pages: - outlinks.add(uncaptured_youtube_watch_pages) + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): + video_data = VideoDataWrapper() + captured_youtube_watch_pages = set() + captured_youtube_watch_pages.add( + video_data.get_pg_video_captures(site, source="youtube") + ) + uncaptured_youtube_watch_pages = [] + for e in ie_result.get("entries_no_dl", []): + youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" + if youtube_watch_url in captured_youtube_watch_pages: + continue + uncaptured_youtube_watch_pages.append(youtube_watch_url) + if uncaptured_youtube_watch_pages: + outlinks.add(uncaptured_youtube_watch_pages) # todo: handle outlinks for instagram and soundcloud here (if anywhere) return outlinks From 046db4b6cc739b9b1ddcf76258aa4f2b4212570a Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Sun, 29 Jun 2025 19:01:20 -0700 Subject: [PATCH 31/43] VideoDataClient, generalized --- brozzler/worker.py | 1 + brozzler/ydl.py | 133 ++++++++++++++++++++++++++------------------- 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 7e2b254..983f89d 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -39,6 +39,7 @@ from urllib3.exceptions import ProxyError, TimeoutError import brozzler import brozzler.browser from brozzler.model import VideoCaptureOptions +from brozzler.ydl import VideoDataClient from . import metrics diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 57e5553..423dfe8 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -49,6 +49,81 @@ VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) +class VideoDataClient: + def __init__(self): + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): + import psycopg + from psycopg_pool import ConnectionPool + + pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) + pool.wait() + logger.info("pg pool ready") + atexit.register(pool.close) + + self.pool = pool + + def _execute_pg_query( + self, query: str, row_factory=None, fetchone=False, fetchall=False + ) -> Optional[Any]: + with self.pool.connection() as conn: + with conn.cursor(row_factory=row_factory) as cur: + cur.execute(query) + if fetchone: + return cur.fetchone() + if fetchall: + return cur.fetchall() + + def get_pg_video_captures(self, site=None, source=None) -> List[str]: + account_id = site.account_id if site.account_id else None + seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + + # TODO: generalize, maybe make variable? + containing_page_timestamp_pattern = "2025%" + + if source == "youtube": + containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" + # support other media sources here + + if account_id and seed and source: + pg_query = ( + "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", + ( + account_id, + seed, + containing_page_url_pattern, + ), + ) + elif account_id and seed: + pg_query = ( + "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_timestamp like %s", + ( + account_id, + seed, + containing_page_timestamp_pattern, + ), + ) + elif seed and source: + pg_query = ( + "SELECT distinct(containing_page_url) from video where seed = %s and containing_page_url like %s", + (seed, containing_page_url_pattern), + ) + elif seed: + pg_query = ( + "SELECT distinct(containing_page_url) from video where seed = %s and containing_page_timestamp like %s", + ( + seed, + containing_page_timestamp_pattern, + ), + ) + try: + results = self._execute_query( + pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True + ) + except Exception as e: + logger.warn("postgres query failed: %s", e) + return results + + def isyoutubehost(url): # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname return "youtube.com" in url.split("//")[-1].split("/")[0].split("?")[0] @@ -421,58 +496,6 @@ def _try_youtube_dl(worker, ydl, site, page): return ie_result -class VideoDataWrapper: - import psycopg - from psycopg_pool import ConnectionPool - - def __init__(self): - if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) - pool.wait() - logger.info("pg pool ready") - self.pool = pool - atexit.register(pool.close) - - def _execute_query( - self, query: str, row_factory=None, fetchone=False, fetchall=False - ) -> Optional[Any]: - with self.pool.connection() as conn: - with conn.cursor(row_factory=row_factory) as cur: - cur.execute(pg_query) - if fetchone: - return cur.fetchone() - if fetchall: - return cur.fetchall() - - def get_pg_video_captures(self, site=None, source="youtube") -> List[str]: - account_id = site.account_id if site.account_id else None - seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None - - if source == "youtube": - containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" - # support other sources here - else: - containing_page_url_pattern = None - if account_id and seed and source: - pg_query = ( - "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", - ( - account_id, - seed, - containing_page_url_pattern, - ), - ) - elif seed and source: - pg_query = ( - "SELECT containing_page_url from video where seed = %s and containing_page_url like %s", - (seed, containing_page_url_pattern), - ) - results = self._execute_query( - pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True - ) - return results - - @metrics.brozzler_ytdlp_duration_seconds.time() @metrics.brozzler_in_progress_ytdlps.track_inprogress() def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): @@ -500,10 +523,10 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): or ie_result.get("extractor") == "youtube:tab" ): if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - video_data = VideoDataWrapper() + video_data = VideoDataClient() captured_youtube_watch_pages = set() captured_youtube_watch_pages.add( - video_data.get_pg_video_captures(site, source="youtube") + video_data.get_pg_video_captures_by_source(site, source="youtube") ) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): @@ -513,5 +536,5 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): uncaptured_youtube_watch_pages.append(youtube_watch_url) if uncaptured_youtube_watch_pages: outlinks.add(uncaptured_youtube_watch_pages) - # todo: handle outlinks for instagram and soundcloud here (if anywhere) + # todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere) return outlinks From 7d58a9ae3b7708dc42c874a0ef72f9cec216e4fc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 14:38:21 -0700 Subject: [PATCH 32/43] keep it simple for now --- brozzler/ydl.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 423dfe8..a5541f0 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -72,13 +72,14 @@ class VideoDataClient: return cur.fetchone() if fetchall: return cur.fetchall() + return None def get_pg_video_captures(self, site=None, source=None) -> List[str]: account_id = site.account_id if site.account_id else None seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None # TODO: generalize, maybe make variable? - containing_page_timestamp_pattern = "2025%" + containing_page_timestamp_pattern = "2025%" # for future pre-dup additions if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" @@ -93,34 +94,18 @@ class VideoDataClient: containing_page_url_pattern, ), ) - elif account_id and seed: - pg_query = ( - "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_timestamp like %s", - ( - account_id, - seed, - containing_page_timestamp_pattern, - ), - ) - elif seed and source: + elif seed and source: # account_id should usually be present pg_query = ( "SELECT distinct(containing_page_url) from video where seed = %s and containing_page_url like %s", (seed, containing_page_url_pattern), ) - elif seed: - pg_query = ( - "SELECT distinct(containing_page_url) from video where seed = %s and containing_page_timestamp like %s", - ( - seed, - containing_page_timestamp_pattern, - ), - ) try: results = self._execute_query( pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True ) except Exception as e: logger.warn("postgres query failed: %s", e) + results = [] return results From db17335ffb7fd3268a69da5ae84564d092f71e6d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 15:10:14 -0700 Subject: [PATCH 33/43] fix github ruff issues --- .github/workflows/setup/action.yml | 4 +++- brozzler/worker.py | 1 - brozzler/ydl.py | 32 +++++++++++++++++------------- pyproject.toml | 4 ++-- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml index a0203f0..58b2d1f 100644 --- a/.github/workflows/setup/action.yml +++ b/.github/workflows/setup/action.yml @@ -25,5 +25,7 @@ runs: - name: Install pip dependencies run: | - uv sync --python ${{ inputs.python-version }} --extra rethinkdb --extra warcprox --extra yt-dlp + pip install .[rethinkdb,warcprox,yt-dlp,psycopg] + # setuptools required by rethinkdb==2.4.9 + pip install pytest setuptools shell: bash diff --git a/brozzler/worker.py b/brozzler/worker.py index 983f89d..7e2b254 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -39,7 +39,6 @@ from urllib3.exceptions import ProxyError, TimeoutError import brozzler import brozzler.browser from brozzler.model import VideoCaptureOptions -from brozzler.ydl import VideoDataClient from . import metrics diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a5541f0..685e1e5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -24,12 +24,14 @@ import tempfile import threading import time import urllib.request -from typing import List +from typing import Any, List, Optional import doublethink +import psycopg import structlog import urlcanon import yt_dlp +from psycopg_pool import ConnectionPool, PoolTimeout from yt_dlp.utils import ExtractorError, match_filter_func import brozzler @@ -38,7 +40,6 @@ from . import metrics thread_local = threading.local() - PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 @@ -52,26 +53,29 @@ logger = structlog.get_logger(logger_name=__name__) class VideoDataClient: def __init__(self): if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - import psycopg - from psycopg_pool import ConnectionPool - pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) pool.wait() logger.info("pg pool ready") - atexit.register(pool.close) + # atexit.register(pool.close) self.pool = pool def _execute_pg_query( self, query: str, row_factory=None, fetchone=False, fetchall=False ) -> Optional[Any]: - with self.pool.connection() as conn: - with conn.cursor(row_factory=row_factory) as cur: - cur.execute(query) - if fetchone: - return cur.fetchone() - if fetchall: - return cur.fetchall() + try: + with self.pool.connection() as conn: + with conn.cursor(row_factory=row_factory) as cur: + cur.execute(query) + if fetchone: + return cur.fetchone() + if fetchall: + return cur.fetchall() + except PoolTimeout as e: + logger.warn("hit PoolTimeout: %s", e) + self.pool.check() + except Exception as e: + logger.warn("postgres query failed: %s", e) return None def get_pg_video_captures(self, site=None, source=None) -> List[str]: @@ -79,7 +83,7 @@ class VideoDataClient: seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None # TODO: generalize, maybe make variable? - containing_page_timestamp_pattern = "2025%" # for future pre-dup additions + # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions if source == "youtube": containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" diff --git a/pyproject.toml b/pyproject.toml index e8ac6af..4d7e0c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "brozzler" -version = "1.7.0" +version = "1.7.1" authors = [ { name="Noah Levitt", email="nlevitt@archive.org" }, ] @@ -40,7 +40,7 @@ license = "Apache-2.0" [project.optional-dependencies] yt-dlp = ["yt-dlp[default,curl-cffi]>=2024.7.25"] -psycopg = ["psycopg[binary]>=3.2.6"] +psycopg = ["psycopg[binary,pool]>=3.2.6"] dashboard = ["flask>=1.0", "gunicorn>=19.8.1"] warcprox = ["warcprox>=2.4.31"] rethinkdb = [ From b4b950c0fca1b015a580de543e505d0adfd51dab Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 17:27:14 -0700 Subject: [PATCH 34/43] self._video_data in worker --- brozzler/worker.py | 5 +++++ brozzler/ydl.py | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 7e2b254..fa4e7e2 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -21,6 +21,7 @@ limitations under the License. import datetime import io import json +import os import socket import threading import time @@ -39,6 +40,7 @@ from urllib3.exceptions import ProxyError, TimeoutError import brozzler import brozzler.browser from brozzler.model import VideoCaptureOptions +from brozzler.ydl import VideoDataClient from . import metrics @@ -56,6 +58,7 @@ class BrozzlerWorker: SITE_SESSION_MINUTES = 15 HEADER_REQUEST_TIMEOUT = 30 FETCH_URL_TIMEOUT = 60 + VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") def __init__( self, @@ -88,6 +91,8 @@ class BrozzlerWorker: self._service_registry = service_registry self._ytdlp_proxy_endpoints = ytdlp_proxy_endpoints self._max_browsers = max_browsers + if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): + self._video_data = VideoDataClient() self._warcprox_auto = warcprox_auto self._proxy = proxy diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 685e1e5..f256831 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -51,14 +51,16 @@ logger = structlog.get_logger(logger_name=__name__) class VideoDataClient: - def __init__(self): - if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) - pool.wait() - logger.info("pg pool ready") - # atexit.register(pool.close) + import psycopg + from psycopg_pool import ConnectionPool, PoolTimeout - self.pool = pool + def __init__(self): + pool = ConnectionPool(VIDEO_DATA_SOURCE, min_size=1, max_size=9) + pool.wait() + logger.info("pg pool ready") + # atexit.register(pool.close) + + self.pool = pool def _execute_pg_query( self, query: str, row_factory=None, fetchone=False, fetchall=False @@ -512,10 +514,11 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): or ie_result.get("extractor") == "youtube:tab" ): if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): - video_data = VideoDataClient() captured_youtube_watch_pages = set() captured_youtube_watch_pages.add( - video_data.get_pg_video_captures_by_source(site, source="youtube") + worker._video_data.get_pg_video_captures_by_source( + site, source="youtube" + ) ) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): From 43151027f3fb25aaf0543af8dd85b089bf3038e6 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 20:26:48 -0700 Subject: [PATCH 35/43] dataclass VideoDataRecord --- brozzler/ydl.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index f256831..e1043a5 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -31,6 +31,7 @@ import psycopg import structlog import urlcanon import yt_dlp +from dataclasses import dataclass from psycopg_pool import ConnectionPool, PoolTimeout from yt_dlp.utils import ExtractorError, match_filter_func @@ -49,6 +50,26 @@ VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) +@dataclass(frozen=True) +class VideoDataRecord: + crawl_job_id: int + is_test_crawl: bool + seed_id: int + collection_id: int + containing_page_timestamp: str + containing_page_digest: str + containing_page_media_index: int + containing_page_media_count: int + video_digest: str + video_timestamp: str + video_mimetype: str + video_http_status: int + video_size: int + containing_page_url: str + video_url: str + video_title: str + video_source_id: str + class VideoDataClient: import psycopg @@ -114,6 +135,9 @@ class VideoDataClient: results = [] return results + def create_video_capture_record(self, video_capture_record): + + def isyoutubehost(url): # split 1 splits scheme from url, split 2 splits path from hostname, split 3 splits query string on hostname From 2fc30b029c27e7f2d7490099f23598cb77c79c86 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 20:29:28 -0700 Subject: [PATCH 36/43] dataclass VideoCaptureRecord instead --- brozzler/ydl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index e1043a5..a196377 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -51,7 +51,7 @@ VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) @dataclass(frozen=True) -class VideoDataRecord: +class VideoCaptureRecord: crawl_job_id: int is_test_crawl: bool seed_id: int From 2422e9da0414a607efb25e3bd71d121ddc52066d Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 30 Jun 2025 20:49:37 -0700 Subject: [PATCH 37/43] def create_video_capture_record minimally --- brozzler/ydl.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index a196377..83ff378 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -50,6 +50,7 @@ VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) +# video_title and video_source_id are new fields, from yt-dlp metadata @dataclass(frozen=True) class VideoCaptureRecord: crawl_job_id: int @@ -136,7 +137,8 @@ class VideoDataClient: return results def create_video_capture_record(self, video_capture_record): - + # to be implemented + pass def isyoutubehost(url): From d4e0aa67ec25af756bac823e79faab92d445d38c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 1 Jul 2025 14:53:59 -0700 Subject: [PATCH 38/43] more new fields --- brozzler/ydl.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 83ff378..c330c84 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -50,7 +50,8 @@ VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") logger = structlog.get_logger(logger_name=__name__) -# video_title and video_source_id are new fields, from yt-dlp metadata + +# video_title, video_display_id, video_resolution, video_capture_status are new fields, mostly from yt-dlp metadata @dataclass(frozen=True) class VideoCaptureRecord: crawl_job_id: int @@ -69,7 +70,11 @@ class VideoCaptureRecord: containing_page_url: str video_url: str video_title: str - video_source_id: str + video_display_id: ( + str # aka yt-dlp metadata as display_id, e.g., youtube watch page v param + ) + video_resolution: str + video_capture_status: str # recrawl? what else? class VideoDataClient: From 825de5f728ec772a21cf015456d5eb15749fac45 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 1 Jul 2025 18:23:48 -0700 Subject: [PATCH 39/43] worker._video_data and seed_id mostly --- brozzler/ydl.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index c330c84..ca45c40 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -45,8 +45,6 @@ PROXY_ATTEMPTS = 4 YTDLP_WAIT = 10 YTDLP_MAX_REDIRECTS = 5 -VIDEO_DATA_SOURCE = os.getenv("VIDEO_DATA_SOURCE") - logger = structlog.get_logger(logger_name=__name__) @@ -107,9 +105,9 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) return None - def get_pg_video_captures(self, site=None, source=None) -> List[str]: + def get_video_captures(self, site=None, source=None) -> List[str]: account_id = site.account_id if site.account_id else None - seed = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None # TODO: generalize, maybe make variable? # containing_page_timestamp_pattern = "2025%" # for future pre-dup additions @@ -118,20 +116,18 @@ class VideoDataClient: containing_page_url_pattern = "http://youtube.com/watch%" # yes, video data canonicalization uses "http" # support other media sources here - if account_id and seed and source: + if account_id and seed_id and source: pg_query = ( - "SELECT distinct(containing_page_url) from video where account_id = %s and seed = %s and containing_page_url like %s", + "SELECT containing_page_url from video where account_id = %s and seed_id = %s and containing_page_url like %s", ( account_id, - seed, + seed_id, containing_page_url_pattern, ), ) - elif seed and source: # account_id should usually be present - pg_query = ( - "SELECT distinct(containing_page_url) from video where seed = %s and containing_page_url like %s", - (seed, containing_page_url_pattern), - ) + else: + logger.warn("missing account_id, seed_id, or source") + results = [] try: results = self._execute_query( pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True @@ -142,7 +138,7 @@ class VideoDataClient: return results def create_video_capture_record(self, video_capture_record): - # to be implemented + # note: brozzler postcrawl step 72 includes info from crawl-log pass @@ -539,15 +535,16 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): logger.info("tempdir for yt-dlp", tempdir=tempdir) ydl = _build_youtube_dl(worker, tempdir, site, page, ytdlp_proxy_endpoints) ie_result = _try_youtube_dl(worker, ydl, site, page) + # print(ie_result) outlinks = set() if ie_result and ( ie_result.get("extractor") == "youtube:playlist" or ie_result.get("extractor") == "youtube:tab" ): - if VIDEO_DATA_SOURCE and VIDEO_DATA_SOURCE.startswith("postgresql"): + if worker._video_data: captured_youtube_watch_pages = set() captured_youtube_watch_pages.add( - worker._video_data.get_pg_video_captures_by_source( + worker._video_data.get_video_captures_by_source( site, source="youtube" ) ) From 701707c7ceb8b792afaa7ae5da2cc1af3496879e Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 1 Jul 2025 23:47:51 -0700 Subject: [PATCH 40/43] save video_record here? --- brozzler/ydl.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ca45c40..4130f13 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -408,9 +408,9 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints): return ydl -def _remember_videos(page, pushed_videos=None): +def _remember_videos(page, worker, info_json=None, pushed_videos=None): """ - Saves info about videos captured by yt-dlp in `page.videos`. + Saves info about videos captured by yt-dlp in `page.videos` and postgres. """ if "videos" not in page: page.videos = [] @@ -422,6 +422,29 @@ def _remember_videos(page, pushed_videos=None): "content-type": pushed_video["content-type"], "content-length": pushed_video["content-length"], } + # grab info from info_json if it's available + video_record = worker._video_data.VideoCaptureRecord() + video_record.crawl_job_id = None + video_record.is_test_crawl = None + video_record.seed_id = None + video_record.collection_id = None + video_record.containing_page_timestamp = None + video_record.containing_page_digest = None + video_record.containing_page_media_index = None + video_record.containing_page_media_count = None + video_record.video_digest = None + video_record.video_timestamp = None + video_record.video_mimetype = pushed_video["content-type"] + video_record.video_http_status = pushed_video["response_code"] + video_record.video_size = None + video_record.containing_page_url = None + video_record.video_url = pushed_video["url"] + video_record.video_title = None + video_record.video_display_id = None + video_record.video_resolution = None + video_record.video_capture_status = None # "recrawl" + worker._video_data.save_video_capture_record(video_record) + logger.debug("embedded video", video=video) page.videos.append(video) @@ -496,9 +519,9 @@ def _try_youtube_dl(worker, ydl, site, page): logger.info("ytdlp completed successfully") - _remember_videos(page, ydl.pushed_videos) + info_json = json.dumps(ie_result, sort_keys=True, indent=4) + _remember_videos(page, info_json, ydl.pushed_videos) if worker._using_warcprox(site): - info_json = json.dumps(ie_result, sort_keys=True, indent=4) logger.info( "sending WARCPROX_WRITE_RECORD request to warcprox with yt-dlp json", url=ydl.url, From 3c1328ff533da2a1d4971ee5044ac9aa9f02c9ea Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 2 Jul 2025 17:06:41 -0700 Subject: [PATCH 41/43] save_video_capture_record --- brozzler/ydl.py | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 4130f13..6e2f3e9 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -138,8 +138,20 @@ class VideoDataClient: return results def create_video_capture_record(self, video_capture_record): - # note: brozzler postcrawl step 72 includes info from crawl-log - pass + # note: brozzler postcrawl step 72 includes info from crawl-log — watch for differences! + # TODO: needs added fields added to postgres table, refinement + pg_query = ( + f"INSERT INTO video ({VideoCaptureRecord - items}) VALUES (%s, %s, ...)", + VideoCaptureRecord - values, + ) + try: + results = self._execute_query( + pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True + ) + except Exception as e: + logger.warn("postgres query failed: %s", e) + results = [] + return results def isyoutubehost(url): @@ -408,7 +420,7 @@ def _build_youtube_dl(worker, destdir, site, page, ytdlp_proxy_endpoints): return ydl -def _remember_videos(page, worker, info_json=None, pushed_videos=None): +def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None): """ Saves info about videos captured by yt-dlp in `page.videos` and postgres. """ @@ -422,12 +434,14 @@ def _remember_videos(page, worker, info_json=None, pushed_videos=None): "content-type": pushed_video["content-type"], "content-length": pushed_video["content-length"], } - # grab info from info_json if it's available + + warc_prefix_items = site.warcprox_meta["warc-prefix"].split("-") + video_record = worker._video_data.VideoCaptureRecord() - video_record.crawl_job_id = None - video_record.is_test_crawl = None - video_record.seed_id = None - video_record.collection_id = None + video_record.crawl_job_id = site.job_id + video_record.is_test_crawl = True if warc_prefix_items[2] == "TEST" else False + video_record.seed_id = site.ait_seed_id + video_record.collection_id = int(warc_prefix_items[1]) video_record.containing_page_timestamp = None video_record.containing_page_digest = None video_record.containing_page_media_index = None @@ -436,13 +450,17 @@ def _remember_videos(page, worker, info_json=None, pushed_videos=None): video_record.video_timestamp = None video_record.video_mimetype = pushed_video["content-type"] video_record.video_http_status = pushed_video["response_code"] - video_record.video_size = None - video_record.containing_page_url = None + video_record.video_size = pushed_video["content-length"] # probably? + video_record.containing_page_url = str( + urlcanon.aggressive(ydl.url) + ) # probably? video_record.video_url = pushed_video["url"] - video_record.video_title = None - video_record.video_display_id = None - video_record.video_resolution = None - video_record.video_capture_status = None # "recrawl" + # note: ie_result may not be correct when multiple videos present + video_record.video_title = ie_result.get("title") + video_record.video_display_id = ie_result.get("display_id") + video_record.video_resolution = ie_result.get("resolution") + video_record.video_capture_status = None # "recrawl" maybe + worker._video_data.save_video_capture_record(video_record) logger.debug("embedded video", video=video) From bc987b2a2737cd8fdbba17001593a513802528fc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 14 Jul 2025 17:07:33 -0700 Subject: [PATCH 42/43] add get_recent_video_capture, mostly --- brozzler/ydl.py | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 6e2f3e9..ee837ad 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -24,7 +24,7 @@ import tempfile import threading import time import urllib.request -from typing import Any, List, Optional +from typing import Any, Bool, List, Optional import doublethink import psycopg @@ -105,6 +105,33 @@ class VideoDataClient: logger.warn("postgres query failed: %s", e) return None + def get_recent_video_capture(self, site=None, containing_page_url=None) -> List: + account_id = site.account_id if site.account_id else None + seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None + + if account_id and seed_id and containing_page_url: + # check for postgres query for most recent record + pg_query = ( + "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s LIMIT 1", + ( + account_id, + seed_id, + str(urlcanon.aggressive(containing_page_url)) + ) + ) + try: + results = self._execute_query( + pg_query, fetchall=True + ) + except Exception as e: + logger.warn("postgres query failed: %s", e) + results = [] + else: + logger.warn("missing account_id, seed_id, or containing_page_url") + results = [] + + return results + def get_video_captures(self, site=None, source=None) -> List[str]: account_id = site.account_id if site.account_id else None seed_id = site.metadata.ait_seed_id if site.metadata.ait_seed_id else None @@ -125,16 +152,17 @@ class VideoDataClient: containing_page_url_pattern, ), ) + try: + results = self._execute_query( + pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True + ) + except Exception as e: + logger.warn("postgres query failed: %s", e) + results = [] else: logger.warn("missing account_id, seed_id, or source") results = [] - try: - results = self._execute_query( - pg_query, row_factory=psycopg.rows.scalar_row, fetchall=True - ) - except Exception as e: - logger.warn("postgres query failed: %s", e) - results = [] + return results def create_video_capture_record(self, video_capture_record): @@ -591,7 +619,7 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): ) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): - youtube_watch_url = f"https://www.youtube.com/watch?v={e['id']}" + youtube_watch_url = str(urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}")) if youtube_watch_url in captured_youtube_watch_pages: continue uncaptured_youtube_watch_pages.append(youtube_watch_url) From 3af824727771c213ea62df5c42a405990dd8f643 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 15 Jul 2025 14:42:53 -0700 Subject: [PATCH 43/43] updates for QA deploy --- brozzler/ydl.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/brozzler/ydl.py b/brozzler/ydl.py index ee837ad..ab1ccb2 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -113,16 +113,10 @@ class VideoDataClient: # check for postgres query for most recent record pg_query = ( "SELECT * from video where account_id = %s and seed_id = %s and containing_page_url = %s LIMIT 1", - ( - account_id, - seed_id, - str(urlcanon.aggressive(containing_page_url)) - ) + (account_id, seed_id, str(urlcanon.aggressive(containing_page_url))), ) try: - results = self._execute_query( - pg_query, fetchall=True - ) + results = self._execute_query(pg_query, fetchall=True) except Exception as e: logger.warn("postgres query failed: %s", e) results = [] @@ -462,7 +456,8 @@ def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None): "content-type": pushed_video["content-type"], "content-length": pushed_video["content-length"], } - + """ + # WIP: add new video record to QA postgres here, or in postcrawl only? warc_prefix_items = site.warcprox_meta["warc-prefix"].split("-") video_record = worker._video_data.VideoCaptureRecord() @@ -490,7 +485,7 @@ def _remember_videos(page, site, worker, ydl, ie_result, pushed_videos=None): video_record.video_capture_status = None # "recrawl" maybe worker._video_data.save_video_capture_record(video_record) - + """ logger.debug("embedded video", video=video) page.videos.append(video) @@ -619,10 +614,15 @@ def do_youtube_dl(worker, site, page, ytdlp_proxy_endpoints): ) uncaptured_youtube_watch_pages = [] for e in ie_result.get("entries_no_dl", []): - youtube_watch_url = str(urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}")) + # note: http needed for match + youtube_watch_url = str( + urlcanon.aggressive(f"http://www.youtube.com/watch?v={e['id']}") + ) if youtube_watch_url in captured_youtube_watch_pages: continue - uncaptured_youtube_watch_pages.append(youtube_watch_url) + uncaptured_youtube_watch_pages.append( + f"https://www.youtube.com/watch?v={e['id']}" + ) if uncaptured_youtube_watch_pages: outlinks.add(uncaptured_youtube_watch_pages) # todo: handle outlinks for instagram and soundcloud, other media source, here (if anywhere)