mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-07 14:02:24 -04:00
fix needs_browsing check
Correctly handle a relative URL in the "Location" response header.
This commit is contained in:
parent
bf5401283e
commit
ba8d5a3740
2 changed files with 5 additions and 2 deletions
|
@ -37,6 +37,7 @@ import urlcanon
|
||||||
from requests.structures import CaseInsensitiveDict
|
from requests.structures import CaseInsensitiveDict
|
||||||
import rethinkdb as r
|
import rethinkdb as r
|
||||||
import datetime
|
import datetime
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
|
@ -87,7 +88,9 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||||
|
|
||||||
final_url = url
|
final_url = url
|
||||||
while final_url in redirects:
|
while final_url in redirects:
|
||||||
final_url = redirects.pop(final_url)['response_headers']['location']
|
txn = redirects.pop(final_url)
|
||||||
|
final_url = urllib.parse.urljoin(
|
||||||
|
txn['url'], txn['response_headers']['location'])
|
||||||
|
|
||||||
final_bounces = []
|
final_bounces = []
|
||||||
for txn in self.transactions:
|
for txn in self.transactions:
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b12.dev280',
|
version='1.1b12.dev281',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue